2748 lines
110 KiB
C++
2748 lines
110 KiB
C++
/*
|
|
* Copyright (C) 2015 ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* This file is part of Scylla.
|
|
*
|
|
* Scylla is free software: you can redistribute it and/or modify
|
|
* it under the terms of the GNU Affero General Public License as published by
|
|
* the Free Software Foundation, either version 3 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* Scylla is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
#include "log.hh"
#include <vector>
#include <typeinfo>
#include <limits>
#include <cstdint>
#include <cstring>
#include "core/future.hh"
#include "core/future-util.hh"
#include "core/sstring.hh"
#include "core/fstream.hh"
#include "core/shared_ptr.hh"
#include "core/do_with.hh"
#include "core/thread.hh"
#include <seastar/core/shared_future.hh>
#include <seastar/core/byteorder.hh>
#include <iterator>

#include "types.hh"
#include "sstables.hh"
#include "compress.hh"
#include "unimplemented.hh"
#include "index_reader.hh"
#include "remove.hh"
#include "memtable.hh"
#include "range.hh"
#include "downsampling.hh"
#include <boost/filesystem/operations.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/range/adaptor/map.hpp>
#include <boost/range/adaptor/transformed.hpp>
#include <boost/range/algorithm_ext/insert.hpp>
#include <boost/range/algorithm_ext/push_back.hpp>
#include <boost/range/algorithm/set_algorithm.hpp>
#include <regex>
#include <core/align.hh>
#include "utils/phased_barrier.hh"
#include "range_tombstone_list.hh"
#include "counters.hh"

#include "checked-file-impl.hh"
#include "service/storage_service.hh"
|
|
|
// Per-shard signals raised when sstable read/write I/O hits a disk error.
// NOTE(review): assumed to be observed by error-handling machinery defined
// elsewhere; nothing in this file consumes them directly.
thread_local disk_error_signal_type sstable_read_error;
thread_local disk_error_signal_type sstable_write_error;

namespace sstables {

// Logger shared by all sstable code in this file.
logging::logger sstlog("sstable");
|
|
|
|
// Opens an sstable component file with the given flags, logging and
// re-propagating any failure as an exceptional future.
future<file> new_sstable_component_file(const io_error_handler& error_handler, sstring name, open_flags flags) {
    auto open_fut = open_checked_file_dma(error_handler, name, flags);
    return open_fut.handle_exception([name] (auto eptr) {
        sstlog.error("Could not create SSTable component {}. Found exception: {}", name, eptr);
        return make_exception_future<file>(eptr);
    });
}

// Overload taking explicit file_open_options.
future<file> new_sstable_component_file(const io_error_handler& error_handler, sstring name, open_flags flags,
        file_open_options options) {
    auto open_fut = open_checked_file_dma(error_handler, name, flags, options);
    return open_fut.handle_exception([name] (auto eptr) {
        sstlog.error("Could not create SSTable component {}. Found exception: {}", name, eptr);
        return make_exception_future<file>(eptr);
    });
}
|
|
|
|
static utils::phased_barrier& background_jobs() {
|
|
static thread_local utils::phased_barrier gate;
|
|
return gate;
|
|
}
|
|
|
|
future<> await_background_jobs() {
|
|
sstlog.debug("Waiting for background jobs");
|
|
return background_jobs().advance_and_await().finally([] {
|
|
sstlog.debug("Waiting done");
|
|
});
|
|
}
|
|
|
|
future<> await_background_jobs_on_all_shards() {
|
|
return smp::invoke_on_all([] {
|
|
return await_background_jobs();
|
|
});
|
|
}
|
|
|
|
// Seekable, buffered reader abstraction over an input stream. Subclasses
// provide open_at(), which opens a fresh stream at an absolute position;
// seek() swaps the current stream for such a freshly-opened one.
class random_access_reader {
    input_stream<char> _in;
protected:
    // Opens a new input stream positioned at absolute offset `pos`.
    virtual input_stream<char> open_at(uint64_t pos) = 0;
public:
    // Reads exactly n bytes from the current position (fewer only at EOF).
    future<temporary_buffer<char>> read_exactly(size_t n) {
        return _in.read_exactly(n);
    }
    // Repositions the reader by replacing the current stream.
    // NOTE(review): the replaced stream is not close()d first — presumably
    // safe for the streams used here; confirm against subclass behavior.
    void seek(uint64_t pos) {
        _in = open_at(pos);
    }
    bool eof() { return _in.eof(); }
    // Closes the current stream; subclasses extend this to close their
    // underlying resources too.
    virtual future<> close() {
        return _in.close();
    }
    virtual ~random_access_reader() { }
};
|
|
|
|
// random_access_reader backed by a seastar file: each seek() creates a new
// file input stream over the shared file handle.
class file_random_access_reader : public random_access_reader {
    file _file;
    size_t _buffer_size;
public:
    virtual input_stream<char> open_at(uint64_t pos) override {
        file_input_stream_options options;
        options.buffer_size = _buffer_size;

        return make_file_input_stream(_file, pos, std::move(options));
    }
    // Takes ownership of `f` and positions the reader at offset 0.
    explicit file_random_access_reader(file f, size_t buffer_size = 8192)
        : _file(std::move(f)), _buffer_size(buffer_size)
    {
        seek(0);
    }
    // Closes the stream, then the file. A failed file close is logged and
    // reported via general_disk_error() rather than propagated, so close()
    // itself does not fail on it.
    virtual future<> close() override {
        return random_access_reader::close().finally([this] {
            return _file.close().handle_exception([save = _file] (auto ep) {
                sstlog.warn("sstable close failed: {}", ep);
                general_disk_error();
            });
        });
    }
};
|
|
|
|
// Version identifiers as they appear in sstable component file names.
std::unordered_map<sstable::version_types, sstring, enum_hash<sstable::version_types>> sstable::_version_string = {
    { sstable::version_types::ka , "ka" },
    { sstable::version_types::la , "la" }
};

// Format identifier as it appears in sstable component file names.
std::unordered_map<sstable::format_types, sstring, enum_hash<sstable::format_types>> sstable::_format_string = {
    { sstable::format_types::big , "big" }
};

// File-name suffixes for the (temporary) table-of-contents component.
static const sstring TOC_SUFFIX = "TOC.txt";
static const sstring TEMPORARY_TOC_SUFFIX = "TOC.txt.tmp";

// Maps each component type to its on-disk file-name suffix. Also consulted
// in reverse (see reverse_map) when reading a TOC.
// FIXME: this should be version-dependent
std::unordered_map<sstable::component_type, sstring, enum_hash<sstable::component_type>> sstable::_component_map = {
    { component_type::Index, "Index.db"},
    { component_type::CompressionInfo, "CompressionInfo.db" },
    { component_type::Data, "Data.db" },
    { component_type::TOC, TOC_SUFFIX },
    { component_type::Summary, "Summary.db" },
    { component_type::Digest, "Digest.sha1" },
    { component_type::CRC, "CRC.db" },
    { component_type::Filter, "Filter.db" },
    { component_type::Statistics, "Statistics.db" },
    { component_type::Scylla, "Scylla.db" },
    { component_type::TemporaryTOC, TEMPORARY_TOC_SUFFIX },
    { component_type::TemporaryStatistics, "Statistics.db.tmp" },
};
|
|
|
|
// Linear reverse lookup: returns the key whose mapped value equals `value`.
// Throws std::out_of_range when no entry matches.
//
// This assumes that the mappings are small enough, and called unfrequent
// enough. If that changes, it would be adviseable to create a full static
// reverse mapping, even if it is done at runtime.
template <typename Map>
static typename Map::key_type reverse_map(const typename Map::mapped_type& value, const Map& map) {
    // const&: the map is only read, never modified.
    for (const auto& pair : map) {
        if (pair.second == value) {
            return pair.first;
        }
    }
    throw std::out_of_range("unable to reverse map");
}
|
|
|
|
// This should be used every time we use read_exactly directly.
//
// read_exactly is a lot more convenient of an interface to use, because we'll
// be parsing known quantities.
//
// However, anything other than the size we have asked for, is certainly a bug,
// and we need to do something about it.
static void check_buf_size(temporary_buffer<char>& buf, size_t expected) {
    // A short buffer means the stream ended early (or a short read):
    // surface it as a distinct exception type.
    if (buf.size() < expected) {
        throw bufsize_mismatch_exception(buf.size(), expected);
    }
}
|
|
|
|
// Assigns `from` to `to`, throwing std::overflow_error when the value does
// not survive the conversion (truncation or sign change).
template <typename T, typename U>
static void check_truncate_and_assign(T& to, const U from) {
    static_assert(std::is_integral<T>::value && std::is_integral<U>::value, "T and U must be integral");
    to = from;
    const bool lossy = (to != from);
    if (lossy) {
        throw std::overflow_error("assigning U to T caused an overflow");
    }
}
|
|
|
|
// Base parser, parses an integer type
template <typename T>
typename std::enable_if_t<std::is_integral<T>::value, void>
read_integer(temporary_buffer<char>& buf, T& i) {
    // On-disk integers are big-endian; convert to host order.
    auto *nr = reinterpret_cast<const net::packed<T> *>(buf.get());
    i = net::ntoh(*nr);
}

// Reads one big-endian integer of type T from the stream into i.
template <typename T>
typename std::enable_if_t<std::is_integral<T>::value, future<>>
parse(random_access_reader& in, T& i) {
    return in.read_exactly(sizeof(T)).then([&i] (auto buf) {
        check_buf_size(buf, sizeof(T));

        read_integer(buf, i);
        return make_ready_future<>();
    });
}
|
|
|
|
// Writes one integer in big-endian byte order.
// NOTE: runs in a seastar thread context (uses .get()).
template <typename T>
inline typename std::enable_if_t<std::is_integral<T>::value, void>
write(file_writer& out, T i) {
    auto *nr = reinterpret_cast<const net::packed<T> *>(&i);
    i = net::hton(*nr);  // i is a by-value copy, safe to overwrite in place
    auto p = reinterpret_cast<const char*>(&i);
    out.write(p, sizeof(T)).get();
}
|
|
|
|
// Enums are (de)serialized as their underlying integral type.
template <typename T>
typename std::enable_if_t<std::is_enum<T>::value, future<>>
parse(random_access_reader& in, T& i) {
    return parse(in, reinterpret_cast<typename std::underlying_type<T>::type&>(i));
}

template <typename T>
inline typename std::enable_if_t<std::is_enum<T>::value, void>
write(file_writer& out, T i) {
    write(out, static_cast<typename std::underlying_type<T>::type>(i));
}

// Bools are (de)serialized as a single byte.
// NOTE(review): the parse relies on bool being one byte that can alias
// uint8_t storage — true on the supported ABIs, but worth confirming.
future<> parse(random_access_reader& in, bool& i) {
    return parse(in, reinterpret_cast<uint8_t&>(i));
}

inline void write(file_writer& out, bool i) {
    write(out, static_cast<uint8_t>(i));
}
|
|
|
|
// Bit-level reinterpretation of `f` as type To (e.g. uint64_t <-> double).
// Implemented with std::memcpy: reading the inactive member of a union is
// undefined behavior in C++, while memcpy is the sanctioned way to type-pun
// and compiles to the same code.
template <typename To, typename From>
static inline To convert(From f) {
    static_assert(sizeof(To) == sizeof(From), "Sizes must match");
    To to;
    std::memcpy(&to, &f, sizeof(To));
    return to;
}
|
|
|
|
future<> parse(random_access_reader& in, double& d) {
|
|
return in.read_exactly(sizeof(double)).then([&d] (auto buf) {
|
|
check_buf_size(buf, sizeof(double));
|
|
|
|
auto *nr = reinterpret_cast<const net::packed<unsigned long> *>(buf.get());
|
|
d = convert<double>(net::ntoh(*nr));
|
|
return make_ready_future<>();
|
|
});
|
|
}
|
|
|
|
inline void write(file_writer& out, double d) {
|
|
auto *nr = reinterpret_cast<const net::packed<unsigned long> *>(&d);
|
|
auto tmp = net::hton(*nr);
|
|
auto p = reinterpret_cast<const char*>(&tmp);
|
|
out.write(p, sizeof(unsigned long)).get();
|
|
}
|
|
|
|
// Reads `len` raw bytes from the stream into `s`.
template <typename T>
future<> parse(random_access_reader& in, T& len, bytes& s) {
    return in.read_exactly(len).then([&s, len] (auto buf) {
        check_buf_size(buf, len);
        // Likely a different type of char. Most bufs are unsigned, whereas the bytes type is signed.
        s = bytes(reinterpret_cast<const bytes::value_type *>(buf.get()), len);
    });
}

// Raw byte writers; no length prefix is emitted here.
inline void write(file_writer& out, const bytes& s) {
    out.write(s).get();
}

inline void write(file_writer& out, bytes_view s) {
    out.write(reinterpret_cast<const char*>(s.data()), s.size()).get();
}

// Writes every fragment of a bytes_ostream in order.
// NOTE(review): `s` is taken by value, copying the stream on every call —
// consider passing by const reference if the type's iteration allows it.
inline void write(file_writer& out, bytes_ostream s) {
    for (bytes_view fragment : s) {
        write(out, fragment);
    }
}
|
|
|
|
// All composite parsers must come after this
//
// Parses each argument in sequence; a later argument is only read after the
// earlier ones complete, so stream order matches argument order.
template<typename First, typename... Rest>
future<> parse(random_access_reader& in, First& first, Rest&&... rest) {
    return parse(in, first).then([&in, &rest...] {
        return parse(in, std::forward<Rest>(rest)...);
    });
}

// Writes each argument in sequence (argument order == on-disk order).
template<typename First, typename... Rest>
inline void write(file_writer& out, const First& first, Rest&&... rest) {
    write(out, first);
    write(out, std::forward<Rest>(rest)...);
}
|
|
|
|
// Intended to be used for a type that describes itself through describe_type().
// The type enumerates its fields and we parse each through the overloads above.
template <class T>
typename std::enable_if_t<!std::is_integral<T>::value && !std::is_enum<T>::value, future<>>
parse(random_access_reader& in, T& t) {
    return t.describe_type([&in] (auto&&... what) -> future<> {
        return parse(in, what...);
    });
}

// Writer counterpart: serializes each field the type describes, in order.
template <class T>
inline typename std::enable_if_t<!std::is_integral<T>::value && !std::is_enum<T>::value, void>
write(file_writer& out, const T& t) {
    // describe_type() is not const correct, so cheat here:
    const_cast<T&>(t).describe_type([&out] (auto&&... what) -> void {
        write(out, std::forward<decltype(what)>(what)...);
    });
}
|
|
|
|
// For all types that take a size, we provide a template that takes the type
// alone, and another, separate one, that takes a size parameter as well, of
// type Size. This is because although most of the time the size and the data
// are contiguous, it is not always the case. So we want to have the
// flexibility of parsing them separately.
template <typename Size>
future<> parse(random_access_reader& in, disk_string<Size>& s) {
    auto len = std::make_unique<Size>();
    auto f = parse(in, *len);
    // Moving the unique_ptr into the continuation keeps the length alive
    // until the payload parse completes.
    return f.then([&in, &s, len = std::move(len)] {
        return parse(in, *len, s.value);
    });
}
|
|
|
|
template <typename Size>
|
|
inline void write(file_writer& out, const disk_string<Size>& s) {
|
|
Size len = 0;
|
|
check_truncate_and_assign(len, s.value.size());
|
|
write(out, len);
|
|
write(out, s.value);
|
|
}
|
|
|
|
template <typename Size>
|
|
inline void write(file_writer& out, const disk_string_view<Size>& s) {
|
|
Size len;
|
|
check_truncate_and_assign(len, s.value.size());
|
|
write(out, len, s.value);
|
|
}
|
|
|
|
// We cannot simply read the whole array at once, because we don't know its
// full size. We know the number of elements, but if we are talking about
// disk_strings, for instance, we have no idea how much of the stream each
// element will take.
//
// Sometimes we do know the size, like the case of integers. There, all we have
// to do is to convert each member because they are all stored big endian.
// We'll offer a specialization for that case below.
//
// Note: `arr` must already be resized to `len` elements (the disk_array
// parser below does this); elements are parsed one at a time, in order.
template <typename Size, typename Members>
typename std::enable_if_t<!std::is_integral<Members>::value, future<>>
parse(random_access_reader& in, Size& len, std::deque<Members>& arr) {

    auto count = make_lw_shared<size_t>(0);
    auto eoarr = [count, len] { return *count == len; };

    return do_until(eoarr, [count, &in, &arr] {
        return parse(in, arr[(*count)++]);
    });
}

// Integral specialization: bulk-reads up to ~100KB per iteration and
// byte-swaps each element from big-endian into place.
template <typename Size, typename Members>
typename std::enable_if_t<std::is_integral<Members>::value, future<>>
parse(random_access_reader& in, Size& len, std::deque<Members>& arr) {
    auto done = make_lw_shared<size_t>(0);
    return repeat([&in, &len, &arr, done] {
        // Cap the batch so no single read/allocation grows unbounded.
        auto now = std::min(len - *done, 100000 / sizeof(Members));
        return in.read_exactly(now * sizeof(Members)).then([&arr, len, now, done] (auto buf) {
            check_buf_size(buf, now * sizeof(Members));

            auto *nr = reinterpret_cast<const net::packed<Members> *>(buf.get());
            for (size_t i = 0; i < now; ++i) {
                arr[*done + i] = net::ntoh(nr[i]);
            }
            *done += now;
            return make_ready_future<stop_iteration>(*done == len ? stop_iteration::yes : stop_iteration::no);
        });
    });
}
|
|
|
|
// We resize the array here, before we pass it to the integer / non-integer
// specializations
template <typename Size, typename Members>
future<> parse(random_access_reader& in, disk_array<Size, Members>& arr) {
    auto len = make_lw_shared<Size>();
    auto f = parse(in, *len);
    return f.then([&in, &arr, len] {
        arr.elements.resize(*len);
        return parse(in, *len, arr.elements);
    }).finally([len] {});  // keeps len alive until the element parse finishes
}
|
|
|
|
template <typename Members>
|
|
inline typename std::enable_if_t<!std::is_integral<Members>::value, void>
|
|
write(file_writer& out, const std::deque<Members>& arr) {
|
|
for (auto& a : arr) {
|
|
write(out, a);
|
|
}
|
|
}
|
|
|
|
// Integral specialization: converts elements to big-endian in ~100KB
// batches through a scratch vector, writing each batch with one call.
template <typename Members>
inline typename std::enable_if_t<std::is_integral<Members>::value, void>
write(file_writer& out, const std::deque<Members>& arr) {
    std::vector<Members> tmp;
    size_t per_loop = 100000 / sizeof(Members);
    tmp.resize(per_loop);
    size_t idx = 0;
    while (idx != arr.size()) {
        auto now = std::min(arr.size() - idx, per_loop);
        // copy arr into tmp converting each entry into big-endian representation.
        auto nr = arr.begin() + idx;
        for (size_t i = 0; i < now; i++) {
            tmp[i] = net::hton(nr[i]);
        }
        auto p = reinterpret_cast<const char*>(tmp.data());
        auto bytes = now * sizeof(Members);
        out.write(p, bytes).get();
        idx += now;
    }
}
|
|
|
|
template <typename Size, typename Members>
|
|
inline void write(file_writer& out, const disk_array<Size, Members>& arr) {
|
|
Size len = 0;
|
|
check_truncate_and_assign(len, arr.elements.size());
|
|
write(out, len);
|
|
write(out, arr.elements);
|
|
}
|
|
|
|
// Parses `len` (key, value) pairs from the stream into `map`.
template <typename Size, typename Key, typename Value>
future<> parse(random_access_reader& in, Size& len, std::unordered_map<Key, Value>& map) {
    return do_with(Size(), [&in, len, &map] (Size& count) {
        // Note: the termination check itself increments count, so it runs
        // exactly once per do_until round.
        auto eos = [len, &count] { return len == count++; };
        return do_until(eos, [len, &in, &map] {
            struct kv {
                Key key;
                Value value;
            };

            // do_with keeps the scratch pair alive across the async parse.
            return do_with(kv(), [&in, &map] (auto& el) {
                return parse(in, el.key, el.value).then([&el, &map] {
                    map.emplace(el.key, el.value);
                });
            });
        });
    });
}

// Parses a disk_hash: a Size-typed entry count followed by the pairs.
template <typename Size, typename Key, typename Value>
future<> parse(random_access_reader& in, disk_hash<Size, Key, Value>& h) {
    auto w = std::make_unique<Size>();
    auto f = parse(in, *w);
    return f.then([&in, &h, w = std::move(w)] {
        return parse(in, *w, h.map);
    });
}
|
|
|
|
template <typename Key, typename Value>
|
|
inline void write(file_writer& out, const std::unordered_map<Key, Value>& map) {
|
|
for (auto& val: map) {
|
|
write(out, val.first, val.second);
|
|
};
|
|
}
|
|
|
|
template <typename Size, typename Key, typename Value>
|
|
inline void write(file_writer& out, const disk_hash<Size, Key, Value>& h) {
|
|
Size len = 0;
|
|
check_truncate_and_assign(len, h.map.size());
|
|
write(out, len);
|
|
write(out, h.map);
|
|
}
|
|
|
|
// Abstract parser/sizer/writer for a single tagged member of a tagged union
template <typename DiskSetOfTaggedUnion>
struct single_tagged_union_member_serdes {
    using value_type = typename DiskSetOfTaggedUnion::value_type;
    virtual ~single_tagged_union_member_serdes() {}
    // Parses this member's payload from `in` into `v`.
    virtual future<> do_parse(random_access_reader& in, value_type& v) const = 0;
    // Returns the serialized payload size of `v` in bytes.
    virtual uint32_t do_size(const value_type& v) const = 0;
    // Writes `v`'s payload to `out`.
    virtual void do_write(file_writer& out, const value_type& v) const = 0;
};
|
|
|
|
// Concrete parser for a single member of a tagged union; parses type "Member"
|
|
template <typename DiskSetOfTaggedUnion, typename Member>
|
|
struct single_tagged_union_member_serdes_for final : single_tagged_union_member_serdes<DiskSetOfTaggedUnion> {
|
|
using base = single_tagged_union_member_serdes<DiskSetOfTaggedUnion>;
|
|
using value_type = typename base::value_type;
|
|
virtual future<> do_parse(random_access_reader& in, value_type& v) const {
|
|
v = Member();
|
|
return parse(in, boost::get<Member>(v).value);
|
|
}
|
|
virtual uint32_t do_size(const value_type& v) const override {
|
|
return serialized_size(boost::get<Member>(v).value);
|
|
}
|
|
virtual void do_write(file_writer& out, const value_type& v) const override {
|
|
write(out, boost::get<Member>(v).value);
|
|
}
|
|
};
|
|
|
|
// Dispatch table mapping each member tag to that member's serdes object.
template <typename TagType, typename... Members>
struct disk_set_of_tagged_union<TagType, Members...>::serdes {
    using disk_set = disk_set_of_tagged_union<TagType, Members...>;
    // We can't use unique_ptr, because we initialize from an std::initializer_list, which is not move compatible.
    using serdes_map_type = std::unordered_map<TagType, shared_ptr<single_tagged_union_member_serdes<disk_set>>, typename disk_set::hash_type>;
    using value_type = typename disk_set::value_type;
    // One entry per member type, keyed by that member's tag.
    serdes_map_type map = {
        {Members::tag(), make_shared<single_tagged_union_member_serdes_for<disk_set, Members>>()}...
    };
    // Parses the member identified by `tag` into `s`. Unknown tags are
    // skipped by consuming and discarding `size` bytes, keeping the format
    // forward compatible.
    future<> lookup_and_parse(random_access_reader& in, TagType tag, uint32_t& size, disk_set& s, value_type& value) const {
        auto i = map.find(tag);
        if (i == map.end()) {
            return in.read_exactly(size).discard_result();
        } else {
            return i->second->do_parse(in, value).then([tag, &s, &value] () mutable {
                s.data.emplace(tag, std::move(value));
            });
        }
    }
    // Returns the serialized size for `value`; throws for unknown tags.
    uint32_t lookup_and_size(TagType tag, const value_type& value) const {
        return map.at(tag)->do_size(value);
    }
    // Writes `value`'s payload; throws for unknown tags.
    void lookup_and_write(file_writer& out, TagType tag, const value_type& value) const {
        return map.at(tag)->do_write(out, value);
    }
};

// Single static serdes table per template instantiation.
template <typename TagType, typename... Members>
typename disk_set_of_tagged_union<TagType, Members...>::serdes disk_set_of_tagged_union<TagType, Members...>::s_serdes;
|
|
|
|
// Parses a disk_set_of_tagged_union: an element count, then for each
// element a tag, a payload size, and the payload. Unrecognized tags are
// skipped using the size field (see lookup_and_parse).
template <typename TagType, typename... Members>
future<>
parse(random_access_reader& in, disk_set_of_tagged_union<TagType, Members...>& s) {
    using disk_set = disk_set_of_tagged_union<TagType, Members...>;
    using key_type = typename disk_set::key_type;
    using value_type = typename disk_set::value_type;
    return do_with(0u, 0u, 0u, value_type{}, [&] (key_type& nr_elements, key_type& new_key, unsigned& new_size, value_type& new_value) {
        return parse(in, nr_elements).then([&] {
            auto rng = boost::irange<key_type>(0, nr_elements); // do_for_each doesn't like an rvalue range
            return do_for_each(rng.begin(), rng.end(), [&] (key_type ignore) {
                return parse(in, new_key).then([&] {
                    return parse(in, new_size).then([&] {
                        return disk_set::s_serdes.lookup_and_parse(in, TagType(new_key), new_size, s, new_value);
                    });
                });
            });
        });
    });
}
|
|
|
|
// Writes a disk_set_of_tagged_union: element count, then a (tag, size,
// payload) triple for each element.
template <typename TagType, typename... Members>
void write(file_writer& out, const disk_set_of_tagged_union<TagType, Members...>& s) {
    using disk_set = disk_set_of_tagged_union<TagType, Members...>;
    write(out, uint32_t(s.data.size()));
    for (auto&& kv : s.data) {
        auto&& tag = kv.first;
        auto&& value = kv.second;
        write(out, tag);
        // The size field lets readers that don't know this tag skip it.
        write(out, uint32_t(disk_set::s_serdes.lookup_and_size(tag, value)));
        disk_set::s_serdes.lookup_and_write(out, tag, value);
    }
}
|
|
|
|
// Parses the Summary component: the header, the entry-offset table, the
// first/last keys, and finally the entries themselves.
future<> parse(random_access_reader& in, summary& s) {
    using pos_type = typename decltype(summary::positions)::value_type;

    return parse(in, s.header.min_index_interval,
                     s.header.size,
                     s.header.memory_size,
                     s.header.sampling_level,
                     s.header.size_at_full_sampling).then([&in, &s] {
        return in.read_exactly(s.header.size * sizeof(pos_type)).then([&in, &s] (auto buf) {
            auto len = s.header.size * sizeof(pos_type);
            check_buf_size(buf, len);

            s.entries.resize(s.header.size);

            // Positions are read in native byte order (the writer below
            // also stores them natively).
            auto *nr = reinterpret_cast<const pos_type *>(buf.get());
            s.positions = std::deque<pos_type>(nr, nr + s.header.size);

            // Since the keys in the index are not sized, we need to calculate
            // the start position of the index i+1 to determine the boundaries
            // of index i. The "memory_size" field in the header determines the
            // total memory used by the map, so if we push it to the vector, we
            // can guarantee that no conditionals are used, and we can always
            // query the position of the "next" index.
            s.positions.push_back(s.header.memory_size);
        }).then([&in, &s] {
            in.seek(sizeof(summary::header) + s.header.memory_size);
            return parse(in, s.first_key, s.last_key);
        }).then([&in, &s] {

            in.seek(s.positions[0] + sizeof(summary::header));

            assert(s.positions.size() == (s.entries.size() + 1));

            auto idx = make_lw_shared<size_t>(0);
            return do_for_each(s.entries.begin(), s.entries.end(), [idx, &in, &s] (auto& entry) {
                auto pos = s.positions[(*idx)++];
                auto next = s.positions[*idx];

                // Each entry spans from its own offset to the next one's.
                auto entrysize = next - pos;

                return in.read_exactly(entrysize).then([&entry, entrysize] (auto buf) {
                    check_buf_size(buf, entrysize);

                    // The key occupies everything but the trailing 8-byte position.
                    auto keysize = entrysize - 8;
                    entry.key = bytes(reinterpret_cast<const int8_t*>(buf.get()), keysize);
                    buf.trim_front(keysize);
                    // FIXME: This is a le read. We should make this explicit
                    entry.position = *(reinterpret_cast<const net::packed<uint64_t> *>(buf.get()));

                    return make_ready_future<>();
                });
            }).then([&s] {
                // Delete last element which isn't part of the on-disk format.
                s.positions.pop_back();
            });
        });
    });
}
|
|
|
|
// Writes one summary entry: the key bytes followed by the raw 8-byte
// position (see FIXME below regarding byte order).
inline void write(file_writer& out, const summary_entry& entry) {
    // FIXME: summary entry is supposedly written in memory order, but that
    // would prevent portability of summary file between machines of different
    // endianness. We can treat it as little endian to preserve portability.
    write(out, entry.key);
    auto p = reinterpret_cast<const char*>(&entry.position);
    out.write(p, sizeof(uint64_t)).get();
}

// Writes the Summary component: header, position table, entries, and the
// first/last keys.
inline void write(file_writer& out, const summary& s) {
    // NOTE: positions and entries must be stored in NATIVE BYTE ORDER, not BIG-ENDIAN.
    write(out, s.header.min_index_interval,
               s.header.size,
               s.header.memory_size,
               s.header.sampling_level,
               s.header.size_at_full_sampling);
    for (auto&& e : s.positions) {
        out.write(reinterpret_cast<const char*>(&e), sizeof(e)).get();
    }
    write(out, s.entries);
    write(out, s.first_key, s.last_key);
}
|
|
|
|
// Returns a ready future holding a reference to the i-th in-memory summary
// entry; throws std::out_of_range for an invalid index.
future<summary_entry&> sstable::read_summary_entry(size_t i) {
    const auto count = _components->summary.entries.size();
    if (i >= count) {
        throw std::out_of_range(sprint("Invalid Summary index: %ld", i));
    }

    return make_ready_future<summary_entry&>(_components->summary.entries[i]);
}
|
|
|
|
// Parses a deletion_time: local deletion time followed by the
// marked-for-delete timestamp, in that on-disk order.
future<> parse(random_access_reader& in, deletion_time& d) {
    return parse(in, d.local_deletion_time, d.marked_for_delete_at);
}
|
|
|
|
template <typename Child>
|
|
future<> parse(random_access_reader& in, std::unique_ptr<metadata>& p) {
|
|
p.reset(new Child);
|
|
return parse(in, *static_cast<Child *>(p.get()));
|
|
}
|
|
|
|
// Writes a metadata object through its concrete type Child.
// NOTE(review): assumes `p` actually holds a Child; the static_cast is
// unchecked, so callers must pass the matching type.
template <typename Child>
inline void write(file_writer& out, const std::unique_ptr<metadata>& p) {
    write(out, *static_cast<Child *>(p.get()));
}
|
|
|
|
// Parses the Statistics component: a hash of (metadata type -> file
// offset), then each known metadata section at its recorded offset.
future<> parse(random_access_reader& in, statistics& s) {
    return parse(in, s.hash).then([&in, &s] {
        return do_for_each(s.hash.map.begin(), s.hash.map.end(), [&in, &s] (auto val) mutable {
            in.seek(val.second);

            switch (val.first) {
                case metadata_type::Validation:
                    return parse<validation_metadata>(in, s.contents[val.first]);
                case metadata_type::Compaction:
                    return parse<compaction_metadata>(in, s.contents[val.first]);
                case metadata_type::Stats:
                    return parse<stats_metadata>(in, s.contents[val.first]);
                default:
                    // Unknown sections are logged and skipped, not fatal.
                    sstlog.warn("Invalid metadata type at Statistics file: {} ", int(val.first));
                    return make_ready_future<>();
                }
        });
    });
}

// Writes the Statistics component: the offset hash followed by each
// metadata section, ordered by type.
inline void write(file_writer& out, const statistics& s) {
    write(out, s.hash);
    auto types = boost::copy_range<std::vector<metadata_type>>(s.hash.map | boost::adaptors::map_keys);
    // use same sort order as seal_statistics
    boost::sort(types);
    for (auto t : types) {
        s.contents.at(t)->write(out);
    }
}
|
|
|
|
// Parses an estimated histogram: a length, then `length` interleaved
// (bucket_offset, bucket) uint64 pairs. Note that bucket_offsets holds
// length-1 elements, so the first pair's offset shares slot 0.
future<> parse(random_access_reader& in, utils::estimated_histogram& eh) {
    auto len = std::make_unique<uint32_t>();

    auto f = parse(in, *len);
    return f.then([&in, &eh, len = std::move(len)] {
        uint32_t length = *len;

        if (length == 0) {
            throw malformed_sstable_exception("Estimated histogram with zero size found. Can't continue!");
        }
        eh.bucket_offsets.resize(length - 1);
        eh.buckets.resize(length);

        // Each on-disk element is an (offset, bucket) pair of uint64s.
        auto type_size = sizeof(uint64_t) * 2;
        return in.read_exactly(length * type_size).then([&eh, length, type_size] (auto buf) {
            check_buf_size(buf, length * type_size);

            auto *nr = reinterpret_cast<const net::packed<uint64_t> *>(buf.get());
            size_t j = 0;
            for (size_t i = 0; i < length; ++i) {
                // The first pair's offset goes to slot 0, like the second's.
                eh.bucket_offsets[i == 0 ? 0 : i - 1] = net::ntoh(nr[j++]);
                eh.buckets[i] = net::ntoh(nr[j++]);
            }
            return make_ready_future<>();
        });
    });
}

// Writes an estimated histogram in the interleaved (offset, bucket)
// big-endian format described above.
inline void write(file_writer& out, const utils::estimated_histogram& eh) {
    uint32_t len = 0;
    check_truncate_and_assign(len, eh.buckets.size());

    write(out, len);
    struct element {
        uint64_t offsets;
        uint64_t buckets;
    };
    std::vector<element> elements;
    elements.resize(eh.buckets.size());

    auto *offsets_nr = reinterpret_cast<const net::packed<uint64_t> *>(eh.bucket_offsets.data());
    auto *buckets_nr = reinterpret_cast<const net::packed<uint64_t> *>(eh.buckets.data());
    for (size_t i = 0; i < eh.buckets.size(); i++) {
        elements[i].offsets = net::hton(offsets_nr[i == 0 ? 0 : i - 1]);
        elements[i].buckets = net::hton(buckets_nr[i]);
    }

    auto p = reinterpret_cast<const char*>(elements.data());
    auto bytes = elements.size() * sizeof(element);
    out.write(p, bytes).get();
}
|
|
|
|
// This is small enough, and well-defined. Easier to just read it all
// at once
future<> sstable::read_toc() {
    // Already parsed — no need to read the TOC twice.
    if (_recognized_components.size()) {
        return make_ready_future<>();
    }

    auto file_path = filename(sstable::component_type::TOC);

    sstlog.debug("Reading TOC file {} ", file_path);

    return open_checked_file_dma(_read_error_handler, file_path, open_flags::ro).then([this, file_path] (file f) {
        auto bufptr = allocate_aligned_buffer<char>(4096, 4096);
        auto buf = bufptr.get();

        auto fut = f.dma_read(0, buf, 4096);
        return std::move(fut).then([this, f = std::move(f), bufptr = std::move(bufptr), file_path] (size_t size) mutable {
            // This file is supposed to be very small. Theoretically we should check its size,
            // but if we so much as read a whole page from it, there is definitely something fishy
            // going on - and this simplifies the code.
            if (size >= 4096) {
                throw malformed_sstable_exception("SSTable too big: " + to_sstring(size) + " bytes", file_path);
            }

            std::experimental::string_view buf(bufptr.get(), size);
            std::vector<sstring> comps;

            // One component name per line.
            boost::split(comps , buf, boost::is_any_of("\n"));

            for (auto& c: comps) {
                // accept trailing newlines
                if (c == "") {
                    continue;
                }
                try {
                   _recognized_components.insert(reverse_map(c, _component_map));
                } catch (std::out_of_range& oor) {
                    // Unknown components are kept separately and tolerated,
                    // for forward compatibility with newer sstable writers.
                    _unrecognized_components.push_back(c);
                    sstlog.info("Unrecognized TOC component was found: {} in sstable {}", c, file_path);
                }
            }
            if (!_recognized_components.size()) {
                throw malformed_sstable_exception("Empty TOC", file_path);
            }
            return f.close().finally([f] {});
        });
    }).then_wrapped([file_path] (future<> f) {
        try {
            f.get();
        } catch (std::system_error& e) {
            // Translate a missing TOC file into malformed_sstable_exception
            // so callers see a uniform error type.
            if (e.code() == std::error_code(ENOENT, std::system_category())) {
                throw malformed_sstable_exception(file_path + ": file not found");
            }
            throw;
        }
    });

}
|
|
|
|
// Populates the set of components this sstable will write, based on the
// chosen compressor and the bloom-filter false-positive chance.
void sstable::generate_toc(compressor c, double filter_fp_chance) {
    // Components present in every sstable (insertion order preserved from
    // the original implementation).
    for (auto component : { component_type::TOC, component_type::Statistics,
                            component_type::Digest, component_type::Index,
                            component_type::Summary, component_type::Data }) {
        _recognized_components.insert(component);
    }
    // A bloom filter is only written when a false-positive chance is set.
    if (filter_fp_chance != 1.0) {
        _recognized_components.insert(component_type::Filter);
    }
    // Uncompressed data carries a CRC component; compressed data gets a
    // CompressionInfo component instead.
    if (c == compressor::none) {
        _recognized_components.insert(component_type::CRC);
    } else {
        _recognized_components.insert(component_type::CompressionInfo);
    }
    _recognized_components.insert(component_type::Scylla);
}
|
|
|
|
// Writes the temporary TOC component listing every component this sstable
// will contain, then flushes the parent directory so the file is durable.
// The temporary TOC is renamed to its final name later by seal_sstable().
// NOTE: runs in a seastar thread context (uses .get()/.get0()).
void sstable::write_toc(const io_priority_class& pc) {
    auto file_path = filename(sstable::component_type::TemporaryTOC);

    sstlog.debug("Writing TOC file {} ", file_path);

    // Writing TOC content to temporary file.
    // If creation of temporary TOC failed, it implies that that boot failed to
    // delete a sstable with temporary for this column family, or there is a
    // sstable being created in parallel with the same generation.
    file f = new_sstable_component_file(_write_error_handler, file_path, open_flags::wo | open_flags::create | open_flags::exclusive).get0();

    bool toc_exists = file_exists(filename(sstable::component_type::TOC)).get0();
    if (toc_exists) {
        // TOC will exist at this point if write_components() was called with
        // the generation of a sstable that exists.
        f.close().get();
        remove_file(file_path).get();
        throw std::runtime_error(sprint("SSTable write failed due to existence of TOC file for generation %ld of %s.%s", _generation, _schema->ks_name(), _schema->cf_name()));
    }

    file_output_stream_options options;
    options.buffer_size = 4096;
    options.io_priority_class = pc;
    auto w = file_writer(std::move(f), std::move(options));

    for (auto&& key : _recognized_components) {
        // new line character is appended to the end of each component name.
        auto value = _component_map[key] + "\n";
        bytes b = bytes(reinterpret_cast<const bytes::value_type *>(value.c_str()), value.size());
        write(w, b);
    }
    w.flush().get();
    w.close().get();

    // Flushing parent directory to guarantee that temporary TOC file reached
    // the disk.
    file dir_f = open_checked_directory(_write_error_handler, _dir).get0();
    sstable_write_io_check([&] {
        dir_f.flush().get();
        dir_f.close().get();
    });
}
|
|
|
|
// Seals the sstable by renaming the temporary TOC to its final name,
// after guaranteeing (via directory flushes) that every component and the
// rename itself reached the disk. After this future resolves the sstable
// is considered durable.
future<> sstable::seal_sstable() {
    // SSTable sealing is about renaming temporary TOC file after guaranteeing
    // that each component reached the disk safely.
    return open_checked_directory(_write_error_handler, _dir).then([this] (file dir_f) {
        // Guarantee that every component of this sstable reached the disk.
        return sstable_write_io_check([&] { return dir_f.flush(); }).then([this] {
            // Rename TOC because it's no longer temporary.
            return sstable_write_io_check([&] {
                return engine().rename_file(filename(sstable::component_type::TemporaryTOC), filename(sstable::component_type::TOC));
            });
        }).then([this, dir_f] () mutable {
            // Guarantee that the changes above reached the disk.
            return sstable_write_io_check([&] { return dir_f.flush(); });
        }).then([this, dir_f] () mutable {
            return sstable_write_io_check([&] { return dir_f.close(); });
        }).then([this, dir_f] {
            // If this point was reached, sstable should be safe in disk.
            sstlog.debug("SSTable with generation {} of {}.{} was sealed successfully.", _generation, _schema->ks_name(), _schema->cf_name());
        });
    });
}
|
|
|
|
// Writes the per-chunk checksum component (CRC file) for an uncompressed
// sstable. Must run inside a seastar thread (uses .get0()/.get()).
void write_crc(io_error_handler& error_handler, const sstring file_path, const checksum& c) {
    sstlog.debug("Writing CRC file {} ", file_path);

    // Exclusive creation: the CRC file must not already exist.
    file crc_file = new_sstable_component_file(error_handler, file_path,
            open_flags::wo | open_flags::create | open_flags::exclusive).get0();

    file_output_stream_options stream_opts;
    stream_opts.buffer_size = 4096;
    auto out = file_writer(std::move(crc_file), std::move(stream_opts));
    write(out, c);
    out.close().get();
}
|
|
|
|
// Digest file stores the full checksum of data file converted into a string.
|
|
// Writes the Digest component: the data file's full checksum rendered as
// a decimal string. Must run inside a seastar thread.
void write_digest(io_error_handler& error_handler, const sstring file_path, uint32_t full_checksum) {
    sstlog.debug("Writing Digest file {} ", file_path);

    // Exclusive creation: the Digest file must not already exist.
    file digest_file = new_sstable_component_file(error_handler, file_path,
            open_flags::wo | open_flags::create | open_flags::exclusive).get0();

    file_output_stream_options stream_opts;
    stream_opts.buffer_size = 4096;
    auto out = file_writer(std::move(digest_file), std::move(stream_opts));

    auto digest_str = to_sstring<bytes>(full_checksum);
    write(out, digest_str);
    out.close().get();
}
|
|
|
|
// Per-shard caches used by the summary downsampling code, one slot per
// sampling level up to BASE_SAMPLING_LEVEL.
thread_local std::array<std::vector<int>, downsampling::BASE_SAMPLING_LEVEL> downsampling::_sample_pattern_cache;
thread_local std::array<std::vector<int>, downsampling::BASE_SAMPLING_LEVEL> downsampling::_original_index_cache;
|
|
|
|
// Reads the index entries covered by the summary entry at summary_idx.
// do_with keeps the index_reader alive for the whole operation, and the
// finally() ensures it is closed before the reader is destroyed.
future<index_list> sstable::read_indexes(uint64_t summary_idx, const io_priority_class& pc) {
    return do_with(get_index_reader(pc), [summary_idx] (index_reader& ir) {
        return ir.get_index_entries(summary_idx).finally([&ir] {
            return ir.close();
        });
    });
}
|
|
|
|
// Creates an index_reader over this sstable's Index component. The reader
// holds a shared pointer to this sstable, keeping it alive for the
// reader's lifetime.
index_reader sstable::get_index_reader(const io_priority_class& pc) {
    return index_reader(shared_from_this(), pc);
}
|
|
|
|
template <sstable::component_type Type, typename T>
|
|
future<> sstable::read_simple(T& component, const io_priority_class& pc) {
|
|
|
|
auto file_path = filename(Type);
|
|
sstlog.debug(("Reading " + _component_map[Type] + " file {} ").c_str(), file_path);
|
|
return open_file_dma(file_path, open_flags::ro).then([this, &component] (file fi) {
|
|
auto f = make_checked_file(_read_error_handler, fi);
|
|
auto r = make_lw_shared<file_random_access_reader>(std::move(f), sstable_buffer_size);
|
|
auto fut = parse(*r, component);
|
|
return fut.finally([r = std::move(r)] {
|
|
return r->close();
|
|
}).then([r] {});
|
|
}).then_wrapped([this, file_path] (future<> f) {
|
|
try {
|
|
f.get();
|
|
} catch (std::system_error& e) {
|
|
if (e.code() == std::error_code(ENOENT, std::system_category())) {
|
|
throw malformed_sstable_exception(file_path + ": file not found");
|
|
}
|
|
}
|
|
});
|
|
}
|
|
|
|
// Generic writer for a simple (fully in-memory) sstable component:
// serializes `component` into the file for component type `Type`. The
// file is created exclusively, so it must not already exist. Must run
// inside a seastar thread (uses .get0()/.get()).
template <sstable::component_type Type, typename T>
void sstable::write_simple(const T& component, const io_priority_class& pc) {
    auto file_path = filename(Type);
    sstlog.debug(("Writing " + _component_map[Type] + " file {} ").c_str(), file_path);
    file f = new_sstable_component_file(_write_error_handler, file_path, open_flags::wo | open_flags::create | open_flags::exclusive).get0();

    file_output_stream_options options;
    options.buffer_size = sstable_buffer_size;
    options.io_priority_class = pc;
    auto w = file_writer(std::move(f), std::move(options));
    write(w, component);
    w.flush().get();
    w.close().get();
}
|
|
|
|
// Explicit instantiations of read_simple/write_simple for the Filter
// component, so they can be used from other translation units.
template future<> sstable::read_simple<sstable::component_type::Filter>(sstables::filter& f, const io_priority_class& pc);
template void sstable::write_simple<sstable::component_type::Filter>(const sstables::filter& f, const io_priority_class& pc);
|
|
|
|
// Reads the CompressionInfo component, if this sstable has one; no-op
// for uncompressed sstables.
future<> sstable::read_compression(const io_priority_class& pc) {
    // FIXME: If there is no compression, we should expect a CRC file to be present.
    if (has_component(sstable::component_type::CompressionInfo)) {
        return read_simple<component_type::CompressionInfo>(_components->compression, pc);
    }
    return make_ready_future<>();
}
|
|
|
|
void sstable::write_compression(const io_priority_class& pc) {
|
|
if (!has_component(sstable::component_type::CompressionInfo)) {
|
|
return;
|
|
}
|
|
|
|
write_simple<component_type::CompressionInfo>(_components->compression, pc);
|
|
}
|
|
|
|
// Validates the min/max column-name (clustering) metadata stored in the
// Stats section of the Statistics component. SStables written by other
// implementations or older versions may carry values that don't match this
// schema; when any inconsistency is detected the metadata is cleared (an
// empty min/max is always safe) rather than trusted.
void sstable::validate_min_max_metadata() {
    auto entry = _components->statistics.contents.find(metadata_type::Stats);
    if (entry == _components->statistics.contents.end()) {
        throw std::runtime_error("Stats metadata not available");
    }
    auto& p = entry->second;
    if (!p) {
        throw std::runtime_error("Statistics is malformed");
    }

    stats_metadata& s = *static_cast<stats_metadata *>(p.get());
    // True when `b` parses as a well-formed composite whose serialized
    // components exactly cover the buffer.
    auto is_composite_valid = [] (const bytes& b) {
        auto v = composite_view(b);
        try {
            size_t s = 0;
            for (auto& c : v.components()) {
                s += c.first.size() + sizeof(composite::size_type) + sizeof(composite::eoc_type);
            }
            return s == b.size();
        } catch (marshal_exception&) {
            return false;
        }
    };
    auto clear_incorrect_min_max_column_names = [&s] {
        s.min_column_names.elements.clear();
        s.max_column_names.elements.clear();
    };
    auto& min_column_names = s.min_column_names.elements;
    auto& max_column_names = s.max_column_names.elements;

    // Nothing to validate; empty metadata is always acceptable.
    if (min_column_names.empty() && max_column_names.empty()) {
        return;
    }

    // The min/max metadata is wrong if:
    // 1) it's not empty and schema defines no clustering key.
    // 2) their size differ.
    // 3) column name is stored instead of clustering value.
    // 4) clustering component is stored as composite.
    if ((!_schema->clustering_key_size() && (min_column_names.size() || max_column_names.size())) ||
            (min_column_names.size() != max_column_names.size())) {
        clear_incorrect_min_max_column_names();
        return;
    }

    for (auto i = 0U; i < min_column_names.size(); i++) {
        // Case 3 above: a column defined in the schema was stored where a
        // clustering value was expected.
        if (_schema->get_column_definition(min_column_names[i].value) || _schema->get_column_definition(max_column_names[i].value)) {
            clear_incorrect_min_max_column_names();
            break;
        }

        // Case 4 above: the component parses as a full composite.
        if (_schema->is_compound() && _schema->clustering_key_size() > 1 && _schema->is_dense() &&
                (is_composite_valid(min_column_names[i].value) || is_composite_valid(max_column_names[i].value))) {
            clear_incorrect_min_max_column_names();
            break;
        }
    }
}
|
|
|
|
// Precomputes, for each clustering component, the inclusive [min, max]
// range covered by this sstable, based on the min/max column names from
// the Stats metadata (already sanitized by validate_min_max_metadata()).
void sstable::set_clustering_components_ranges() {
    if (!_schema->clustering_key_size()) {
        return;
    }
    auto& min_column_names = get_stats_metadata().min_column_names.elements;
    auto& max_column_names = get_stats_metadata().max_column_names.elements;

    // The two lists are normally the same length; take the common prefix.
    auto s = std::min(min_column_names.size(), max_column_names.size());
    _clustering_components_ranges.reserve(s);
    for (auto i = 0U; i < s; i++) {
        // Both bounds are inclusive.
        auto r = nonwrapping_range<bytes_view>({{ min_column_names[i].value, true }}, {{ max_column_names[i].value, true }});
        _clustering_components_ranges.push_back(std::move(r));
    }
}
|
|
|
|
// Accessor for the per-clustering-component ranges computed by
// set_clustering_components_ranges().
const std::vector<nonwrapping_range<bytes_view>>& sstable::clustering_components_ranges() const {
    return _clustering_components_ranges;
}
|
|
|
|
// Reads the Statistics component into _components->statistics.
future<> sstable::read_statistics(const io_priority_class& pc) {
    return read_simple<component_type::Statistics>(_components->statistics, pc);
}
|
|
|
|
// Writes the Statistics component from _components->statistics.
void sstable::write_statistics(const io_priority_class& pc) {
    write_simple<component_type::Statistics>(_components->statistics, pc);
}
|
|
|
|
// Rewrites the Statistics component of an existing sstable: the new
// content is written to a temporary file which is then atomically renamed
// over the old component. Must run inside a seastar thread.
void sstable::rewrite_statistics(const io_priority_class& pc) {
    auto file_path = filename(component_type::TemporaryStatistics);
    sstlog.debug("Rewriting statistics component of sstable {}", get_filename());
    // truncate rather than exclusive: a leftover temporary file from a
    // previously failed rewrite is simply overwritten.
    file f = new_sstable_component_file(_write_error_handler, file_path, open_flags::wo | open_flags::create | open_flags::truncate).get0();

    file_output_stream_options options;
    options.buffer_size = sstable_buffer_size;
    options.io_priority_class = pc;
    auto w = file_writer(std::move(f), std::move(options));
    write(w, _components->statistics);
    w.flush().get();
    w.close().get();
    // rename() guarantees atomicity when renaming a file into place.
    sstable_write_io_check(rename_file, file_path, filename(component_type::Statistics)).get();
}
|
|
|
|
// Loads the Summary component, regenerating it from the Index when it is
// missing or unreadable. No-op if the summary is already loaded.
future<> sstable::read_summary(const io_priority_class& pc) {
    if (_components->summary) {
        return make_ready_future<>();
    }

    // read_toc() is needed first so has_component() below is meaningful.
    return read_toc().then([this, &pc] {
        // We'll try to keep the main code path exception free, but if an exception does happen
        // we can try to regenerate the Summary.
        if (has_component(sstable::component_type::Summary)) {
            return read_simple<component_type::Summary>(_components->summary, pc).handle_exception([this, &pc] (auto ep) {
                sstlog.warn("Couldn't read summary file {}: {}. Recreating it.", this->filename(component_type::Summary), ep);
                return this->generate_summary(pc);
            });
        } else {
            return generate_summary(pc);
        }
    });
}
|
|
|
|
// Opens the Index and Data files for reading and then derives the
// metadata that depends on them (sizes, keys, disk usage) via
// update_info_for_opened_data().
future<> sstable::open_data() {
    return when_all(open_checked_file_dma(_read_error_handler, filename(component_type::Index), open_flags::ro),
                    open_checked_file_dma(_read_error_handler, filename(component_type::Data), open_flags::ro))
                    .then([this] (auto files) {
        _index_file = std::get<file>(std::get<0>(files).get());
        _data_file = std::get<file>(std::get<1>(files).get());
        return this->update_info_for_opened_data();
    });
}
|
|
|
|
// Fills in metadata derived from the already-opened Index/Data files:
// data and index sizes, data write time, clustering ranges, first/last
// keys, and the total bytes this sstable occupies on disk.
future<> sstable::update_info_for_opened_data() {
    return _data_file.stat().then([this] (struct stat st) {
        if (this->has_component(sstable::component_type::CompressionInfo)) {
            // For compressed sstables the compression metadata tracks the
            // on-disk (compressed) size.
            _components->compression.update(st.st_size);
        } else {
            _data_file_size = st.st_size;
        }
        _data_file_write_time = db_clock::from_time_t(st.st_mtime);
    }).then([this] {
        return _index_file.size().then([this] (auto size) {
            _index_file_size = size;
        });
    }).then([this] {
        this->set_clustering_components_ranges();
        this->set_first_and_last_keys();

        // Get disk usage for this sstable (includes all components).
        _bytes_on_disk = 0;
        return do_for_each(_recognized_components, [this] (component_type c) {
            return this->sstable_write_io_check([&] {
                return engine().file_size(this->filename(c));
            }).then([this] (uint64_t bytes) {
                _bytes_on_disk += bytes;
            });
        });
    });
}
|
|
|
|
// Creates the Index and Data files for a new sstable (exclusive creation:
// they must not already exist) and stores their handles in this object.
future<> sstable::create_data() {
    auto oflags = open_flags::wo | open_flags::create | open_flags::exclusive;
    file_open_options opt;
    // Pre-allocate in large extents to reduce fragmentation while writing.
    opt.extent_allocation_size_hint = 32 << 20;
    opt.sloppy_size = true;
    return when_all(new_sstable_component_file(_write_error_handler, filename(component_type::Index), oflags, opt),
                    new_sstable_component_file(_write_error_handler, filename(component_type::Data), oflags, opt)).then([this] (auto files) {
        // FIXME: If both files could not be created, the first get below will
        // throw an exception, and second get() will not be attempted, and
        // we'll get a warning about the second future being destructed
        // without its exception being examined.
        _index_file = std::get<file>(std::get<0>(files).get());
        _data_file = std::get<file>(std::get<1>(files).get());
    });
}
|
|
|
|
// This interface is only used during tests, snapshot loading and early initialization.
|
|
// No need to set tunable priorities for it.
|
|
// Fully loads this sstable: TOC, statistics, compression info, scylla
// metadata, filter, summary, and finally the index/data files themselves.
// Components are loaded in dependency order (e.g. statistics before the
// min/max validation that reads them).
future<> sstable::load() {
    return read_toc().then([this] {
        return read_statistics(default_priority_class());
    }).then([this] {
        validate_min_max_metadata();
        set_clustering_components_ranges();
        return read_compression(default_priority_class());
    }).then([this] {
        return read_scylla_metadata(default_priority_class());
    }).then([this] {
        return read_filter(default_priority_class());
    }).then([this] {
        // (Removed a stray empty statement that used to start this lambda.)
        return read_summary(default_priority_class());
    }).then([this] {
        return open_data();
    });
}
|
|
|
|
// Loads this sstable from components already opened on another shard:
// adopts the shared components and duplicates the foreign file handles,
// wrapping them in checked files for error accounting.
future<> sstable::load(sstables::foreign_sstable_open_info info) {
    return read_toc().then([this, info = std::move(info)] () mutable {
        _components = std::move(info.components);
        _data_file = make_checked_file(_read_error_handler, info.data.to_file());
        _index_file = make_checked_file(_read_error_handler, info.index.to_file());
        validate_min_max_metadata();
        return update_info_for_opened_data();
    });
}
|
|
|
|
// Opens an sstable once and packages its components, owning shards and
// file handles into an sstable_open_info that can be shared with other
// shards (see load(foreign_sstable_open_info)).
future<sstable_open_info> sstable::load_shared_components(const schema_ptr& s, sstring dir, int generation, version_types v, format_types f) {
    auto sst = make_lw_shared<sstables::sstable>(s, dir, generation, v, f);
    return sst->load().then([sst] () mutable {
        auto shards = sst->get_shards_for_this_sstable();
        // Move the loaded state out of the temporary sstable object; `sst`
        // is only kept alive until this continuation completes.
        auto info = sstable_open_info{make_lw_shared<shareable_components>(std::move(*sst->_components)),
            std::move(shards), std::move(sst->_data_file), std::move(sst->_index_file)};
        return make_ready_future<sstable_open_info>(std::move(info));
    });
}
|
|
|
|
// Appends one promoted-index entry to `promoted_index`:
//   <u16 len><first_col><u16 len><last_col><u64 offset><u64 width>
// All integers are serialized big-endian.
static void output_promoted_index_entry(bytes_ostream& promoted_index,
        const bytes& first_col,
        const bytes& last_col,
        uint64_t offset, uint64_t width) {
    auto put_u16 = [&promoted_index] (uint16_t v) {
        char buf[2];
        write_be(buf, v);
        promoted_index.write(buf, 2);
    };
    auto put_u64 = [&promoted_index] (uint64_t v) {
        char buf[8];
        write_be(buf, v);
        promoted_index.write(buf, 8);
    };
    put_u16(uint16_t(first_col.size()));
    promoted_index.write(first_col);
    put_u16(uint16_t(last_col.size()));
    promoted_index.write(last_col);
    put_u64(offset);
    put_u64(width);
}
|
|
|
|
// FIXME: use this in write_column_name() instead of repeating the code
|
|
// Serializes a full cell name: the clustering-key prefix followed by the
// given column-name components with the EOC marker appended.
// FIXME: use this in write_column_name() instead of repeating the code
static bytes serialize_colname(const composite& clustering_key,
        const std::vector<bytes_view>& column_names, composite::eoc marker) {
    auto c = composite::from_exploded(column_names, marker);
    auto ck_bview = bytes_view(clustering_key);
    // The marker is not a component, so if the last component is empty (IOW,
    // only serializes to the marker), then we just replace the key's last byte
    // with the marker. If the component however it is not empty, then the
    // marker should be in the end of it, and we just join them together as we
    // do for any normal component
    if (c.size() == 1) {
        ck_bview.remove_suffix(1);
    }
    size_t sz = ck_bview.size() + c.size();
    // On-disk cell names are length-prefixed with a uint16_t.
    if (sz > std::numeric_limits<uint16_t>::max()) {
        throw std::runtime_error(sprint("Column name too large (%d > %d)", sz, std::numeric_limits<uint16_t>::max()));
    }
    bytes colname(bytes::initialized_later(), sz);
    std::copy(ck_bview.begin(), ck_bview.end(), colname.begin());
    std::copy(c.get_bytes().begin(), c.get_bytes().end(), colname.begin() + ck_bview.size());
    return colname;
}
|
|
|
|
// Call maybe_flush_pi_block() before writing the given sstable atom to the
|
|
// output. This may start a new promoted-index block depending on how much
|
|
// data we've already written since the start of the current block. Starting
|
|
// a new block involves both outputting the range of the old block to the
|
|
// index file, and outputting again the currently-open range tombstones to
|
|
// the data file.
|
|
// TODO: currently, maybe_flush_pi_block serializes the column name on every
|
|
// call, saving it in _pi_write.block_last_colname which we need for closing
|
|
// each block, as well as for closing the last block. We could instead save
|
|
// just the unprocessed arguments, and serialize them only when needed at the
|
|
// end of the block. For this we would need this function to take rvalue
|
|
// references (so data is moved in), and need not to use vector of byte_view
|
|
// (which might be gone later).
|
|
// Possibly closes the current promoted-index block and starts a new one,
// depending on how much data was written since the block began. See the
// long comment above for the overall design.
void sstable::maybe_flush_pi_block(file_writer& out,
        const composite& clustering_key,
        const std::vector<bytes_view>& column_names) {
    bytes colname = serialize_colname(clustering_key, column_names, composite::eoc::none);
    if (_pi_write.block_first_colname.empty()) {
        // This is the first column in the partition, or first column since we
        // closed a promoted-index block. Remember its name and position -
        // we'll need to write it to the promoted index.
        _pi_write.block_start_offset = out.offset();
        _pi_write.block_next_start_offset = out.offset() + _pi_write.desired_block_size;
        _pi_write.block_first_colname = colname;
        _pi_write.block_last_colname = std::move(colname);
    } else if (out.offset() >= _pi_write.block_next_start_offset) {
        // If we wrote enough bytes to the partition since we output a sample
        // to the promoted index, output one now and start a new one.
        output_promoted_index_entry(_pi_write.data,
            _pi_write.block_first_colname,
            _pi_write.block_last_colname,
            _pi_write.block_start_offset - _c_stats.start_offset,
            out.offset() - _pi_write.block_start_offset);
        _pi_write.numblocks++;
        _pi_write.block_start_offset = out.offset();
        // Because the new block can be read without the previous blocks, we
        // need to repeat the range tombstones which are still open.
        // Note that block_start_offset is before outputting those (so the new
        // block includes them), but we set block_next_start_offset after - so
        // even if we wrote a lot of open tombstones, we still get a full
        // block size of new data.
        if (!clustering_key.empty()) {
            auto& rts = _pi_write.tombstone_accumulator->range_tombstones_for_row(
                    clustering_key_prefix::from_range(clustering_key.values()));
            for (const auto& rt : rts) {
                auto start = composite::from_clustering_element(*_pi_write.schemap, rt.start);
                auto end = composite::from_clustering_element(*_pi_write.schemap, rt.end);
                write_range_tombstone(out,
                        start, rt.start_kind, end, rt.end_kind, {}, rt.tomb);
            }
        }
        _pi_write.block_next_start_offset = out.offset() + _pi_write.desired_block_size;
        _pi_write.block_first_colname = colname;
        _pi_write.block_last_colname = std::move(colname);
    } else {
        // Keep track of the last column in the partition - we'll need it to close
        // the last block in the promoted index, unfortunately.
        _pi_write.block_last_colname = std::move(colname);
    }
}
|
|
|
|
// Writes a cell name to the data file: the clustering-key prefix joined
// with the given column-name components and terminated by the EOC marker.
// NOTE(review): this duplicates serialize_colname() above (see its FIXME).
void sstable::write_column_name(file_writer& out, const composite& clustering_key, const std::vector<bytes_view>& column_names, composite::eoc marker) {
    auto c = composite::from_exploded(column_names, marker);
    auto ck_bview = bytes_view(clustering_key);

    // The marker is not a component, so if the last component is empty (IOW,
    // only serializes to the marker), then we just replace the key's last byte
    // with the marker. If the component however it is not empty, then the
    // marker should be in the end of it, and we just join them together as we
    // do for any normal component
    if (c.size() == 1) {
        ck_bview.remove_suffix(1);
    }
    size_t sz = ck_bview.size() + c.size();
    // The on-disk format length-prefixes the name with a uint16_t.
    if (sz > std::numeric_limits<uint16_t>::max()) {
        throw std::runtime_error(sprint("Column name too large (%d > %d)", sz, std::numeric_limits<uint16_t>::max()));
    }
    uint16_t sz16 = sz;
    write(out, sz16, ck_bview, c);
}
|
|
|
|
// Writes a pre-serialized cell name, length-prefixed with a uint16_t as
// required by the on-disk format.
void sstable::write_column_name(file_writer& out, bytes_view column_names) {
    const size_t len = column_names.size();
    if (len > std::numeric_limits<uint16_t>::max()) {
        throw std::runtime_error(sprint("Column name too large (%d > %d)", len, std::numeric_limits<uint16_t>::max()));
    }
    const uint16_t len16 = len;
    write(out, len16, column_names);
}
|
|
|
|
|
|
// Folds one cell's timestamp into the running column statistics and
// bumps the cell counter.
static inline void update_cell_stats(column_stats& c_stats, uint64_t timestamp) {
    c_stats.update_max_timestamp(timestamp);
    c_stats.update_min_timestamp(timestamp);
    ++c_stats.column_count;
}
|
|
|
|
// Intended to write all cell components that follow column name.
|
|
// Intended to write all cell components that follow column name.
// Dispatches on the cell's state: tombstone, counter, expiring, or
// regular, each with its own mask and on-disk layout. Also updates the
// collected column statistics for this sstable.
void sstable::write_cell(file_writer& out, atomic_cell_view cell, const column_definition& cdef) {
    uint64_t timestamp = cell.timestamp();

    update_cell_stats(_c_stats, timestamp);

    if (cell.is_dead(_now)) {
        // tombstone cell

        column_mask mask = column_mask::deletion;
        uint32_t deletion_time_size = sizeof(uint32_t);
        uint32_t deletion_time = cell.deletion_time().time_since_epoch().count();

        _c_stats.update_max_local_deletion_time(deletion_time);
        _c_stats.tombstone_histogram.update(deletion_time);

        write(out, mask, timestamp, deletion_time_size, deletion_time);
    } else if (cdef.is_counter()) {
        // counter cell
        // Counter updates must have been applied before reaching here.
        assert(!cell.is_counter_update());

        column_mask mask = column_mask::counter;
        write(out, mask, int64_t(0), timestamp);

        counter_cell_view ccv(cell);
        auto shard_count = ccv.shard_count();

        static constexpr auto header_entry_size = sizeof(int16_t);
        static constexpr auto counter_shard_size = 32u; // counter_id: 16 + clock: 8 + value: 8
        auto total_size = sizeof(int16_t) + shard_count * (header_entry_size + counter_shard_size);

        write(out, int32_t(total_size), int16_t(shard_count));
        for (auto i = 0u; i < shard_count; i++) {
            write<int16_t>(out, std::numeric_limits<int16_t>::min() + i);
        }
        for (auto&& s : ccv.shards()) {
            auto uuid = s.id().to_uuid();
            write(out, int64_t(uuid.get_most_significant_bits()),
                  int64_t(uuid.get_least_significant_bits()),
                  int64_t(s.logical_clock()), int64_t(s.value()));
        }

        // Counters never expire.
        _c_stats.update_max_local_deletion_time(std::numeric_limits<int>::max());
    } else if (cell.is_live_and_has_ttl()) {
        // expiring cell

        column_mask mask = column_mask::expiration;
        uint32_t ttl = cell.ttl().count();
        uint32_t expiration = cell.expiry().time_since_epoch().count();
        disk_string_view<uint32_t> cell_value { cell.value() };

        _c_stats.update_max_local_deletion_time(expiration);

        write(out, mask, ttl, expiration, timestamp, cell_value);
    } else {
        // regular cell

        column_mask mask = column_mask::none;
        disk_string_view<uint32_t> cell_value { cell.value() };

        // Live non-expiring cells have no meaningful deletion time.
        _c_stats.update_max_local_deletion_time(std::numeric_limits<int>::max());

        write(out, mask, timestamp, cell_value);
    }
}
|
|
|
|
// Writes the row-marker pseudo-cell (an empty-named cell carrying the
// row's liveness info) at the beginning of a clustered row. Nothing is
// written for a missing marker.
void sstable::write_row_marker(file_writer& out, const row_marker& marker, const composite& clustering_key) {
    if (marker.is_missing()) {
        return;
    }

    // Write row mark cell to the beginning of clustered row.
    write_column_name(out, clustering_key, { bytes_view() });
    uint64_t timestamp = marker.timestamp();
    uint32_t value_length = 0;

    update_cell_stats(_c_stats, timestamp);

    if (marker.is_dead(_now)) {
        column_mask mask = column_mask::deletion;
        uint32_t deletion_time_size = sizeof(uint32_t);
        uint32_t deletion_time = marker.deletion_time().time_since_epoch().count();

        // NOTE(review): unlike the dead-cell branch of write_cell(), this
        // does not call _c_stats.update_max_local_deletion_time() — confirm
        // whether that is intentional.
        _c_stats.tombstone_histogram.update(deletion_time);

        write(out, mask, timestamp, deletion_time_size, deletion_time);
    } else if (marker.is_expiring()) {
        column_mask mask = column_mask::expiration;
        uint32_t ttl = marker.ttl().count();
        uint32_t expiration = marker.expiry().time_since_epoch().count();
        write(out, mask, ttl, expiration, timestamp, value_length);
    } else {
        column_mask mask = column_mask::none;
        write(out, mask, timestamp, value_length);
    }
}
|
|
|
|
// Writes a range tombstone: start name, range_tombstone mask, end name,
// then the deletion time and timestamp. `suffix` components are appended
// to both bound names. A null tombstone writes nothing.
void sstable::write_range_tombstone(file_writer& out,
        const composite& start,
        bound_kind start_kind,
        const composite& end,
        bound_kind end_kind,
        std::vector<bytes_view> suffix,
        const tombstone t) {
    if (!t) {
        return;
    }

    // An exclusive start is encoded with an `end` EOC marker (and vice
    // versa for the end bound below).
    auto start_marker = start_kind == bound_kind::excl_start
                      ? composite::eoc::end
                      : composite::eoc::start;
    write_column_name(out, start, suffix, start_marker);
    column_mask mask = column_mask::range_tombstone;
    write(out, mask);
    auto end_marker = end_kind == bound_kind::excl_end
                    ? composite::eoc::start
                    : composite::eoc::end;
    write_column_name(out, end, suffix, end_marker);
    uint64_t timestamp = t.timestamp;
    uint32_t deletion_time = t.deletion_time.time_since_epoch().count();

    update_cell_stats(_c_stats, timestamp);
    _c_stats.update_max_local_deletion_time(deletion_time);
    _c_stats.tombstone_histogram.update(deletion_time);

    write(out, deletion_time, timestamp);
}
|
|
|
|
// Writes a collection column: the collection-level tombstone first (as a
// range tombstone over the collection's cells), then each element as an
// individual cell named <clustering key, column name, element key>.
void sstable::write_collection(file_writer& out, const composite& clustering_key, const column_definition& cdef, collection_mutation_view collection) {

    auto t = static_pointer_cast<const collection_type_impl>(cdef.type);
    auto mview = t->deserialize_mutation_form(collection);
    const bytes& column_name = cdef.name();
    write_range_tombstone(out, clustering_key, clustering_key, { bytes_view(column_name) }, mview.tomb);
    for (auto& cp: mview.cells) {
        maybe_flush_pi_block(out, clustering_key, { column_name, cp.first });
        write_column_name(out, clustering_key, { column_name, cp.first });
        write_cell(out, cp.second, cdef);
    }
}
|
|
|
|
// This function is about writing a clustered_row to data file according to SSTables format.
|
|
// clustered_row contains a set of cells sharing the same clustering key.
|
|
// This function is about writing a clustered_row to data file according to SSTables format.
// clustered_row contains a set of cells sharing the same clustering key.
// Cell names are built differently for the four combinations of
// compound/dense schemas; see the per-branch code below.
void sstable::write_clustered_row(file_writer& out, const schema& schema, const clustering_row& clustered_row) {
    auto clustering_key = composite::from_clustering_element(schema, clustered_row.key());

    if (schema.is_compound() && !schema.is_dense()) {
        maybe_flush_pi_block(out, clustering_key, { bytes_view() });
        write_row_marker(out, clustered_row.marker(), clustering_key);
    }
    // Before writing cells, range tombstone must be written if the row has any (deletable_row::t).
    if (clustered_row.tomb()) {
        maybe_flush_pi_block(out, clustering_key, {});
        write_range_tombstone(out, clustering_key, clustering_key, {}, clustered_row.tomb());
        // Because we currently may break a partition to promoted-index blocks
        // in the middle of a clustered row, we also need to track the current
        // row's tombstone - not just range tombstones - which may effect the
        // beginning of a new block.
        // TODO: consider starting a new block only between rows, so the
        // following code can be dropped:
        _pi_write.tombstone_accumulator->apply(range_tombstone(
                clustered_row.key(), bound_kind::incl_start,
                clustered_row.key(), bound_kind::incl_end, clustered_row.tomb()));
    }

    if (schema.clustering_key_size()) {
        // Track min/max clustering values for the Stats metadata.
        column_name_helper::min_max_components(schema, _collector.min_column_names(), _collector.max_column_names(),
            clustered_row.key().components());
    }

    // Write all cells of a partition's row.
    clustered_row.cells().for_each_cell([&] (column_id id, const atomic_cell_or_collection& c) {
        auto&& column_definition = schema.regular_column_at(id);
        // non atomic cell isn't supported yet. atomic cell maps to a single trift cell.
        // non atomic cell maps to multiple trift cell, e.g. collection.
        if (!column_definition.is_atomic()) {
            write_collection(out, clustering_key, column_definition, c.as_collection_mutation());
            return;
        }
        assert(column_definition.is_regular());
        atomic_cell_view cell = c.as_atomic_cell();
        const bytes& column_name = column_definition.name();

        if (schema.is_compound()) {
            if (schema.is_dense()) {
                // Dense compound: the cell name is just the clustering key.
                maybe_flush_pi_block(out, composite(), { bytes_view(clustering_key) });
                write_column_name(out, bytes_view(clustering_key));
            } else {
                // Sparse compound: clustering key plus the column name.
                maybe_flush_pi_block(out, clustering_key, { bytes_view(column_name) });
                write_column_name(out, clustering_key, { bytes_view(column_name) });
            }
        } else {
            if (schema.is_dense()) {
                // Dense non-compound: the single clustering component.
                maybe_flush_pi_block(out, composite(), { bytes_view(clustered_row.key().get_component(schema, 0)) });
                write_column_name(out, bytes_view(clustered_row.key().get_component(schema, 0)));
            } else {
                // Sparse non-compound: just the column name.
                maybe_flush_pi_block(out, composite(), { bytes_view(column_name) });
                write_column_name(out, bytes_view(column_name));
            }
        }
        write_cell(out, cell, column_definition);
    });
}
|
|
|
|
// Writes a partition's static row: each static cell is named with the
// schema's static prefix followed by the column name; collections are
// handled by write_collection().
void sstable::write_static_row(file_writer& out, const schema& schema, const row& static_row) {
    static_row.for_each_cell([&] (column_id id, const atomic_cell_or_collection& c) {
        auto&& column_definition = schema.static_column_at(id);
        if (!column_definition.is_atomic()) {
            auto sp = composite::static_prefix(schema);
            write_collection(out, sp, column_definition, c.as_collection_mutation());
            return;
        }
        assert(column_definition.is_static());
        atomic_cell_view cell = c.as_atomic_cell();
        auto sp = composite::static_prefix(schema);
        maybe_flush_pi_block(out, sp, { bytes_view(column_definition.name()) });
        write_column_name(out, sp, { bytes_view(column_definition.name()) });
        write_cell(out, cell, column_definition);
    });
}
|
|
|
|
// Writes one index entry header: the partition key followed by the
// partition's position in the Data file.
static void write_index_header(file_writer& out, disk_string_view<uint16_t>& key, uint64_t pos) {
    write(out, key, pos);
}
|
|
|
|
// Writes the promoted-index section of an index entry. An empty promoted
// index is encoded as just a zero size.
static void write_index_promoted(file_writer& out, bytes_ostream& promoted_index,
        deletion_time deltime, uint32_t numblocks) {
    const uint32_t body_size = promoted_index.size();
    if (body_size == 0) {
        write(out, body_size);
    } else {
        // The serialized size also covers the deletion time and block
        // count (16 bytes total) that precede the index body.
        const uint32_t promoted_index_size = body_size + 16;
        write(out, promoted_index_size, deltime, numblocks, promoted_index);
    }
}
|
|
|
|
// Initializes an in-memory summary for writing. One entry will later be
// added for every min_index_interval-th partition (see
// maybe_add_summary_entry()).
static void prepare_summary(summary& s, uint64_t expected_partition_count, uint32_t min_index_interval) {
    assert(expected_partition_count >= 1);

    s.header.min_index_interval = min_index_interval;
    s.header.sampling_level = downsampling::BASE_SAMPLING_LEVEL;
    // Ceiling division: expected number of sampled summary entries.
    uint64_t max_expected_entries =
            (expected_partition_count / min_index_interval) +
            !!(expected_partition_count % min_index_interval);
    // FIXME: handle case where max_expected_entries is greater than max value stored by uint32_t.
    // (For now we refuse to write such a summary rather than truncate it.)
    if (max_expected_entries > std::numeric_limits<uint32_t>::max()) {
        throw malformed_sstable_exception("Current sampling level (" + to_sstring(downsampling::BASE_SAMPLING_LEVEL) + ") not enough to generate summary.");
    }

    s.keys_written = 0;
    s.header.memory_size = 0;
}
|
|
|
|
// Finalizes the in-memory summary after all entries were added: fills in
// the header counters, computes each entry's in-memory offset, and records
// the first/last partition keys.
static void seal_summary(summary& s,
        std::experimental::optional<key>&& first_key,
        std::experimental::optional<key>&& last_key) {
    s.header.size = s.entries.size();
    s.header.size_at_full_sampling = s.header.size;

    // memory_size = the offsets array (one uint32_t per entry) plus every
    // serialized entry (key bytes + position field).
    s.header.memory_size = s.header.size * sizeof(uint32_t);
    for (auto& e: s.entries) {
        s.positions.push_back(s.header.memory_size);
        s.header.memory_size += e.key.size() + sizeof(e.position);
    }
    assert(first_key); // assume non-empty sstable
    s.first_key.value = first_key->get_bytes();

    if (last_key) {
        s.last_key.value = last_key->get_bytes();
    } else {
        // An empty last_mutation indicates we had just one partition
        s.last_key.value = s.first_key.value;
    }
}
|
|
|
|
// Initializes the CompressionInfo component from the schema's compression
// parameters, before any data is written through the compressed stream.
static void prepare_compression(compression& c, const schema& schema) {
    const auto& cp = schema.get_compressor_params();
    c.set_compressor(cp.get_compressor());
    c.chunk_len = cp.chunk_length();
    c.data_len = 0;     // updated as chunks are written
    // FIXME: crc_check_chance can be configured by the user.
    // probability to verify the checksum of a compressed chunk we read.
    // defaults to 1.0.
    c.options.elements.push_back({"crc_check_chance", "1.0"});
    c.init_full_checksum();
}
|
|
|
|
// Maybe add a summary entry for this key: every min_index_interval-th
// partition key (counting from the first one) is sampled into the
// in-memory summary together with its index-file offset.
static void maybe_add_summary_entry(summary& s, bytes_view key, uint64_t offset) {
    auto sampled = (s.keys_written % s.header.min_index_interval) == 0;
    ++s.keys_written;
    if (sampled) {
        s.entries.push_back({ bytes(key.data(), key.size()), offset });
    }
}
|
|
|
|
// Computes, for each metadata blob in the Statistics component, its
// absolute offset within the serialized file, and stores those offsets in
// the leading disk_hash.
static
void
populate_statistics_offsets(statistics& s) {
    // copy into a sorted vector to guarantee consistent order
    auto types = boost::copy_range<std::vector<metadata_type>>(s.contents | boost::adaptors::map_keys);
    boost::sort(types);

    // populate the hash with garbage so we can calculate its size
    // (its serialized size depends only on the number of entries)
    for (auto t : types) {
        s.hash.map[t] = -1;
    }

    // The blobs are laid out right after the hash, in sorted type order.
    auto offset = serialized_size(s.hash);
    for (auto t : types) {
        s.hash.map[t] = offset;
        offset += s.contents[t]->serialized_size();
    }
}
|
|
|
|
// Builds the Scylla-specific sharding metadata: the token subranges of
// [first_key, last_key] that belong to the current shard, so a reader can
// later tell whether this sstable holds data owned by a given shard.
static
sharding_metadata
create_sharding_metadata(schema_ptr schema, const dht::decorated_key& first_key, const dht::decorated_key& last_key) {
    auto range = dht::partition_range::make(dht::ring_position(first_key), dht::ring_position(last_key));
    auto sharder = dht::ring_position_range_sharder(std::move(range));
    auto sm = sharding_metadata();
    auto rpras = sharder.next(*schema);
    while (rpras) {
        if (rpras->shard == engine().cpu_id()) {
            // we know left/right are not infinite
            auto&& left = rpras->ring_range.start()->value();
            auto&& right = rpras->ring_range.end()->value();
            auto&& left_token = left.token();
            // A keyless "end" bound sorts after every key of its token, so
            // as a left edge it excludes that token; symmetrically a keyless
            // "start" bound as a right edge excludes its token.
            auto left_exclusive = !left.has_key() && left.bound() == dht::ring_position::token_bound::end;
            auto&& right_token = right.token();
            auto right_exclusive = !right.has_key() && right.bound() == dht::ring_position::token_bound::start;
            sm.token_ranges.elements.push_back({
                {left_exclusive, to_bytes(bytes_view(left_token._data))},
                {right_exclusive, to_bytes(bytes_view(right_token._data))}});
        }
        rpras = sharder.next(*schema);
    }
    return sm;
}
|
|
|
|
|
|
// In the beginning of the statistics file, there is a disk_hash used to
// map each metadata type to its correspondent position in the file.
// seal_statistics() builds the three metadata blobs (validation,
// compaction, stats) from the collector and then computes those offsets.
// NOTE(review): the schema/first_key/last_key parameters are currently
// unused in this function body.
static void seal_statistics(statistics& s, metadata_collector& collector,
        const sstring partitioner, double bloom_filter_fp_chance, schema_ptr schema,
        const dht::decorated_key& first_key, const dht::decorated_key& last_key) {
    validation_metadata validation;
    compaction_metadata compaction;
    stats_metadata stats;

    // Validation metadata: partitioner name + bloom filter fp chance.
    validation.partitioner.value = to_bytes(partitioner);
    validation.filter_chance = bloom_filter_fp_chance;
    s.contents[metadata_type::Validation] = std::make_unique<validation_metadata>(std::move(validation));

    collector.construct_compaction(compaction);
    s.contents[metadata_type::Compaction] = std::make_unique<compaction_metadata>(std::move(compaction));

    collector.construct_stats(stats);
    s.contents[metadata_type::Stats] = std::make_unique<stats_metadata>(std::move(stats));

    populate_statistics_offsets(s);
}
|
|
|
|
// Returns offset into data component.
|
|
size_t components_writer::get_offset() {
|
|
if (_sst.has_component(sstable::component_type::CompressionInfo)) {
|
|
// Variable returned by compressed_file_length() is constantly updated by compressed output stream.
|
|
return _sst._components->compression.compressed_file_length();
|
|
} else {
|
|
return _out.offset();
|
|
}
|
|
}
|
|
|
|
// Wraps the sstable's already-opened index file in a buffered file_writer
// configured for this write (buffer size, I/O priority, write-behind).
file_writer components_writer::index_file_writer(sstable& sst, const io_priority_class& pc) {
    file_output_stream_options opts;
    opts.buffer_size = sst.sstable_buffer_size;
    opts.io_priority_class = pc;
    opts.write_behind = 10;
    return file_writer(std::move(sst._index_file), std::move(opts));
}
|
|
|
|
// Get the currently loaded configuration, or the default configuration in
|
|
// case none has been loaded (this happens, for example, in unit tests).
|
|
static const db::config& get_config() {
|
|
if (service::get_storage_service().local_is_initialized() &&
|
|
service::get_local_storage_service().db().local_is_initialized()) {
|
|
return service::get_local_storage_service().db().local().get_config();
|
|
} else {
|
|
static db::config default_config;
|
|
return default_config;
|
|
}
|
|
}
|
|
|
|
// Sets up per-sstable write state: bloom filter sized for the expected
// partition count, promoted-index block size, and an empty summary.
components_writer::components_writer(sstable& sst, const schema& s, file_writer& out,
                                     uint64_t estimated_partitions,
                                     const sstable_writer_config& cfg,
                                     const io_priority_class& pc)
    : _sst(sst)
    , _schema(s)
    , _out(out)
    , _index(index_file_writer(sst, pc))
    , _max_sstable_size(cfg.max_sstable_size)
    , _tombstone_written(false)
{
    _sst._components->filter = utils::i_filter::get_filter(estimated_partitions, _schema.bloom_filter_fp_chance());
    // Explicit config wins; otherwise fall back to the cluster-wide
    // column_index_size_in_kb setting (converted to bytes).
    _sst._pi_write.desired_block_size = cfg.promoted_index_block_size.value_or(get_config().column_index_size_in_kb() * 1024);

    prepare_summary(_sst._components->summary, estimated_partitions, _schema.min_index_interval());

    // FIXME: we may need to set repaired_at stats at this point.
}
|
|
|
|
// Begins a new partition: records its data-file offset, feeds the key to
// the summary/filter/stats collectors, writes the index entry header, and
// resets the promoted-index state for the partition about to be written.
void components_writer::consume_new_partition(const dht::decorated_key& dk) {
    // Set current index of data to later compute row size.
    _sst._c_stats.start_offset = _out.offset();

    _partition_key = key::from_partition_key(_schema, dk.key());

    // Feed the key to the sampled summary, the bloom filter, and the
    // statistics collector.
    maybe_add_summary_entry(_sst._components->summary, bytes_view(*_partition_key), _index.offset());
    _sst._components->filter->add(bytes_view(*_partition_key));
    _sst._collector.add_key(bytes_view(*_partition_key));

    auto p_key = disk_string_view<uint16_t>();
    p_key.value = bytes_view(*_partition_key);

    // Write index file entry from partition key into index file.
    // Write an index entry minus the "promoted index" (sample of columns)
    // part. We can only write that after processing the entire partition
    // and collecting the sample of columns.
    write_index_header(_index, p_key, _out.offset());
    // Reset promoted-index accumulation for the new partition; the deltime
    // defaults below mean "live, no partition tombstone".
    _sst._pi_write.data = {};
    _sst._pi_write.numblocks = 0;
    _sst._pi_write.deltime.local_deletion_time = std::numeric_limits<int32_t>::max();
    _sst._pi_write.deltime.marked_for_delete_at = std::numeric_limits<int64_t>::min();
    _sst._pi_write.block_start_offset = _out.offset();
    _sst._pi_write.tombstone_accumulator = range_tombstone_accumulator(_schema, false);
    _sst._pi_write.schemap = &_schema; // sadly we need this

    // Write partition key into data file.
    write(_out, p_key);

    // The partition tombstone (or live default) is written lazily; see
    // ensure_tombstone_is_written() callers.
    _tombstone_written = false;
}
|
|
|
|
// Writes the partition-level tombstone (deletion time) into the data file
// and merges it into the column stats; a missing tombstone is encoded with
// the "live" sentinel values.
void components_writer::consume(tombstone t) {
    deletion_time d;

    if (t) {
        d.local_deletion_time = t.deletion_time.time_since_epoch().count();
        d.marked_for_delete_at = t.timestamp;

        _sst._c_stats.tombstone_histogram.update(d.local_deletion_time);
        _sst._c_stats.update_max_local_deletion_time(d.local_deletion_time);
        _sst._c_stats.update_min_timestamp(d.marked_for_delete_at);
        _sst._c_stats.update_max_timestamp(d.marked_for_delete_at);
    } else {
        // Default values for live, undeleted rows.
        d.local_deletion_time = std::numeric_limits<int32_t>::max();
        d.marked_for_delete_at = std::numeric_limits<int64_t>::min();
    }
    write(_out, d);
    _tombstone_written = true;
    // TODO: need to verify we don't do this twice?
    // The same deletion time is repeated in the promoted index header.
    _sst._pi_write.deltime = d;
}
|
|
|
|
// Writes the partition's static row. The partition tombstone must be on
// disk first, hence ensure_tombstone_is_written().
stop_iteration components_writer::consume(static_row&& sr) {
    ensure_tombstone_is_written();
    _sst.write_static_row(_out, _schema, sr.cells());
    return stop_iteration::no;
}
|
|
|
|
// Writes one clustering row, after making sure the partition tombstone
// (or its live placeholder) has been written.
stop_iteration components_writer::consume(clustering_row&& cr) {
    ensure_tombstone_is_written();
    _sst.write_clustered_row(_out, _schema, cr);
    return stop_iteration::no;
}
|
|
|
|
// Writes one range tombstone and tracks it for promoted-index purposes.
stop_iteration components_writer::consume(range_tombstone&& rt) {
    ensure_tombstone_is_written();
    // Remember the range tombstone so when we need to open a new promoted
    // index block, we can figure out which ranges are still open and need
    // to be repeated in the data file. Note that apply() also drops ranges
    // already closed by rt.start, so the accumulator doesn't grow boundless.
    _sst._pi_write.tombstone_accumulator->apply(rt);
    auto start = composite::from_clustering_element(_schema, std::move(rt.start));
    auto end = composite::from_clustering_element(_schema, std::move(rt.end));
    _sst.maybe_flush_pi_block(_out, start, {});
    _sst.write_range_tombstone(_out, std::move(start), rt.start_kind, std::move(end), rt.end_kind, {}, rt.tomb);
    return stop_iteration::no;
}
|
|
|
|
// Finishes the current partition: flushes the last promoted-index block,
// appends the promoted index to the index entry, writes the end-of-row
// marker, merges per-partition stats, and tracks first/last keys. Returns
// stop_iteration::yes once the sstable reached its configured size cap.
stop_iteration components_writer::consume_end_of_partition() {
    // If there is an incomplete block in the promoted index, write it too.
    // However, if the _promoted_index is still empty, don't add a single
    // chunk - better not output a promoted index at all in this case.
    if (!_sst._pi_write.data.empty() && !_sst._pi_write.block_first_colname.empty()) {
        output_promoted_index_entry(_sst._pi_write.data,
            _sst._pi_write.block_first_colname,
            _sst._pi_write.block_last_colname,
            _sst._pi_write.block_start_offset - _sst._c_stats.start_offset,
            _out.offset() - _sst._pi_write.block_start_offset);
        _sst._pi_write.numblocks++;
    }
    write_index_promoted(_index, _sst._pi_write.data, _sst._pi_write.deltime,
            _sst._pi_write.numblocks);
    _sst._pi_write.data = {};
    _sst._pi_write.block_first_colname = {};

    // Even an empty partition must carry its tombstone placeholder and the
    // end-of-row marker (a zero int16) in the data file.
    ensure_tombstone_is_written();
    int16_t end_of_row = 0;
    write(_out, end_of_row);

    // compute size of the current row.
    _sst._c_stats.row_size = _out.offset() - _sst._c_stats.start_offset;
    // update is about merging column_stats with the data being stored by collector.
    _sst._collector.update(_schema, std::move(_sst._c_stats));
    _sst._c_stats.reset();

    if (!_first_key) {
        _first_key = *_partition_key;
    }
    _last_key = std::move(*_partition_key);

    // Ask the caller to stop once we have written at least _max_sstable_size.
    return get_offset() < _max_sstable_size ? stop_iteration::no : stop_iteration::yes;
}
|
|
|
|
// Finishes the whole sstable write: seals the summary, closes the index
// writer, records the compression ratio (if compressed), and seals the
// Statistics component.
void components_writer::consume_end_of_stream() {
    seal_summary(_sst._components->summary, std::move(_first_key), std::move(_last_key)); // what if there is only one partition? what if it is empty?

    // Runs inside a seastar::thread (see write_components), so .get() is ok.
    _index.close().get();

    if (_sst.has_component(sstable::component_type::CompressionInfo)) {
        _sst._collector.add_compression_ratio(_sst._components->compression.compressed_file_length(), _sst._components->compression.uncompressed_file_length());
    }

    _sst.set_first_and_last_keys();
    seal_statistics(_sst._components->statistics, _sst._collector, dht::global_partitioner().name(), _schema.bloom_filter_fp_chance(),
            _sst._schema, _sst.get_first_decorated_key(), _sst.get_last_decorated_key());
}
|
|
|
|
// Flushes a memtable into this sstable. Convenience overload that builds
// the writer config (backup/leave_unsealed) and delegates to the
// mutation_reader-based write_components().
future<> sstable::write_components(memtable& mt, bool backup, const io_priority_class& pc, bool leave_unsealed) {
    // Preserve the memtable's replay position in the Stats metadata so the
    // commitlog can be trimmed after the flush.
    _collector.set_replay_position(mt.replay_position());
    sstable_writer_config cfg;
    cfg.backup = backup;
    cfg.leave_unsealed = leave_unsealed;
    return write_components(mt.make_flush_reader(mt.schema(), pc), mt.partition_count(), mt.schema(), cfg, pc);
}
|
|
|
|
// Loads the Scylla.db component, if present. Idempotent: once the optional
// is engaged (even with empty metadata) we never re-read.
future<>
sstable::read_scylla_metadata(const io_priority_class& pc) {
    if (_components->scylla_metadata) {
        return make_ready_future<>();
    }
    return read_toc().then([this, &pc] {
        _components->scylla_metadata.emplace(); // engaged optional means we won't try to re-read this again
        if (!has_component(component_type::Scylla)) {
            // Non-Scylla (or old) sstable: leave the metadata empty.
            return make_ready_future<>();
        }
        return read_simple<component_type::Scylla>(*_components->scylla_metadata, pc);
    });
}
|
|
|
|
// Writes the Scylla.db component containing this shard's sharding
// metadata, derived from the sstable's first/last keys.
void
sstable::write_scylla_metadata(const io_priority_class& pc) {
    auto&& first_key = get_first_decorated_key();
    auto&& last_key = get_last_decorated_key();
    auto sm = create_sharding_metadata(_schema, first_key, last_key);
    _components->scylla_metadata.emplace();
    _components->scylla_metadata->data.set<scylla_metadata_type::Sharding>(std::move(sm));

    write_simple<component_type::Scylla>(*_components->scylla_metadata, pc);
}
|
|
|
|
void sstable_writer::prepare_file_writer()
|
|
{
|
|
file_output_stream_options options;
|
|
options.io_priority_class = _pc;
|
|
options.buffer_size = _sst.sstable_buffer_size;
|
|
options.write_behind = 10;
|
|
|
|
if (!_compression_enabled) {
|
|
_writer = std::make_unique<checksummed_file_writer>(std::move(_sst._data_file), std::move(options), true);
|
|
} else {
|
|
prepare_compression(_sst._components->compression, _schema);
|
|
_writer = std::make_unique<file_writer>(make_compressed_file_output_stream(std::move(_sst._data_file), std::move(options), &_sst._components->compression));
|
|
}
|
|
}
|
|
|
|
// Closes the Data-component writer and emits the Digest (and, for
// uncompressed sstables, CRC) components from the accumulated checksums.
void sstable_writer::finish_file_writer()
{
    auto writer = std::move(_writer);
    writer->close().get();

    if (!_compression_enabled) {
        // The writer was constructed as a checksummed_file_writer in
        // prepare_file_writer(), so this downcast is safe.
        auto chksum_wr = static_cast<checksummed_file_writer*>(writer.get());
        write_digest(_sst._write_error_handler, _sst.filename(sstable::component_type::Digest), chksum_wr->full_checksum());
        write_crc(_sst._write_error_handler, _sst.filename(sstable::component_type::CRC), chksum_wr->finalize_checksum());
    } else {
        // Compressed path: the full checksum is tracked by the compression
        // metadata; no CRC component is written.
        write_digest(_sst._write_error_handler, _sst.filename(sstable::component_type::Digest), _sst._components->compression.full_checksum());
    }
}
|
|
|
|
// Best-effort cleanup: if the writer was never finished (e.g. an exception
// aborted the write), close the data file writer and swallow any error —
// destructors must not throw.
sstable_writer::~sstable_writer() {
    if (_writer) {
        try {
            _writer->close().get();
        } catch (...) {
            sstlog.error("sstable_writer failed to close file: {}", std::current_exception());
        }
    }
}
|
|
|
|
// Prepares everything needed to stream mutations into `sst`: writes the
// TOC, creates the data file, sets up the (possibly compressing) data
// writer and the components writer. Runs blocking .get() calls, so it must
// execute inside a seastar::thread.
sstable_writer::sstable_writer(sstable& sst, const schema& s, uint64_t estimated_partitions,
                               const sstable_writer_config& cfg, const io_priority_class& pc)
    : _sst(sst)
    , _schema(s)
    , _pc(pc)
    , _backup(cfg.backup)
    , _leave_unsealed(cfg.leave_unsealed)
{
    _sst.generate_toc(_schema.get_compressor_params().get_compressor(), _schema.bloom_filter_fp_chance());
    _sst.write_toc(_pc);
    _sst.create_data().get();
    // generate_toc() adds a CRC component only for uncompressed sstables,
    // so its absence implies compression.
    _compression_enabled = !_sst.has_component(sstable::component_type::CRC);
    prepare_file_writer();
    _components_writer.emplace(_sst, _schema, *_writer, estimated_partitions, cfg, _pc);
}
|
|
|
|
// Finishes the sstable: seals the components writer, closes the data
// writer (emitting Digest/CRC), writes the remaining components, and —
// unless the caller asked to leave it unsealed — seals the sstable
// (optionally hard-linking it into the backups directory).
void sstable_writer::consume_end_of_stream()
{
    _components_writer->consume_end_of_stream();
    _components_writer = stdx::nullopt;
    finish_file_writer();
    _sst.write_summary(_pc);
    _sst.write_filter(_pc);
    _sst.write_statistics(_pc);
    _sst.write_compression(_pc);
    _sst.write_scylla_metadata(_pc);

    if (!_leave_unsealed) {
        _sst.seal_sstable(_backup).get();
    }
}
|
|
|
|
// Seals the sstable and, if requested, hard-links all its components into
// a "backups" subdirectory of its own directory.
future<> sstable::seal_sstable(bool backup)
{
    return seal_sstable().then([this, backup] {
        if (backup) {
            auto dir = get_dir() + "/backups/";
            return sstable_write_io_check(touch_directory, dir).then([this, dir] {
                return create_links(dir);
            });
        }
        return make_ready_future<>();
    });
}
|
|
|
|
// Factory for a streaming writer over this sstable. Must be called from a
// seastar::thread context (the writer's constructor blocks).
sstable_writer sstable::get_writer(const schema& s, uint64_t estimated_partitions, const sstable_writer_config& cfg, const io_priority_class& pc)
{
    return sstable_writer(*this, s, estimated_partitions, cfg, pc);
}
|
|
|
|
// Writes a full sstable from a mutation stream. Runs the (blocking)
// writer inside a seastar::thread so the consume loop can use .get().
future<> sstable::write_components(::mutation_reader mr,
        uint64_t estimated_partitions, schema_ptr schema, const sstable_writer_config& cfg, const io_priority_class& pc) {
    return seastar::async([this, mr = std::move(mr), estimated_partitions, schema = std::move(schema), cfg, &pc] () mutable {
        auto wr = get_writer(*schema, estimated_partitions, cfg, pc);
        consume_flattened_in_thread(mr, wr);
    });
}
|
|
|
|
// Rebuilds the Summary component by scanning the Index component, for
// sstables that were written without (or lost) their Summary file.
future<> sstable::generate_summary(const io_priority_class& pc) {
    if (_components->summary) {
        // Already loaded/generated; nothing to do.
        return make_ready_future<>();
    }

    sstlog.info("Summary file {} not found. Generating Summary...", filename(sstable::component_type::Summary));
    // Consumer for index entries: samples keys into the summary and tracks
    // the first and last keys seen.
    class summary_generator {
        summary& _summary;
    public:
        std::experimental::optional<key> first_key, last_key;

        summary_generator(summary& s) : _summary(s) {}
        bool should_continue() {
            return true;
        }
        void consume_entry(index_entry&& ie, uint64_t offset) {
            maybe_add_summary_entry(_summary, ie.get_key_bytes(), offset);
            if (!first_key) {
                first_key = key(to_bytes(ie.get_key_bytes()));
            } else {
                // NOTE: last_key stays unset for a single-partition index;
                // seal_summary() then duplicates first_key.
                last_key = key(to_bytes(ie.get_key_bytes()));
            }
        }
    };

    return open_checked_file_dma(_read_error_handler, filename(component_type::Index), open_flags::ro).then([this, &pc] (file index_file) {
        return do_with(std::move(index_file), [this, &pc] (file index_file) {
            return index_file.size().then([this, &pc, index_file] (auto size) {
                // an upper bound. Surely to be less than this.
                auto estimated_partitions = size / sizeof(uint64_t);
                prepare_summary(_components->summary, estimated_partitions, _schema->min_index_interval());

                file_input_stream_options options;
                options.buffer_size = sstable_buffer_size;
                options.io_priority_class = pc;
                auto stream = make_file_input_stream(index_file, 0, size, std::move(options));
                return do_with(summary_generator(_components->summary), [this, &pc, stream = std::move(stream), size] (summary_generator& s) mutable {
                    // ctx is kept alive by the continuations' captures.
                    auto ctx = make_lw_shared<index_consume_entry_context<summary_generator>>(s, std::move(stream), 0, size);
                    return ctx->consume_input(*ctx).finally([ctx] {
                        return ctx->close();
                    }).then([this, ctx, &s] {
                        seal_summary(_components->summary, std::move(s.first_key), std::move(s.last_key));
                    });
                });
            }).then([index_file] () mutable {
                // Close failure is downgraded to a warning + disk-error
                // notification; the summary was already generated.
                return index_file.close().handle_exception([] (auto ep) {
                    sstlog.warn("sstable close index_file failed: {}", ep);
                    general_disk_error();
                });
            });
        });
    });
}
|
|
|
|
uint64_t sstable::data_size() const {
|
|
if (has_component(sstable::component_type::CompressionInfo)) {
|
|
return _components->compression.data_len;
|
|
}
|
|
return _data_file_size;
|
|
}
|
|
|
|
// Total on-disk footprint of all components. Must only be called after the
// size has been computed (hence the assert).
uint64_t sstable::bytes_on_disk() {
    assert(_bytes_on_disk > 0);
    return _bytes_on_disk;
}
|
|
|
|
// Whether this sstable has the given component (per its parsed TOC).
// NOTE(review): the `const` on the bool return value is meaningless but is
// kept to match the out-of-class declaration.
const bool sstable::has_component(component_type f) const {
    return _recognized_components.count(f);
}
|
|
|
|
// Full path of the given component of this sstable.
const sstring sstable::filename(component_type f) const {
    return filename(_dir, _schema->ks_name(), _schema->cf_name(), _version, _generation, _format, f);
}
|
|
|
|
std::vector<sstring> sstable::component_filenames() const {
|
|
std::vector<sstring> res;
|
|
for (auto c : _component_map | boost::adaptors::map_keys) {
|
|
if (has_component(c)) {
|
|
res.emplace_back(filename(c));
|
|
}
|
|
}
|
|
return res;
|
|
}
|
|
|
|
// Full path of this sstable's TOC component.
sstring sstable::toc_filename() const {
    return filename(component_type::TOC);
}
|
|
|
|
// Builds the full path of a component file for the given version:
//   ka: <ks>-<cf>-ka-<generation>-<component>
//   la: la-<generation>-<format>-<component>
const sstring sstable::filename(sstring dir, sstring ks, sstring cf, version_types version, int64_t generation,
                                format_types format, component_type component) {

    // const + at(): the previous operator[] lookup could mutate the shared
    // static map and, for an unknown version, default-construct an empty
    // std::function whose invocation would throw std::bad_function_call.
    // at() fails fast with std::out_of_range instead. The lambdas take the
    // descriptor by const reference to avoid copying it.
    static const std::unordered_map<version_types, std::function<sstring (const entry_descriptor&)>, enum_hash<version_types>> strmap = {
        { sstable::version_types::ka, [] (const entry_descriptor& d) {
            return d.ks + "-" + d.cf + "-" + _version_string.at(d.version) + "-" + to_sstring(d.generation) + "-" + _component_map.at(d.component); }
        },
        { sstable::version_types::la, [] (const entry_descriptor& d) {
            return _version_string.at(d.version) + "-" + to_sstring(d.generation) + "-" + _format_string.at(d.format) + "-" + _component_map.at(d.component); }
        }
    };

    return dir + "/" + strmap.at(version)(entry_descriptor(ks, cf, version, generation, format, component));
}
|
|
|
|
// Same as the component_type overload, but takes the component's file-name
// suffix as a free-form string (used for unrecognized components).
const sstring sstable::filename(sstring dir, sstring ks, sstring cf, version_types version, int64_t generation,
                                format_types format, sstring component) {
    // const + at(): operator[] on the shared static map would insert a null
    // format string for an unknown version; at() throws std::out_of_range
    // instead of passing nullptr to seastar::format.
    static const std::unordered_map<version_types, const char*, enum_hash<version_types>> fmtmap = {
        { sstable::version_types::ka, "{0}-{1}-{2}-{3}-{5}" },
        { sstable::version_types::la, "{2}-{3}-{4}-{5}" }
    };

    return dir + "/" + seastar::format(fmtmap.at(version), ks, cf, _version_string.at(version), to_sstring(generation), _format_string.at(format), component);
}
|
|
|
|
// Returns (type, file-name suffix) for every component of this sstable,
// both recognized ones (with their canonical suffix from _component_map)
// and unrecognized ones (kept under component_type::Unknown with whatever
// name the TOC listed).
std::vector<std::pair<sstable::component_type, sstring>> sstable::all_components() const {
    std::vector<std::pair<component_type, sstring>> all;
    all.reserve(_recognized_components.size() + _unrecognized_components.size());
    for (auto& c : _recognized_components) {
        // emplace_back constructs the pair in place, avoiding the temporary
        // that push_back(std::make_pair(...)) created.
        all.emplace_back(c, _component_map.at(c));
    }
    for (auto& c : _unrecognized_components) {
        all.emplace_back(component_type::Unknown, c);
    }
    return all;
}
|
|
|
|
// Hard-links every component of this sstable into `dir` under the given
// generation. Ordering makes the operation detectable-as-incomplete:
// TemporaryTOC is always first, TOC is always last
future<> sstable::create_links(sstring dir, int64_t generation) const {
    auto dst = sstable::filename(dir, _schema->ks_name(), _schema->cf_name(), _version, generation, _format, component_type::TemporaryTOC);
    return sstable_write_io_check(::link_file, filename(component_type::TOC), dst).then([this, dir] {
        return sstable_write_io_check(sync_directory, dir);
    }).then([this, dir, generation] {
        // FIXME: Should clean already-created links if we failed midway.
        return parallel_for_each(all_components(), [this, dir, generation] (auto p) {
            if (p.first == component_type::TOC) {
                // The TOC only appears once everything else is linked (the
                // TemporaryTOC -> TOC rename below).
                return make_ready_future<>();
            }
            auto src = sstable::filename(_dir, _schema->ks_name(), _schema->cf_name(), _version, _generation, _format, p.second);
            auto dst = sstable::filename(dir, _schema->ks_name(), _schema->cf_name(), _version, generation, _format, p.second);
            return this->sstable_write_io_check(::link_file, std::move(src), std::move(dst));
        });
    }).then([this, dir] {
        return sstable_write_io_check(sync_directory, dir);
    }).then([dir, this, generation] {
        // Renaming TemporaryTOC -> TOC marks the linked copy as complete.
        auto src = sstable::filename(dir, _schema->ks_name(), _schema->cf_name(), _version, generation, _format, component_type::TemporaryTOC);
        auto dst = sstable::filename(dir, _schema->ks_name(), _schema->cf_name(), _version, generation, _format, component_type::TOC);
        return sstable_write_io_check([&] {
            return engine().rename_file(src, dst);
        });
    }).then([this, dir] {
        return sstable_write_io_check(sync_directory, dir);
    });
}
|
|
|
|
// Moves this sstable to a new generation: links all components under the
// new generation, then removes the old files (old TOC first so the old
// copy stops being recognized as a complete sstable), and finally adopts
// the new generation number.
future<> sstable::set_generation(int64_t new_generation) {
    return create_links(_dir, new_generation).then([this] {
        return remove_file(filename(component_type::TOC)).then([this] {
            return sstable_write_io_check(sync_directory, _dir);
        }).then([this] {
            return parallel_for_each(all_components(), [this] (auto p) {
                if (p.first == component_type::TOC) {
                    // Old TOC was already removed above.
                    return make_ready_future<>();
                }
                return remove_file(sstable::filename(_dir, _schema->ks_name(), _schema->cf_name(), _version, _generation, _format, p.second));
            });
        });
    }).then([this, new_generation] {
        return sync_directory(_dir).then([this, new_generation] {
            _generation = new_generation;
        });
    });
}
|
|
|
|
// Parses an sstable file name into an entry_descriptor. Two layouts:
//   la: la-<generation>-<format>-<component>     (ks/cf not encoded in name)
//   ka: <ks>-<cf>-ka-<generation>-<component>    (format is always "big")
// Throws malformed_sstable_exception when the name matches neither.
entry_descriptor entry_descriptor::make_descriptor(sstring fname) {
    static std::regex la("la-(\\d+)-(\\w+)-(.*)");
    static std::regex ka("(\\w+)-(\\w+)-ka-(\\d+)-(.*)");

    std::smatch match;

    sstable::version_types version;

    sstring generation;
    sstring format;
    sstring component;
    sstring ks;
    sstring cf;

    std::string s(fname);
    if (std::regex_match(s, match, la)) {
        // la-format names carry no keyspace/column-family; ks and cf keep
        // their default (empty) values. (Previously this branch declared
        // shadowing locals `ks`/`cf` that had no effect — removed.)
        version = sstable::version_types::la;
        generation = match[1].str();
        format = sstring(match[2].str());
        component = sstring(match[3].str());
    } else if (std::regex_match(s, match, ka)) {
        ks = match[1].str();
        cf = match[2].str();
        version = sstable::version_types::ka;
        format = sstring("big");
        generation = match[3].str();
        component = sstring(match[4].str());
    } else {
        throw malformed_sstable_exception(sprint("invalid version for file %s. Name doesn't match any known version.", fname));
    }
    return entry_descriptor(ks, cf, version, boost::lexical_cast<unsigned long>(generation), sstable::format_from_sstring(format), sstable::component_from_sstring(component));
}
|
|
|
|
// Parses a version string ("ka"/"la") into its enum value; throws if the
// string is not in _version_string.
sstable::version_types sstable::version_from_sstring(sstring &s) {
    return reverse_map(s, _version_string);
}
|
|
|
|
// Parses a format string (e.g. "big") into its enum value; throws if the
// string is not in _format_string.
sstable::format_types sstable::format_from_sstring(sstring &s) {
    return reverse_map(s, _format_string);
}
|
|
|
|
// Parses a component name into its enum value. Unlike the version/format
// parsers, unknown names are tolerated and mapped to Unknown, so TOCs may
// list components this version of Scylla does not understand.
sstable::component_type sstable::component_from_sstring(sstring &s) {
    try {
        return reverse_map(s, _component_map);
    } catch (std::out_of_range&) {
        return component_type::Unknown;
    }
}
|
|
|
|
// Opens an input stream over [pos, pos+len) of the (uncompressed view of
// the) Data component, decompressing transparently when needed.
input_stream<char> sstable::data_stream(uint64_t pos, size_t len, const io_priority_class& pc, lw_shared_ptr<file_input_stream_history> history) {
    file_input_stream_options options;
    options.buffer_size = sstable_buffer_size;
    options.io_priority_class = pc;
    options.read_ahead = 4;
    options.dynamic_adjustments = std::move(history);
    if (_components->compression) {
        return make_compressed_file_input_stream(_data_file, &_components->compression,
                pos, len, std::move(options));
    } else {
        return make_file_input_stream(_data_file, pos, len, std::move(options));
    }
}
|
|
|
|
// Reads exactly `len` bytes at `pos` from the Data component into a
// buffer, closing the temporary stream afterwards.
future<temporary_buffer<char>> sstable::data_read(uint64_t pos, size_t len, const io_priority_class& pc) {
    return do_with(data_stream(pos, len, pc, { }), [len] (auto& stream) {
        return stream.read_exactly(len).finally([&stream] {
            return stream.close();
        });
    });
}
|
|
|
|
// Decorates and caches the sstable's first and last partition keys from
// the summary. Idempotent; throws if the summary keys are empty.
void sstable::set_first_and_last_keys() {
    if (_first && _last) {
        return;
    }
    auto decorate_key = [this] (const char *m, const bytes& value) {
        if (value.empty()) {
            throw std::runtime_error(sprint("%s key of summary of %s is empty", m, get_filename()));
        }
        auto pk = key::from_bytes(value).to_partition_key(*_schema);
        return dht::global_partitioner().decorate_key(*_schema, std::move(pk));
    };
    _first = decorate_key("first", _components->summary.first_key.value);
    _last = decorate_key("last", _components->summary.last_key.value);
}
|
|
|
|
// Undecorated first partition key; requires set_first_and_last_keys().
const partition_key& sstable::get_first_partition_key() const {
    return get_first_decorated_key().key();
}
|
|
|
|
// Undecorated last partition key; requires set_first_and_last_keys().
const partition_key& sstable::get_last_partition_key() const {
    return get_last_decorated_key().key();
}
|
|
|
|
// First decorated key; throws if set_first_and_last_keys() never ran.
const dht::decorated_key& sstable::get_first_decorated_key() const {
    if (!_first) {
        throw std::runtime_error(sprint("first key of %s wasn't set", get_filename()));
    }
    return *_first;
}
|
|
|
|
// Last decorated key; throws if set_first_and_last_keys() never ran.
const dht::decorated_key& sstable::get_last_decorated_key() const {
    if (!_last) {
        throw std::runtime_error(sprint("last key of %s wasn't set", get_filename()));
    }
    return *_last;
}
|
|
|
|
// Three-way comparison of this sstable's first key against another's,
// using the schema's key ordering.
int sstable::compare_by_first_key(const sstable& other) const {
    return get_first_decorated_key().tri_compare(*_schema, other.get_first_decorated_key());
}
|
|
|
|
double sstable::get_compression_ratio() const {
|
|
if (this->has_component(sstable::component_type::CompressionInfo)) {
|
|
return double(_components->compression.compressed_file_length()) / _components->compression.uncompressed_file_length();
|
|
} else {
|
|
return metadata_collector::NO_COMPRESSION_RATIO;
|
|
}
|
|
}
|
|
|
|
// Updates the compaction level in the in-memory Stats metadata only.
// Silently does nothing when Stats is absent; use mutate_sstable_level()
// to persist the change to disk.
void sstable::set_sstable_level(uint32_t new_level) {
    auto entry = _components->statistics.contents.find(metadata_type::Stats);
    if (entry == _components->statistics.contents.end()) {
        return;
    }
    auto& p = entry->second;
    if (!p) {
        throw std::runtime_error("Statistics is malformed");
    }
    stats_metadata& s = *static_cast<stats_metadata *>(p.get());
    sstlog.debug("set level of {} with generation {} from {} to {}", get_filename(), _generation, s.sstable_level, new_level);
    s.sstable_level = new_level;
}
|
|
|
|
// Persistently changes the compaction level: updates the in-memory Stats
// metadata and rewrites the Statistics component on disk. No-op when the
// component/entry is missing or the level is unchanged.
future<> sstable::mutate_sstable_level(uint32_t new_level) {
    if (!has_component(component_type::Statistics)) {
        return make_ready_future<>();
    }

    auto entry = _components->statistics.contents.find(metadata_type::Stats);
    if (entry == _components->statistics.contents.end()) {
        return make_ready_future<>();
    }

    auto& p = entry->second;
    if (!p) {
        throw std::runtime_error("Statistics is malformed");
    }
    stats_metadata& s = *static_cast<stats_metadata *>(p.get());
    if (s.sstable_level == new_level) {
        return make_ready_future<>();
    }

    s.sstable_level = new_level;
    // Technically we don't have to write the whole file again. But the assumption that
    // we will always write sequentially is a powerful one, and this does not merit an
    // exception.
    return seastar::async([this] {
        // This is not part of the standard memtable flush path, but there is no reason
        // to come up with a class just for that. It is used by the snapshot/restore mechanism
        // which comprises mostly hard link creation and this operation at the end + this operation,
        // and also (eventually) by some compaction strategy. In any of the cases, it won't be high
        // priority enough so we will use the default priority
        rewrite_statistics(default_priority_class());
    });
}
|
|
|
|
int sstable::compare_by_max_timestamp(const sstable& other) const {
|
|
auto ts1 = get_stats_metadata().max_timestamp;
|
|
auto ts2 = other.get_stats_metadata().max_timestamp;
|
|
return (ts1 > ts2 ? 1 : (ts1 == ts2 ? 0 : -1));
|
|
}
|
|
|
|
// Closes the index/data files in the background (keeping them alive via
// the captured `save` copies and a background-jobs token) and, when the
// sstable was marked for deletion, kicks off best-effort removal of its
// on-disk files.
sstable::~sstable() {
    if (_index_file) {
        _index_file.close().handle_exception([save = _index_file, op = background_jobs().start()] (auto ep) {
            sstlog.warn("sstable close index_file failed: {}", ep);
            general_disk_error();
        });
    }
    if (_data_file) {
        _data_file.close().handle_exception([save = _data_file, op = background_jobs().start()] (auto ep) {
            sstlog.warn("sstable close data_file failed: {}", ep);
            general_disk_error();
        });
    }

    if (_marked_for_deletion) {
        // We need to delete the on-disk files for this table. Since this is a
        // destructor, we can't wait for this to finish, or return any errors,
        // but just need to do our best. If a deletion fails for some reason we
        // log and ignore this failure, because on startup we'll again try to
        // clean up unused sstables, and because we'll never reuse the same
        // generation number anyway.
        try {
            delete_atomically({sstable_to_delete(filename(component_type::TOC), _shared)}).handle_exception(
                        [op = background_jobs().start()] (std::exception_ptr eptr) {
                try {
                    std::rethrow_exception(eptr);
                } catch (atomic_deletion_cancelled&) {
                    // Expected during shutdown/cancellation; debug only.
                    sstlog.debug("Exception when deleting sstable file: {}", eptr);
                } catch (...) {
                    sstlog.warn("Exception when deleting sstable file: {}", eptr);
                }
            });
        } catch (...) {
            sstlog.warn("Exception when deleting sstable file: {}", std::current_exception());
        }

    }
}
|
|
|
|
// Resolves \p fname to a canonical (absolute, symlink-free) path and
// returns its containing directory.
sstring
dirname(sstring fname) {
    auto canonical_path = boost::filesystem::canonical(std::string(fname));
    return canonical_path.parent_path().string();
}
|
|
|
|
// Flushes the directory containing \p fname, making directory-entry
// operations (creates/renames/removes) durable. I/O errors are routed
// through \p error_handler.
future<>
fsync_directory(const io_error_handler& error_handler, sstring fname) {
    return ::sstable_io_check(error_handler, [&] {
        auto dir = dirname(fname);
        return open_checked_directory(error_handler, dir).then([] (file dir_file) {
            // Keep the directory file alive until the flush resolves.
            return do_with(std::move(dir_file), [] (file& dir_file) {
                return dir_file.flush();
            });
        });
    });
}
|
|
|
|
// Deletes all components of an sstable, identified by the path of its TOC
// file. The deletion is made crash-safe by first renaming the TOC to a
// temporary TOC: if we crash mid-way, the temporary TOC marks the sstable
// as partially deleted so cleanup can be resumed. The temporary TOC is
// removed last, only after every other component is gone and the directory
// has been fsynced.
future<>
remove_by_toc_name(sstring sstable_toc_name, const io_error_handler& error_handler) {
    return seastar::async([sstable_toc_name, &error_handler] () mutable {
        sstring prefix = sstable_toc_name.substr(0, sstable_toc_name.size() - TOC_SUFFIX.size());
        auto new_toc_name = prefix + TEMPORARY_TOC_SUFFIX;
        sstring dir;

        if (sstable_io_check(error_handler, file_exists, sstable_toc_name).get0()) {
            // Fresh deletion: rename TOC -> temporary TOC and persist the
            // rename before removing anything else.
            dir = dirname(sstable_toc_name);
            sstable_io_check(error_handler, rename_file, sstable_toc_name, new_toc_name).get();
            fsync_directory(error_handler, dir).get();
        } else if (sstable_io_check(error_handler, file_exists, new_toc_name).get0()) {
            // An earlier attempt already renamed the TOC; resume from there.
            dir = dirname(new_toc_name);
        } else {
            sstlog.warn("Unable to delete {} because it doesn't exist.", sstable_toc_name);
            return;
        }

        // The TOC lists one component suffix per line; read it to learn
        // which files belong to this sstable.
        auto toc_file = open_checked_file_dma(error_handler, new_toc_name, open_flags::ro).get0();
        auto in = make_file_input_stream(toc_file);
        auto size = toc_file.size().get0();
        auto text = in.read_exactly(size).get0();
        in.close().get();
        std::vector<sstring> components;
        sstring all(text.begin(), text.end());
        boost::split(components, all, boost::is_any_of("\n"));
        // Delete all listed components in parallel.
        parallel_for_each(components, [prefix, &error_handler] (sstring component) mutable {
            if (component.empty()) {
                // eof
                return make_ready_future<>();
            }
            if (component == TOC_SUFFIX) {
                // already deleted
                return make_ready_future<>();
            }
            auto fname = prefix + component;
            return sstable_io_check(error_handler, remove_file, prefix + component).then_wrapped([fname = std::move(fname)] (future<> f) {
                // forgive ENOENT, since the component may not have been written;
                try {
                    f.get();
                } catch (std::system_error& e) {
                    if (!is_system_error_errno(ENOENT)) {
                        throw;
                    }
                    sstlog.debug("Forgiving ENOENT when deleting file {}", fname);
                }
                return make_ready_future<>();
            });
        }).get();
        // Make the component removals durable before dropping the temporary
        // TOC, which marks the deletion as complete.
        fsync_directory(error_handler, dir).get();
        sstable_io_check(error_handler, remove_file, new_toc_name).get();
    });
}
|
|
|
|
// Marks this sstable for deletion both in memory and on disk, by renaming
// its TOC to a temporary TOC (the on-disk "being deleted" marker used by
// remove_by_toc_name()). The rename for a given TOC path is funneled to a
// single shard (hash of the path) so concurrent callers on different
// shards don't race on the same file; a per-shard `renaming` set
// de-duplicates concurrent calls on that shard.
future<>
sstable::mark_for_deletion_on_disk() {
    mark_for_deletion();

    auto toc_name = filename(component_type::TOC);
    auto shard = std::hash<sstring>()(toc_name) % smp::count;

    return smp::submit_to(shard, [this, toc_name] {
        static thread_local std::unordered_set<sstring> renaming;

        if (renaming.count(toc_name) > 0) {
            // Another fiber on this shard is already renaming this TOC.
            return make_ready_future<>();
        }

        renaming.emplace(toc_name);

        return seastar::async([this, toc_name] {
            if (!sstable_write_io_check(file_exists, toc_name).get0()) {
                return; // already gone
            }

            auto dir = dirname(toc_name);
            // Verify the TOC is readable before renaming it; failures are
            // reported through the read error handler.
            auto toc_file = open_checked_file_dma(_read_error_handler, toc_name, open_flags::ro).get0();
            // Close the handle immediately: it was opened only as an access
            // check, and was previously leaked (never closed).
            toc_file.close().get();
            sstring prefix = toc_name.substr(0, toc_name.size() - TOC_SUFFIX.size());
            auto new_toc_name = prefix + TEMPORARY_TOC_SUFFIX;
            sstable_write_io_check(rename_file, toc_name, new_toc_name).get();
            // Persist the rename.
            fsync_directory(_write_error_handler, dir).get();
        }).finally([toc_name] {
            renaming.erase(toc_name);
        });
    });
}
|
|
|
|
// Removes every component of an sstable that was left with a *temporary*
// TOC, i.e. an sstable whose deletion was interrupted (e.g. by a crash).
// Preconditions (asserted): the real TOC must not exist and the temporary
// TOC must. The temporary TOC is deleted last so a crash mid-way leaves a
// state this function can be re-run on.
future<>
sstable::remove_sstable_with_temp_toc(sstring ks, sstring cf, sstring dir, int64_t generation, version_types v, format_types f) {
    return seastar::async([ks, cf, dir, generation, v, f] {
        const io_error_handler& error_handler = sstable_write_error_handler;
        auto toc = sstable_io_check(error_handler, file_exists, filename(dir, ks, cf, v, generation, f, component_type::TOC)).get0();
        // assert that toc doesn't exist for sstable with temporary toc.
        assert(toc == false);

        auto tmptoc = sstable_io_check(error_handler, file_exists, filename(dir, ks, cf, v, generation, f, component_type::TemporaryTOC)).get0();
        // assert that temporary toc exists for this sstable.
        assert(tmptoc == true);

        sstlog.warn("Deleting components of sstable from {}.{} of generation {} that has a temporary TOC", ks, cf, generation);

        // Iterate over every known component type and remove the ones that
        // exist on disk for this generation.
        for (auto& entry : sstable::_component_map) {
            // Skipping TemporaryTOC because it must be the last component to
            // be deleted, and unordered map doesn't guarantee ordering.
            // This is needed because we may end up with a partial delete in
            // event of a power failure.
            // If TemporaryTOC is deleted prematurely and scylla crashes,
            // the subsequent boot would fail because of that generation
            // missing a TOC.
            if (entry.first == component_type::TemporaryTOC) {
                continue;
            }

            auto file_path = filename(dir, ks, cf, v, generation, f, entry.first);
            // Skip component that doesn't exist.
            auto exists = sstable_io_check(error_handler, file_exists, file_path).get0();
            if (!exists) {
                continue;
            }
            sstable_io_check(error_handler, remove_file, file_path).get();
        }
        // Make the component removals durable before dropping the marker.
        fsync_directory(error_handler, dir).get();
        // Removing temporary
        sstable_io_check(error_handler, remove_file, filename(dir, ks, cf, v, generation, f, component_type::TemporaryTOC)).get();
        // Fsync'ing column family dir to guarantee that deletion completed.
        fsync_directory(error_handler, dir).get();
    });
}
|
|
|
|
// Loads the summary (if needed) and returns the closed range
// [first partition key, last partition key] covered by this sstable.
future<range<partition_key>>
sstable::get_sstable_key_range(const schema& s) {
    return read_summary(default_priority_class()).then([this, &s] () mutable {
        set_first_and_last_keys();
        auto key_range = range<partition_key>::make(get_first_partition_key(), get_last_partition_key());
        return make_ready_future<range<partition_key>>(std::move(key_range));
    });
}
|
|
|
|
// Determines which shards own data in this not-yet-loaded sstable by
// first loading the summary and scylla metadata components.
future<std::vector<shard_id>>
sstable::get_owning_shards_from_unloaded() {
    auto summary_f = read_summary(default_priority_class());
    auto metadata_f = read_scylla_metadata(default_priority_class());
    return when_all(std::move(summary_f), std::move(metadata_f)).then(
            [this] (std::tuple<future<>, future<>> loaded) {
        // Propagate any exception from either load.
        std::get<0>(loaded).get();
        std::get<1>(loaded).get();
        set_first_and_last_keys();
        return get_shards_for_this_sstable();
    });
}
|
|
|
|
/**
 * Returns a pair of positions [p1, p2) in the summary file corresponding to entries
 * covered by the specified range, or a disengaged optional if no such pair exists.
 */
stdx::optional<std::pair<uint64_t, uint64_t>> sstable::get_sample_indexes_for_range(const dht::token_range& range) {
    auto entries_size = _components->summary.entries.size();
    // Finds the summary index of the first entry at-or-after (before=true)
    // or strictly after (before=false) the given token, using a synthetic
    // key that sorts before/after all real keys with that token.
    auto search = [this](bool before, const dht::token& token) {
        auto kind = before ? key::kind::before_all_keys : key::kind::after_all_keys;
        key k(kind);
        // Binary search will never return a positive value here (the
        // synthetic key matches no entry exactly), so decode the negative
        // result back into the insertion position.
        return uint64_t((binary_search(_components->summary.entries, k, token) + 1) * -1);
    };
    uint64_t left = 0;
    if (range.start()) {
        left = search(range.start()->is_inclusive(), range.start()->value());
        if (left == entries_size) {
            // left is past the end of the sampling.
            return stdx::nullopt;
        }
    }
    uint64_t right = entries_size;
    if (range.end()) {
        right = search(!range.end()->is_inclusive(), range.end()->value());
        if (right == 0) {
            // The first key is strictly greater than right.
            return stdx::nullopt;
        }
    }
    if (left < right) {
        return stdx::optional<std::pair<uint64_t, uint64_t>>(stdx::in_place_t(), left, right);
    }
    // Empty intersection between the range and the sampled entries.
    return stdx::nullopt;
}
|
|
|
|
// Returns the decorated partition keys of all summary (sample) entries
// falling within \p range, in summary order.
std::vector<dht::decorated_key> sstable::get_key_samples(const schema& s, const dht::token_range& range) {
    std::vector<dht::decorated_key> samples;
    auto indexes = get_sample_indexes_for_range(range);
    if (!indexes) {
        return samples;
    }
    for (auto i = indexes->first; i < indexes->second; ++i) {
        auto pk = _components->summary.entries[i].get_key().to_partition_key(s);
        samples.push_back(dht::global_partitioner().decorate_key(s, std::move(pk)));
    }
    return samples;
}
|
|
|
|
// Estimates how many partition keys of this sstable fall within \p range,
// by counting summary samples in the range and scaling by the effective
// sampling interval. Always returns at least 1.
uint64_t sstable::estimated_keys_for_range(const dht::token_range& range) {
    uint64_t sampled_keys = 0;
    if (auto idx_range = get_sample_indexes_for_range(range)) {
        sampled_keys = idx_range->second - idx_range->first;
    }
    // adjust for the current sampling level
    const auto& hdr = _components->summary.header;
    uint64_t keys_per_sample = (downsampling::BASE_SAMPLING_LEVEL * hdr.min_index_interval) / hdr.sampling_level;
    return std::max(uint64_t(1), sampled_keys * keys_per_sample);
}
|
|
|
|
// Computes the set of shards that own data contained in this sstable.
// Uses the sharding metadata (exact token ranges) when present; otherwise
// falls back to the single [first token, last token] range of the sstable.
std::vector<unsigned>
sstable::get_shards_for_this_sstable() const {
    std::unordered_set<unsigned> shards;
    dht::partition_range_vector token_ranges;
    const auto* sm = _components->scylla_metadata
            ? _components->scylla_metadata->data.get<scylla_metadata_type::Sharding, sharding_metadata>()
            : nullptr;
    if (!sm) {
        // No sharding metadata: cover the sstable's whole token span.
        token_ranges.push_back(dht::partition_range::make(
                dht::ring_position::starting_at(get_first_decorated_key().token()),
                dht::ring_position::ending_at(get_last_decorated_key().token())));
    } else {
        // Convert each on-disk token range to a ring position range,
        // translating bound exclusivity into the adjacent ring position
        // (an exclusive left bound becomes "ending at" that token, etc.).
        auto disk_token_range_to_ring_position_range = [] (const disk_token_range& dtr) {
            auto t1 = dht::token(dht::token::kind::key, managed_bytes(bytes_view(dtr.left.token)));
            auto t2 = dht::token(dht::token::kind::key, managed_bytes(bytes_view(dtr.right.token)));
            return dht::partition_range::make(
                    (dtr.left.exclusive ? dht::ring_position::ending_at : dht::ring_position::starting_at)(std::move(t1)),
                    (dtr.right.exclusive ? dht::ring_position::starting_at : dht::ring_position::ending_at)(std::move(t2)));
        };
        token_ranges = boost::copy_range<dht::partition_range_vector>(
                sm->token_ranges.elements
                | boost::adaptors::transformed(disk_token_range_to_ring_position_range));
    }
    // Walk the ranges through the sharder, collecting every shard that owns
    // a sub-range.
    auto sharder = dht::ring_position_range_vector_sharder(std::move(token_ranges));
    auto rpras = sharder.next(*_schema);
    while (rpras) {
        shards.insert(rpras->shard);
        rpras = sharder.next(*_schema);
    }
    return boost::copy_range<std::vector<unsigned>>(shards);
}
|
|
|
|
// Builds a hashed key for \p key by serializing it into the sstable key
// format for schema \p s and hashing the resulting bytes.
utils::hashed_key sstable::make_hashed_key(const schema& s, const partition_key& key) {
    auto serialized = key::from_partition_key(s, key);
    return utils::make_hashed_key(static_cast<bytes_view>(serialized));
}
|
|
|
|
// Prints a deletion request as "name(shared)" or "name(unshared)".
std::ostream&
operator<<(std::ostream& os, const sstable_to_delete& std) {
    os << std.name;
    os << "(";
    os << (std.shared ? "shared" : "unshared");
    os << ")";
    return os;
}
|
|
|
|
// Removes, in parallel, every sstable whose TOC path is listed in \p tocs.
future<>
delete_sstables(std::vector<sstring> tocs) {
    // FIXME: this needs to be done atomically (using a log file of sstables we intend to delete)
    return parallel_for_each(tocs, [] (sstring toc_name) {
        return remove_by_toc_name(std::move(toc_name));
    });
}
|
|
|
|
// Deletion coordinator, parameterized with delete_sstables as the actual
// removal callback. Declared thread_local, but delete_atomically() below
// always routes requests to shard 0's instance.
static thread_local atomic_deletion_manager g_atomic_deletion_manager(smp::count, delete_sstables);
|
|
|
|
// Submits a batch of sstable deletions to the shard-0 atomic deletion
// manager, recording which shard the request originated from.
future<>
delete_atomically(std::vector<sstable_to_delete> ssts) {
    auto requesting_shard = engine().cpu_id();
    return smp::submit_to(0, [ssts = std::move(ssts), requesting_shard] {
        return g_atomic_deletion_manager.delete_atomically(ssts, requesting_shard);
    });
}
|
|
|
|
// Convenience overload: converts shared_sstable handles into
// (TOC filename, shared?) descriptors and delegates to the
// sstable_to_delete overload above.
future<>
delete_atomically(std::vector<shared_sstable> ssts) {
    std::vector<sstable_to_delete> to_delete;
    to_delete.reserve(ssts.size());
    for (const auto& sst : ssts) {
        to_delete.push_back({sst->toc_filename(), sst->is_shared()});
    }
    return delete_atomically(std::move(to_delete));
}
|
|
|
|
// Cancels pending atomic deletions on this shard's deletion manager;
// cancelled deletions surface as atomic_deletion_cancelled exceptions.
void cancel_atomic_deletions() {
    g_atomic_deletion_manager.cancel_atomic_deletions();
}
|
|
|
|
// Builds the exception message, listing the sstables whose deletion was
// abandoned because the atomic-deletion machinery was cancelled.
atomic_deletion_cancelled::atomic_deletion_cancelled(std::vector<sstring> names)
        : _msg(sprint("atomic deletions cancelled; not deleting %s", names)) {
}
|
|
|
|
const char*
atomic_deletion_cancelled::what() const noexcept {
    // The message is built once in the constructor; the returned pointer
    // stays valid for the lifetime of this exception object.
    return _msg.c_str();
}
|
|
|
|
// Per-shard counters for the shared index list cache (hits/misses/blocks).
thread_local shared_index_lists::stats shared_index_lists::_shard_stats;

// Per-shard registry holding the "sstables" metrics group registered by
// init_metrics().
static thread_local seastar::metrics::metric_groups metrics;
|
|
|
|
// Registers the per-shard "sstables" metrics group, exporting the shared
// index list cache counters as monotonically increasing (derive) metrics.
void init_metrics() {
    namespace sm = seastar::metrics;
    metrics.add_group("sstables", {
        sm::make_derive("index_page_hits", [] { return shared_index_lists::shard_stats().hits; },
            sm::description("Index page requests which could be satisfied without waiting")),
        sm::make_derive("index_page_misses", [] { return shared_index_lists::shard_stats().misses; },
            sm::description("Index page requests which initiated a read from disk")),
        sm::make_derive("index_page_blocks", [] { return shared_index_lists::shard_stats().blocks; },
            sm::description("Index page requests which needed to wait due to page not being loaded yet")),
    });
}
|
|
|
|
}
|