mirror of
https://github.com/scylladb/scylladb.git
synced 2026-04-21 17:10:35 +00:00
maybe_flush_pi_block, which is called for each cell, assumes that
block_first_colname will be empty when the first cell is encountered
for each partition.
This didn't hold after writing a partition which generated no index
entry, because block_first_colname was cleared only when there was any
data written into the promoted index. Fix by always clearing the name.
The effect was that the promoted index entry for the next partition
would be flushed sooner than necessary (still counting since the start
of the previous partition) and with an offset pointing to the start of
the current partition. This will cause a parsing error when such an
sstable is read through a promoted index entry, because the offset is
assumed to point to a cell, not to the partition start.
Fixes #1567
Message-Id: <1470909915-4400-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit f1c2481040)
2443 lines
96 KiB
C++
2443 lines
96 KiB
C++
/*
|
|
* Copyright (C) 2015 ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* This file is part of Scylla.
|
|
*
|
|
* Scylla is free software: you can redistribute it and/or modify
|
|
* it under the terms of the GNU Affero General Public License as published by
|
|
* the Free Software Foundation, either version 3 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* Scylla is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
#include "log.hh"
|
|
#include <vector>
|
|
#include <typeinfo>
|
|
#include <limits>
|
|
#include "core/future.hh"
|
|
#include "core/future-util.hh"
|
|
#include "core/sstring.hh"
|
|
#include "core/fstream.hh"
|
|
#include "core/shared_ptr.hh"
|
|
#include "core/do_with.hh"
|
|
#include "core/thread.hh"
|
|
#include <seastar/core/shared_future.hh>
|
|
#include <seastar/core/byteorder.hh>
|
|
#include <iterator>
|
|
|
|
#include "types.hh"
|
|
#include "sstables.hh"
|
|
#include "compress.hh"
|
|
#include "unimplemented.hh"
|
|
#include "index_reader.hh"
|
|
#include "remove.hh"
|
|
#include "memtable.hh"
|
|
#include "range.hh"
|
|
#include "downsampling.hh"
|
|
#include <boost/filesystem/operations.hpp>
|
|
#include <boost/algorithm/string.hpp>
|
|
#include <boost/range/adaptor/map.hpp>
|
|
#include <boost/range/adaptor/transformed.hpp>
|
|
#include <boost/range/algorithm_ext/insert.hpp>
|
|
#include <boost/range/algorithm_ext/push_back.hpp>
|
|
#include <boost/range/algorithm/set_algorithm.hpp>
|
|
#include <regex>
|
|
#include <core/align.hh>
|
|
#include "utils/phased_barrier.hh"
|
|
#include "range_tombstone_list.hh"
|
|
|
|
#include "checked-file-impl.hh"
|
|
#include "disk-error-handler.hh"
|
|
#include "service/storage_service.hh"
|
|
|
|
// Thread-local error signal handed to the checked-file helpers on sstable read paths (e.g. read_toc()).
thread_local disk_error_signal_type sstable_read_error;
|
|
// Thread-local error signal handed to the checked-file helpers on sstable write paths (e.g. write_toc(), seal_sstable()).
thread_local disk_error_signal_type sstable_write_error;
|
|
|
|
namespace sstables {
|
|
|
|
// Logger used throughout this file; constructed with the name "sstable".
logging::logger sstlog("sstable");
|
|
|
|
// Opens an sstable component file via open_checked_file_dma().
// If opening fails, the error is logged and the same exception is propagated
// to the caller through the returned future.
future<file> new_sstable_component_file(disk_error_signal_type& signal, sstring name, open_flags flags) {
    auto log_and_forward = [name] (auto eptr) {
        sstlog.error("Could not create SSTable component {}. Found exception: {}", name, eptr);
        return make_exception_future<file>(eptr);
    };
    return open_checked_file_dma(signal, name, flags).handle_exception(std::move(log_and_forward));
}
|
|
|
|
// Same as the three-argument overload, but forwards explicit file_open_options
// to open_checked_file_dma(). Failures are logged and then re-propagated.
future<file> new_sstable_component_file(disk_error_signal_type& signal, sstring name, open_flags flags, file_open_options options) {
    auto log_and_forward = [name] (auto eptr) {
        sstlog.error("Could not create SSTable component {}. Found exception: {}", name, eptr);
        return make_exception_future<file>(eptr);
    };
    return open_checked_file_dma(signal, name, flags, options).handle_exception(std::move(log_and_forward));
}
|
|
|
|
// Returns the calling thread's phased_barrier instance.
// The barrier is a function-local thread_local, so it is created on first use.
static utils::phased_barrier& background_jobs() {
    static thread_local utils::phased_barrier barrier;
    return barrier;
}
|
|
|
|
// Advances this thread's background-jobs barrier and resolves once
// advance_and_await() completes. Progress is traced at debug level.
future<> await_background_jobs() {
    sstlog.debug("Waiting for background jobs");
    auto log_completion = [] {
        sstlog.debug("Waiting done");
    };
    return background_jobs().advance_and_await().finally(std::move(log_completion));
}
|
|
|
|
// Runs await_background_jobs() through smp::invoke_on_all().
future<> await_background_jobs_on_all_shards() {
    auto wait_locally = [] {
        return await_background_jobs();
    };
    return smp::invoke_on_all(std::move(wait_locally));
}
|
|
|
|
class random_access_reader {
|
|
input_stream<char> _in;
|
|
protected:
|
|
virtual input_stream<char> open_at(uint64_t pos) = 0;
|
|
public:
|
|
future<temporary_buffer<char>> read_exactly(size_t n) {
|
|
return _in.read_exactly(n);
|
|
}
|
|
void seek(uint64_t pos) {
|
|
_in = open_at(pos);
|
|
}
|
|
bool eof() { return _in.eof(); }
|
|
virtual future<> close() {
|
|
return make_ready_future<>();
|
|
// FIXME: return _in.close();
|
|
}
|
|
virtual ~random_access_reader() { }
|
|
};
|
|
|
|
class file_random_access_reader : public random_access_reader {
|
|
file _file;
|
|
size_t _buffer_size;
|
|
public:
|
|
virtual input_stream<char> open_at(uint64_t pos) override {
|
|
file_input_stream_options options;
|
|
options.buffer_size = _buffer_size;
|
|
|
|
return make_file_input_stream(_file, pos, std::move(options));
|
|
}
|
|
explicit file_random_access_reader(file f, size_t buffer_size = 8192)
|
|
: _file(std::move(f)), _buffer_size(buffer_size)
|
|
{
|
|
seek(0);
|
|
}
|
|
virtual future<> close() override {
|
|
return random_access_reader::close().then([this] {
|
|
return _file.close().handle_exception([save = _file] (auto ep) {
|
|
sstlog.warn("sstable close failed: {}", ep);
|
|
general_disk_error();
|
|
});
|
|
});
|
|
}
|
|
};
|
|
|
|
std::unordered_map<sstable::version_types, sstring, enum_hash<sstable::version_types>> sstable::_version_string = {
|
|
{ sstable::version_types::ka , "ka" },
|
|
{ sstable::version_types::la , "la" }
|
|
};
|
|
|
|
std::unordered_map<sstable::format_types, sstring, enum_hash<sstable::format_types>> sstable::_format_string = {
|
|
{ sstable::format_types::big , "big" }
|
|
};
|
|
|
|
static const sstring TOC_SUFFIX = "TOC.txt";
|
|
static const sstring TEMPORARY_TOC_SUFFIX = "TOC.txt.tmp";
|
|
|
|
// FIXME: this should be version-dependent
|
|
std::unordered_map<sstable::component_type, sstring, enum_hash<sstable::component_type>> sstable::_component_map = {
|
|
{ component_type::Index, "Index.db"},
|
|
{ component_type::CompressionInfo, "CompressionInfo.db" },
|
|
{ component_type::Data, "Data.db" },
|
|
{ component_type::TOC, TOC_SUFFIX },
|
|
{ component_type::Summary, "Summary.db" },
|
|
{ component_type::Digest, "Digest.sha1" },
|
|
{ component_type::CRC, "CRC.db" },
|
|
{ component_type::Filter, "Filter.db" },
|
|
{ component_type::Statistics, "Statistics.db" },
|
|
{ component_type::TemporaryTOC, TEMPORARY_TOC_SUFFIX },
|
|
{ component_type::TemporaryStatistics, "Statistics.db.tmp" },
|
|
};
|
|
|
|
// Looks up the key whose mapped value equals `value` by scanning the whole map.
//
// The linear scan is acceptable only while the maps stay small and lookups
// are infrequent. If that changes, it would be advisable to build a full
// reverse mapping instead, even if it is done at runtime.
//
// Throws std::out_of_range when no entry carries `value`.
template <typename Map>
static typename Map::key_type reverse_map(const typename Map::mapped_type& value, Map& map) {
    auto it = map.begin();
    while (it != map.end() && !(it->second == value)) {
        ++it;
    }
    if (it == map.end()) {
        throw std::out_of_range("unable to reverse map");
    }
    return it->first;
}
|
|
|
|
// This should be used every time we use read_exactly directly.
|
|
//
|
|
// read_exactly is a lot more convenient of an interface to use, because we'll
|
|
// be parsing known quantities.
|
|
//
|
|
// However, anything other than the size we have asked for, is certainly a bug,
|
|
// and we need to do something about it.
|
|
static void check_buf_size(temporary_buffer<char>& buf, size_t expected) {
|
|
if (buf.size() < expected) {
|
|
throw bufsize_mismatch_exception(buf.size(), expected);
|
|
}
|
|
}
|
|
|
|
// Assigns `from` to `to` and verifies that the value survived the conversion.
// Throws std::overflow_error if the stored value no longer compares equal to
// the source. Note that `to` has already been overwritten when that happens.
template <typename T, typename U>
static void check_truncate_and_assign(T& to, const U from) {
    static_assert(std::is_integral<T>::value && std::is_integral<U>::value, "T and U must be integral");
    to = from;
    const bool lossless = (to == from);
    if (!lossless) {
        throw std::overflow_error("assigning U to T caused an overflow");
    }
}
|
|
|
|
// Base parser, parses an integer type
|
|
template <typename T>
|
|
typename std::enable_if_t<std::is_integral<T>::value, void>
|
|
read_integer(temporary_buffer<char>& buf, T& i) {
|
|
auto *nr = reinterpret_cast<const net::packed<T> *>(buf.get());
|
|
i = net::ntoh(*nr);
|
|
}
|
|
|
|
template <typename T>
|
|
typename std::enable_if_t<std::is_integral<T>::value, future<>>
|
|
parse(random_access_reader& in, T& i) {
|
|
return in.read_exactly(sizeof(T)).then([&i] (auto buf) {
|
|
check_buf_size(buf, sizeof(T));
|
|
|
|
read_integer(buf, i);
|
|
return make_ready_future<>();
|
|
});
|
|
}
|
|
|
|
template <typename T>
|
|
inline typename std::enable_if_t<std::is_integral<T>::value, void>
|
|
write(file_writer& out, T i) {
|
|
auto *nr = reinterpret_cast<const net::packed<T> *>(&i);
|
|
i = net::hton(*nr);
|
|
auto p = reinterpret_cast<const char*>(&i);
|
|
out.write(p, sizeof(T)).get();
|
|
}
|
|
|
|
template <typename T>
|
|
typename std::enable_if_t<std::is_enum<T>::value, future<>>
|
|
parse(random_access_reader& in, T& i) {
|
|
return parse(in, reinterpret_cast<typename std::underlying_type<T>::type&>(i));
|
|
}
|
|
|
|
template <typename T>
|
|
inline typename std::enable_if_t<std::is_enum<T>::value, void>
|
|
write(file_writer& out, T i) {
|
|
write(out, static_cast<typename std::underlying_type<T>::type>(i));
|
|
}
|
|
|
|
// Parses a bool by reading a single byte directly into its storage.
future<> parse(random_access_reader& in, bool& i) {
    auto& as_byte = reinterpret_cast<uint8_t&>(i);
    return parse(in, as_byte);
}
|
|
|
|
// Writes a bool as a single uint8_t.
inline void write(file_writer& out, bool i) {
    const auto as_byte = static_cast<uint8_t>(i);
    write(out, as_byte);
}
|
|
|
|
// Reinterprets the object representation of `f` as a value of type To.
// Both types must have the same size (enforced at compile time).
template <typename To, typename From>
static inline To convert(From f) {
    static_assert(sizeof(To) == sizeof(From), "Sizes must match");
    union {
        From source;
        To result;
    } pun;

    pun.source = f;
    return pun.result;
}
|
|
|
|
// Reads sizeof(double) bytes, converts them with net::ntoh() as an
// unsigned long and reinterprets the result as a double.
future<> parse(random_access_reader& in, double& d) {
    return in.read_exactly(sizeof(double)).then([&d] (temporary_buffer<char> raw) {
        check_buf_size(raw, sizeof(double));

        const auto& wire = *reinterpret_cast<const net::packed<unsigned long> *>(raw.get());
        d = convert<double>(net::ntoh(wire));
    });
}
|
|
|
|
// Writes the bit pattern of `d` after converting it with net::hton() as an
// unsigned long. Blocks on the write via future::get().
inline void write(file_writer& out, double d) {
    const auto& native = *reinterpret_cast<const net::packed<unsigned long> *>(&d);
    auto wire = net::hton(native);
    out.write(reinterpret_cast<const char*>(&wire), sizeof(unsigned long)).get();
}
|
|
|
|
template <typename T>
|
|
future<> parse(random_access_reader& in, T& len, bytes& s) {
|
|
return in.read_exactly(len).then([&s, len] (auto buf) {
|
|
check_buf_size(buf, len);
|
|
// Likely a different type of char. Most bufs are unsigned, whereas the bytes type is signed.
|
|
s = bytes(reinterpret_cast<const bytes::value_type *>(buf.get()), len);
|
|
});
|
|
}
|
|
|
|
// Writes the raw contents of `s`; blocks until the write has completed.
inline void write(file_writer& out, bytes& s) {
    auto pending = out.write(s);
    pending.get();
}
|
|
|
|
// Writes the bytes referenced by the view; blocks until the write has completed.
inline void write(file_writer& out, bytes_view s) {
    auto first = reinterpret_cast<const char*>(s.data());
    out.write(first, s.size()).get();
}
|
|
|
|
// Writes every fragment of the bytes_ostream in iteration order.
inline void write(file_writer& out, bytes_ostream s) {
    for (auto it = s.begin(); it != s.end(); ++it) {
        bytes_view fragment = *it;
        write(out, fragment);
    }
}
|
|
|
|
// All composite parsers must come after this
|
|
template<typename First, typename... Rest>
|
|
future<> parse(random_access_reader& in, First& first, Rest&&... rest) {
|
|
return parse(in, first).then([&in, &rest...] {
|
|
return parse(in, std::forward<Rest>(rest)...);
|
|
});
|
|
}
|
|
|
|
template<typename First, typename... Rest>
|
|
inline void write(file_writer& out, First& first, Rest&&... rest) {
|
|
write(out, first);
|
|
write(out, std::forward<Rest>(rest)...);
|
|
}
|
|
|
|
// Intended to be used for a type that describes itself through describe_type().
|
|
template <class T>
|
|
typename std::enable_if_t<!std::is_integral<T>::value && !std::is_enum<T>::value, future<>>
|
|
parse(random_access_reader& in, T& t) {
|
|
return t.describe_type([&in] (auto&&... what) -> future<> {
|
|
return parse(in, what...);
|
|
});
|
|
}
|
|
|
|
template <class T>
|
|
inline typename std::enable_if_t<!std::is_integral<T>::value && !std::is_enum<T>::value, void>
|
|
write(file_writer& out, T& t) {
|
|
t.describe_type([&out] (auto&&... what) -> void {
|
|
write(out, std::forward<decltype(what)>(what)...);
|
|
});
|
|
}
|
|
|
|
// For all types that take a size, we provide a template that takes the type
|
|
// alone, and another, separate one, that takes a size parameter as well, of
|
|
// type Size. This is because although most of the time the size and the data
|
|
// are contiguous, it is not always the case. So we want to have the
|
|
// flexibility of parsing them separately.
|
|
template <typename Size>
|
|
future<> parse(random_access_reader& in, disk_string<Size>& s) {
|
|
auto len = std::make_unique<Size>();
|
|
auto f = parse(in, *len);
|
|
return f.then([&in, &s, len = std::move(len)] {
|
|
return parse(in, *len, s.value);
|
|
});
|
|
}
|
|
|
|
template <typename Size>
|
|
inline void write(file_writer& out, disk_string<Size>& s) {
|
|
Size len = 0;
|
|
check_truncate_and_assign(len, s.value.size());
|
|
write(out, len);
|
|
write(out, s.value);
|
|
}
|
|
|
|
template <typename Size>
|
|
inline void write(file_writer& out, disk_string_view<Size>& s) {
|
|
Size len;
|
|
check_truncate_and_assign(len, s.value.size());
|
|
write(out, len, s.value);
|
|
}
|
|
|
|
// We cannot simply read the whole array at once, because we don't know its
|
|
// full size. We know the number of elements, but if we are talking about
|
|
// disk_strings, for instance, we have no idea how much of the stream each
|
|
// element will take.
|
|
//
|
|
// Sometimes we do know the size, like the case of integers. There, all we have
|
|
// to do is to convert each member because they are all stored big endian.
|
|
// We'll offer a specialization for that case below.
|
|
template <typename Size, typename Members>
|
|
typename std::enable_if_t<!std::is_integral<Members>::value, future<>>
|
|
parse(random_access_reader& in, Size& len, std::deque<Members>& arr) {
|
|
|
|
auto count = make_lw_shared<size_t>(0);
|
|
auto eoarr = [count, len] { return *count == len; };
|
|
|
|
return do_until(eoarr, [count, &in, &arr] {
|
|
return parse(in, arr[(*count)++]);
|
|
});
|
|
}
|
|
|
|
template <typename Size, typename Members>
|
|
typename std::enable_if_t<std::is_integral<Members>::value, future<>>
|
|
parse(random_access_reader& in, Size& len, std::deque<Members>& arr) {
|
|
auto done = make_lw_shared<size_t>(0);
|
|
return repeat([&in, &len, &arr, done] {
|
|
auto now = std::min(len - *done, 100000 / sizeof(Members));
|
|
return in.read_exactly(now * sizeof(Members)).then([&arr, len, now, done] (auto buf) {
|
|
check_buf_size(buf, now * sizeof(Members));
|
|
|
|
auto *nr = reinterpret_cast<const net::packed<Members> *>(buf.get());
|
|
for (size_t i = 0; i < now; ++i) {
|
|
arr[*done + i] = net::ntoh(nr[i]);
|
|
}
|
|
*done += now;
|
|
return make_ready_future<stop_iteration>(*done == len ? stop_iteration::yes : stop_iteration::no);
|
|
});
|
|
});
|
|
}
|
|
|
|
// We resize the array here, before we pass it to the integer / non-integer
|
|
// specializations
|
|
template <typename Size, typename Members>
|
|
future<> parse(random_access_reader& in, disk_array<Size, Members>& arr) {
|
|
auto len = make_lw_shared<Size>();
|
|
auto f = parse(in, *len);
|
|
return f.then([&in, &arr, len] {
|
|
arr.elements.resize(*len);
|
|
return parse(in, *len, arr.elements);
|
|
}).finally([len] {});
|
|
}
|
|
|
|
template <typename Members>
|
|
inline typename std::enable_if_t<!std::is_integral<Members>::value, void>
|
|
write(file_writer& out, std::deque<Members>& arr) {
|
|
for (auto& a : arr) {
|
|
write(out, a);
|
|
}
|
|
}
|
|
|
|
template <typename Members>
|
|
inline typename std::enable_if_t<std::is_integral<Members>::value, void>
|
|
write(file_writer& out, std::deque<Members>& arr) {
|
|
std::vector<Members> tmp;
|
|
size_t per_loop = 100000 / sizeof(Members);
|
|
tmp.resize(per_loop);
|
|
size_t idx = 0;
|
|
while (idx != arr.size()) {
|
|
auto now = std::min(arr.size() - idx, per_loop);
|
|
// copy arr into tmp converting each entry into big-endian representation.
|
|
auto nr = arr.begin() + idx;
|
|
for (size_t i = 0; i < now; i++) {
|
|
tmp[i] = net::hton(nr[i]);
|
|
}
|
|
auto p = reinterpret_cast<const char*>(tmp.data());
|
|
auto bytes = now * sizeof(Members);
|
|
out.write(p, bytes).get();
|
|
idx += now;
|
|
}
|
|
}
|
|
|
|
template <typename Size, typename Members>
|
|
inline void write(file_writer& out, disk_array<Size, Members>& arr) {
|
|
Size len = 0;
|
|
check_truncate_and_assign(len, arr.elements.size());
|
|
write(out, len);
|
|
write(out, arr.elements);
|
|
}
|
|
|
|
template <typename Size, typename Key, typename Value>
|
|
future<> parse(random_access_reader& in, Size& len, std::unordered_map<Key, Value>& map) {
|
|
return do_with(Size(), [&in, len, &map] (Size& count) {
|
|
auto eos = [len, &count] { return len == count++; };
|
|
return do_until(eos, [len, &in, &map] {
|
|
struct kv {
|
|
Key key;
|
|
Value value;
|
|
};
|
|
|
|
return do_with(kv(), [&in, &map] (auto& el) {
|
|
return parse(in, el.key, el.value).then([&el, &map] {
|
|
map.emplace(el.key, el.value);
|
|
});
|
|
});
|
|
});
|
|
});
|
|
}
|
|
|
|
template <typename Size, typename Key, typename Value>
|
|
future<> parse(random_access_reader& in, disk_hash<Size, Key, Value>& h) {
|
|
auto w = std::make_unique<Size>();
|
|
auto f = parse(in, *w);
|
|
return f.then([&in, &h, w = std::move(w)] {
|
|
return parse(in, *w, h.map);
|
|
});
|
|
}
|
|
|
|
template <typename Key, typename Value>
|
|
inline void write(file_writer& out, std::unordered_map<Key, Value>& map) {
|
|
for (auto& val: map) {
|
|
write(out, val.first, val.second);
|
|
};
|
|
}
|
|
|
|
template <typename Size, typename Key, typename Value>
|
|
inline void write(file_writer& out, disk_hash<Size, Key, Value>& h) {
|
|
Size len = 0;
|
|
check_truncate_and_assign(len, h.map.size());
|
|
write(out, len);
|
|
write(out, h.map);
|
|
}
|
|
|
|
// Parses a Summary component into `s`.
//
// Steps, in order:
//   1. read the header fields;
//   2. read header.size position values and append header.memory_size as a
//      temporary end marker;
//   3. seek past the header and memory_size bytes and read first_key / last_key;
//   4. seek back to the first entry and read each entry, whose size is the
//      distance between two consecutive positions;
//   5. drop the temporary end marker.
//
// Throws malformed_sstable_exception (through the returned future) when an
// entry is too small to hold the 8-byte position that follows the key.
// `in` and `s` must stay alive until the returned future resolves.
future<> parse(random_access_reader& in, summary& s) {
    using pos_type = typename decltype(summary::positions)::value_type;

    return parse(in, s.header.min_index_interval,
                     s.header.size,
                     s.header.memory_size,
                     s.header.sampling_level,
                     s.header.size_at_full_sampling).then([&in, &s] {
        return in.read_exactly(s.header.size * sizeof(pos_type)).then([&in, &s] (auto buf) {
            auto len = s.header.size * sizeof(pos_type);
            check_buf_size(buf, len);

            s.entries.resize(s.header.size);

            auto *nr = reinterpret_cast<const pos_type *>(buf.get());
            s.positions = std::deque<pos_type>(nr, nr + s.header.size);

            // Since the keys in the index are not sized, we need to calculate
            // the start position of the index i+1 to determine the boundaries
            // of index i. The "memory_size" field in the header determines the
            // total memory used by the map, so if we push it to the vector, we
            // can guarantee that no conditionals are used, and we can always
            // query the position of the "next" index.
            s.positions.push_back(s.header.memory_size);
        }).then([&in, &s] {
            in.seek(sizeof(summary::header) + s.header.memory_size);
            return parse(in, s.first_key, s.last_key);
        }).then([&in, &s] {

            in.seek(s.positions[0] + sizeof(summary::header));

            assert(s.positions.size() == (s.entries.size() + 1));

            auto idx = make_lw_shared<size_t>(0);
            return do_for_each(s.entries.begin(), s.entries.end(), [idx, &in, &s] (auto& entry) {
                auto pos = s.positions[(*idx)++];
                auto next = s.positions[*idx];

                // Every entry is a key followed by an 8-byte position. A corrupt
                // file with decreasing positions or a shorter entry would make
                // the key size below wrap around and read out of bounds.
                if (next < pos || next - pos < sizeof(uint64_t)) {
                    throw malformed_sstable_exception("Summary entry is too small to hold a key and a position");
                }

                auto entrysize = next - pos;

                return in.read_exactly(entrysize).then([&entry, entrysize] (auto buf) {
                    check_buf_size(buf, entrysize);

                    auto keysize = entrysize - 8;
                    entry.key = bytes(reinterpret_cast<const int8_t*>(buf.get()), keysize);
                    buf.trim_front(keysize);
                    // FIXME: This is a le read. We should make this explicit
                    entry.position = *(reinterpret_cast<const net::packed<uint64_t> *>(buf.get()));

                    return make_ready_future<>();
                });
            }).then([&s] {
                // Delete last element which isn't part of the on-disk format.
                s.positions.pop_back();
            });
        });
    });
}
|
|
|
|
// Writes one summary entry: the key bytes followed by the 8-byte position
// exactly as it is laid out in memory.
//
// FIXME: summary entry is supposedly written in memory order, but that
// would prevent portability of summary file between machines of different
// endianness. We can treat it as little endian to preserve portability.
inline void write(file_writer& out, summary_entry& entry) {
    write(out, entry.key);
    out.write(reinterpret_cast<const char*>(&entry.position), sizeof(uint64_t)).get();
}
|
|
|
|
// Writes a Summary component: header fields, positions, entries, and finally
// the first and last keys.
inline void write(file_writer& out, summary& s) {
    auto& hdr = s.header;
    // NOTE: positions and entries must be stored in NATIVE BYTE ORDER, not BIG-ENDIAN.
    write(out, hdr.min_index_interval,
               hdr.size,
               hdr.memory_size,
               hdr.sampling_level,
               hdr.size_at_full_sampling);
    for (auto it = s.positions.begin(); it != s.positions.end(); ++it) {
        auto&& position = *it;
        out.write(reinterpret_cast<const char*>(&position), sizeof(position)).get();
    }
    write(out, s.entries);
    write(out, s.first_key, s.last_key);
}
|
|
|
|
// Returns a ready future referring to the i-th in-memory summary entry.
// Throws std::out_of_range synchronously when `i` is not a valid entry index.
future<summary_entry&> sstable::read_summary_entry(size_t i) {
    auto& entries = _summary.entries;
    if (i < entries.size()) {
        return make_ready_future<summary_entry&>(entries[i]);
    }
    // The last one is the boundary marker
    throw std::out_of_range(sprint("Invalid Summary index: %ld", i));
}
|
|
|
|
// Parses a deletion_time: local_deletion_time first, then marked_for_delete_at.
future<> parse(random_access_reader& in, deletion_time& d) {
    auto& local_time = d.local_deletion_time;
    auto& marked_at = d.marked_for_delete_at;
    return parse(in, local_time, marked_at);
}
|
|
|
|
template <typename Child>
|
|
future<> parse(random_access_reader& in, std::unique_ptr<metadata>& p) {
|
|
p.reset(new Child);
|
|
return parse(in, *static_cast<Child *>(p.get()));
|
|
}
|
|
|
|
template <typename Child>
|
|
inline void write(file_writer& out, std::unique_ptr<metadata>& p) {
|
|
write(out, *static_cast<Child *>(p.get()));
|
|
}
|
|
|
|
// Parses a Statistics component: first the type -> offset table, then, for
// every table entry, the metadata object found at that offset. Unknown
// metadata types are logged and skipped.
future<> parse(random_access_reader& in, statistics& s) {
    return parse(in, s.hash).then([&in, &s] {
        return do_for_each(s.hash.map.begin(), s.hash.map.end(), [&in, &s] (auto entry) mutable {
            const auto type = entry.first;
            in.seek(entry.second);

            if (type == metadata_type::Validation) {
                return parse<validation_metadata>(in, s.contents[type]);
            }
            if (type == metadata_type::Compaction) {
                return parse<compaction_metadata>(in, s.contents[type]);
            }
            if (type == metadata_type::Stats) {
                return parse<stats_metadata>(in, s.contents[type]);
            }
            sstlog.warn("Invalid metadata type at Statistics file: {} ", int(type));
            return make_ready_future<>();
        });
    });
}
|
|
|
|
// Writes a Statistics component: the type -> offset table followed by the
// metadata objects in ascending offset order. Writing stops at the first
// unknown metadata type.
inline void write(file_writer& out, statistics& s) {
    write(out, s.hash);

    struct kv {
        metadata_type key;
        uint32_t value;
    };
    // The output stream cannot do random writes, so the table entries are
    // ordered by file offset before their contents are emitted.
    std::vector<kv> by_offset;
    by_offset.reserve(s.hash.map.size());
    for (auto entry : s.hash.map) {
        by_offset.push_back(kv{ entry.first, entry.second });
    }
    std::sort(by_offset.begin(), by_offset.end(), [] (kv lhs, kv rhs) { return lhs.value < rhs.value; });

    for (auto& item : by_offset) {
        switch (item.key) {
        case metadata_type::Validation:
            write<validation_metadata>(out, s.contents[item.key]);
            break;
        case metadata_type::Compaction:
            write<compaction_metadata>(out, s.contents[item.key]);
            break;
        case metadata_type::Stats:
            write<stats_metadata>(out, s.contents[item.key]);
            break;
        default:
            sstlog.warn("Invalid metadata type at Statistics file: {} ", int(item.key));
            return; // FIXME: should throw
        }
    }
}
|
|
|
|
// Parses an estimated histogram.
//
// On disk: a uint32 count `length`, followed by `length` pairs of
// (offset, bucket) 64-bit values. The histogram holds `length` buckets but
// only `length - 1` offsets; the offset of the first pair is a duplicate of
// the second pair's offset slot and is therefore not stored.
//
// Throws malformed_sstable_exception (through the returned future) when the
// count is zero. `in` and `eh` must stay alive until the future resolves.
future<> parse(random_access_reader& in, estimated_histogram& eh) {
    auto len = std::make_unique<uint32_t>();

    auto f = parse(in, *len);
    return f.then([&in, &eh, len = std::move(len)] {
        uint32_t length = *len;

        if (length == 0) {
            throw malformed_sstable_exception("Estimated histogram with zero size found. Can't continue!");
        }
        eh.bucket_offsets.resize(length - 1);
        eh.buckets.resize(length);

        auto type_size = sizeof(uint64_t) * 2;
        return in.read_exactly(length * type_size).then([&eh, length, type_size] (auto buf) {
            check_buf_size(buf, length * type_size);

            auto *nr = reinterpret_cast<const net::packed<uint64_t> *>(buf.get());
            size_t j = 0;
            for (size_t i = 0; i < length; ++i) {
                // Always consume the offset, but store it only for i > 0.
                // Storing pair 0 into slot 0 was redundant (pair 1 overwrites
                // it) and wrote out of bounds when length == 1, because
                // bucket_offsets is empty in that case.
                auto offset = net::ntoh(nr[j++]);
                if (i > 0) {
                    eh.bucket_offsets[i - 1] = offset;
                }
                eh.buckets[i] = net::ntoh(nr[j++]);
            }
            return make_ready_future<>();
        });
    });
}
|
|
|
|
// Writes an estimated histogram: a uint32 bucket count followed by one
// (offset, bucket) pair of 64-bit values per bucket, each converted with
// net::hton(). The first pair repeats offset 0.
// Throws std::overflow_error if the bucket count does not fit in uint32_t.
//
// NOTE(review): reads bucket_offsets[0] whenever buckets is non-empty, so
// this assumes bucket_offsets is non-empty in that case - confirm with callers.
inline void write(file_writer& out, estimated_histogram& eh) {
    const auto bucket_count = eh.buckets.size();

    uint32_t len = 0;
    check_truncate_and_assign(len, bucket_count);
    write(out, len);

    struct element {
        uint64_t offsets;
        uint64_t buckets;
    };
    std::vector<element> pairs(bucket_count);

    auto *raw_offsets = reinterpret_cast<const net::packed<uint64_t> *>(eh.bucket_offsets.data());
    auto *raw_buckets = reinterpret_cast<const net::packed<uint64_t> *>(eh.buckets.data());
    for (size_t i = 0; i != bucket_count; ++i) {
        const size_t offset_idx = (i > 0) ? i - 1 : 0;
        pairs[i].offsets = net::hton(raw_offsets[offset_idx]);
        pairs[i].buckets = net::hton(raw_buckets[i]);
    }

    out.write(reinterpret_cast<const char*>(pairs.data()), pairs.size() * sizeof(element)).get();
}
|
|
|
|
// This is small enough, and well-defined. Easier to just read it all
|
|
// at once
|
|
future<> sstable::read_toc() {
|
|
if (_components.size()) {
|
|
return make_ready_future<>();
|
|
}
|
|
|
|
auto file_path = filename(sstable::component_type::TOC);
|
|
|
|
sstlog.debug("Reading TOC file {} ", file_path);
|
|
|
|
return open_checked_file_dma(sstable_read_error, file_path, open_flags::ro).then([this, file_path] (file f) {
|
|
auto bufptr = allocate_aligned_buffer<char>(4096, 4096);
|
|
auto buf = bufptr.get();
|
|
|
|
auto fut = f.dma_read(0, buf, 4096);
|
|
return std::move(fut).then([this, f = std::move(f), bufptr = std::move(bufptr), file_path] (size_t size) mutable {
|
|
// This file is supposed to be very small. Theoretically we should check its size,
|
|
// but if we so much as read a whole page from it, there is definitely something fishy
|
|
// going on - and this simplifies the code.
|
|
if (size >= 4096) {
|
|
throw malformed_sstable_exception("SSTable too big: " + to_sstring(size) + " bytes", file_path);
|
|
}
|
|
|
|
std::experimental::string_view buf(bufptr.get(), size);
|
|
std::vector<sstring> comps;
|
|
|
|
boost::split(comps , buf, boost::is_any_of("\n"));
|
|
|
|
for (auto& c: comps) {
|
|
// accept trailing newlines
|
|
if (c == "") {
|
|
continue;
|
|
}
|
|
try {
|
|
_components.insert(reverse_map(c, _component_map));
|
|
} catch (std::out_of_range& oor) {
|
|
_components.clear(); // so subsequent read_toc will be forced to fail again
|
|
throw malformed_sstable_exception("Unrecognized TOC component: " + c, file_path);
|
|
}
|
|
}
|
|
if (!_components.size()) {
|
|
throw malformed_sstable_exception("Empty TOC", file_path);
|
|
}
|
|
return f.close().finally([f] {});
|
|
});
|
|
}).then_wrapped([file_path] (future<> f) {
|
|
try {
|
|
f.get();
|
|
} catch (std::system_error& e) {
|
|
if (e.code() == std::error_code(ENOENT, std::system_category())) {
|
|
throw malformed_sstable_exception(file_path + ": file not found");
|
|
}
|
|
}
|
|
});
|
|
|
|
}
|
|
|
|
// Populates _components with the set of components this sstable will have.
// Filter is included unless filter_fp_chance is exactly 1.0; CRC is included
// for uncompressed sstables and CompressionInfo otherwise.
void sstable::generate_toc(compressor c, double filter_fp_chance) {
    // Components that are always present, inserted in this order.
    for (auto always_present : { component_type::TOC,
                                 component_type::Statistics,
                                 component_type::Digest,
                                 component_type::Index,
                                 component_type::Summary,
                                 component_type::Data }) {
        _components.insert(always_present);
    }

    const bool has_filter = (filter_fp_chance != 1.0);
    if (has_filter) {
        _components.insert(component_type::Filter);
    }

    _components.insert(c == compressor::none ? component_type::CRC
                                             : component_type::CompressionInfo);
}
|
|
|
|
void sstable::write_toc(const io_priority_class& pc) {
|
|
auto file_path = filename(sstable::component_type::TemporaryTOC);
|
|
|
|
sstlog.debug("Writing TOC file {} ", file_path);
|
|
|
|
// Writing TOC content to temporary file.
|
|
// If creation of temporary TOC failed, it implies that that boot failed to
|
|
// delete a sstable with temporary for this column family, or there is a
|
|
// sstable being created in parallel with the same generation.
|
|
file f = new_sstable_component_file(sstable_write_error, file_path, open_flags::wo | open_flags::create | open_flags::exclusive).get0();
|
|
|
|
bool toc_exists = file_exists(filename(sstable::component_type::TOC)).get0();
|
|
if (toc_exists) {
|
|
// TOC will exist at this point if write_components() was called with
|
|
// the generation of a sstable that exists.
|
|
f.close().get();
|
|
remove_file(file_path).get();
|
|
throw std::runtime_error(sprint("SSTable write failed due to existence of TOC file for generation %ld of %s.%s", _generation, _ks, _cf));
|
|
}
|
|
|
|
file_output_stream_options options;
|
|
options.buffer_size = 4096;
|
|
options.io_priority_class = pc;
|
|
auto w = file_writer(std::move(f), std::move(options));
|
|
|
|
for (auto&& key : _components) {
|
|
// new line character is appended to the end of each component name.
|
|
auto value = _component_map[key] + "\n";
|
|
bytes b = bytes(reinterpret_cast<const bytes::value_type *>(value.c_str()), value.size());
|
|
write(w, b);
|
|
}
|
|
w.flush().get();
|
|
w.close().get();
|
|
|
|
// Flushing parent directory to guarantee that temporary TOC file reached
|
|
// the disk.
|
|
file dir_f = open_checked_directory(sstable_write_error, _dir).get0();
|
|
sstable_write_io_check([&] {
|
|
dir_f.flush().get();
|
|
dir_f.close().get();
|
|
});
|
|
}
|
|
|
|
// Seals the sstable by renaming the temporary TOC to its final name.
// The parent directory is flushed before the rename (so every component has
// reached the disk) and again after it (so the rename itself has), then closed.
future<> sstable::seal_sstable() {
    return open_checked_directory(sstable_write_error, _dir).then([this] (file dir) {
        // Step 1: flush the directory so every component is on disk.
        return sstable_write_io_check([&] {
            return dir.flush();
        }).then([this] {
            // Step 2: the TOC is no longer temporary.
            return sstable_write_io_check([&] {
                return engine().rename_file(filename(sstable::component_type::TemporaryTOC), filename(sstable::component_type::TOC));
            });
        }).then([this, dir] () mutable {
            // Step 3: make the rename durable.
            return sstable_write_io_check([&] {
                return dir.flush();
            });
        }).then([this, dir] () mutable {
            return sstable_write_io_check([&] {
                return dir.close();
            });
        }).then([this, dir] {
            // If this point was reached, sstable should be safe in disk.
            sstlog.debug("SSTable with generation {} of {}.{} was sealed successfully.", _generation, _ks, _cf);
        });
    });
}
|
|
|
|
void write_crc(const sstring file_path, checksum& c) {
|
|
sstlog.debug("Writing CRC file {} ", file_path);
|
|
|
|
auto oflags = open_flags::wo | open_flags::create | open_flags::exclusive;
|
|
file f = new_sstable_component_file(sstable_write_error, file_path, oflags).get0();
|
|
|
|
file_output_stream_options options;
|
|
options.buffer_size = 4096;
|
|
auto w = file_writer(std::move(f), std::move(options));
|
|
write(w, c);
|
|
w.close().get();
|
|
}
|
|
|
|
// Digest file stores the full checksum of data file converted into a string.
|
|
void write_digest(const sstring file_path, uint32_t full_checksum) {
|
|
sstlog.debug("Writing Digest file {} ", file_path);
|
|
|
|
auto oflags = open_flags::wo | open_flags::create | open_flags::exclusive;
|
|
auto f = new_sstable_component_file(sstable_write_error, file_path, oflags).get0();
|
|
|
|
file_output_stream_options options;
|
|
options.buffer_size = 4096;
|
|
auto w = file_writer(std::move(f), std::move(options));
|
|
|
|
auto digest = to_sstring<bytes>(full_checksum);
|
|
write(w, digest);
|
|
w.close().get();
|
|
}
|
|
|
|
// Out-of-class definitions of the per-thread downsampling caches; each is an
// array of BASE_SAMPLING_LEVEL vectors.
thread_local std::array<std::vector<int>, downsampling::BASE_SAMPLING_LEVEL>
        downsampling::_sample_pattern_cache;
thread_local std::array<std::vector<int>, downsampling::BASE_SAMPLING_LEVEL>
        downsampling::_original_index_cache;
|
|
|
|
// Reads the index entries covered by summary entry `summary_idx`.
// An out-of-range summary index yields an empty list. Otherwise the index file
// is read from the position recorded in that summary entry up to the position
// of the next summary entry (or the end of the index file for the last one).
future<index_list> sstable::read_indexes(uint64_t summary_idx, const io_priority_class& pc) {
    const auto summary_size = _summary.header.size;
    if (summary_idx >= summary_size) {
        return make_ready_future<index_list>(index_list());
    }

    uint64_t position = _summary.entries[summary_idx].position;
    uint64_t quantity = downsampling::get_effective_index_interval_after_index(
            summary_idx, _summary.header.sampling_level, _summary.header.min_index_interval);

    // The range ends where the next summary entry begins, if there is one.
    uint64_t next_idx = summary_idx + 1;
    uint64_t end = next_idx >= summary_size
            ? index_size()
            : _summary.entries[next_idx].position;

    return do_with(index_consumer(quantity), [this, position, end, &pc] (index_consumer& ic) {
        file_input_stream_options stream_opts;
        stream_opts.buffer_size = sstable_buffer_size;
        stream_opts.io_priority_class = pc;
        auto index_stream = make_file_input_stream(this->_index_file, position, end - position, std::move(stream_opts));

        // TODO: it's redundant to constrain the consumer here to stop at
        // index_size()-position, the input stream is already constrained.
        auto ctx = make_lw_shared<index_consume_entry_context<index_consumer>>(
                ic, std::move(index_stream), this->index_size() - position);

        auto consumed = ctx->consume_input(*ctx).finally([ctx] {
            return ctx->close();
        });
        return consumed.then([ctx, &ic] {
            return make_ready_future<index_list>(std::move(ic.indexes));
        });
    });
}
|
|
|
|
template <sstable::component_type Type, typename T>
|
|
future<> sstable::read_simple(T& component, const io_priority_class& pc) {
|
|
|
|
auto file_path = filename(Type);
|
|
sstlog.debug(("Reading " + _component_map[Type] + " file {} ").c_str(), file_path);
|
|
return open_file_dma(file_path, open_flags::ro).then([this, &component] (file fi) {
|
|
auto f = make_checked_file(sstable_read_error, fi);
|
|
auto r = make_lw_shared<file_random_access_reader>(std::move(f), sstable_buffer_size);
|
|
auto fut = parse(*r, component);
|
|
return fut.finally([r = std::move(r)] {
|
|
return r->close();
|
|
}).then([r] {});
|
|
}).then_wrapped([this, file_path] (future<> f) {
|
|
try {
|
|
f.get();
|
|
} catch (std::system_error& e) {
|
|
if (e.code() == std::error_code(ENOENT, std::system_category())) {
|
|
throw malformed_sstable_exception(file_path + ": file not found");
|
|
}
|
|
}
|
|
});
|
|
}
|
|
|
|
template <sstable::component_type Type, typename T>
|
|
void sstable::write_simple(T& component, const io_priority_class& pc) {
|
|
auto file_path = filename(Type);
|
|
sstlog.debug(("Writing " + _component_map[Type] + " file {} ").c_str(), file_path);
|
|
file f = new_sstable_component_file(sstable_write_error, file_path, open_flags::wo | open_flags::create | open_flags::exclusive).get0();
|
|
|
|
file_output_stream_options options;
|
|
options.buffer_size = sstable_buffer_size;
|
|
options.io_priority_class = pc;
|
|
auto w = file_writer(std::move(f), std::move(options));
|
|
write(w, component);
|
|
w.flush().get();
|
|
w.close().get();
|
|
}
|
|
|
|
template future<> sstable::read_simple<sstable::component_type::Filter>(sstables::filter& f, const io_priority_class& pc);
|
|
template void sstable::write_simple<sstable::component_type::Filter>(sstables::filter& f, const io_priority_class& pc);
|
|
|
|
// Loads the CompressionInfo component into _compression when the sstable has
// one; otherwise resolves immediately without reading anything.
future<> sstable::read_compression(const io_priority_class& pc) {
    // FIXME: If there is no compression, we should expect a CRC file to be present.
    if (has_component(sstable::component_type::CompressionInfo)) {
        return read_simple<component_type::CompressionInfo>(_compression, pc);
    }
    return make_ready_future<>();
}
|
|
|
|
void sstable::write_compression(const io_priority_class& pc) {
|
|
if (!has_component(sstable::component_type::CompressionInfo)) {
|
|
return;
|
|
}
|
|
|
|
write_simple<component_type::CompressionInfo>(_compression, pc);
|
|
}
|
|
|
|
// Loads the Statistics component into _statistics.
future<> sstable::read_statistics(const io_priority_class& pc) {
    return read_simple<component_type::Statistics>(_statistics, pc);
}
|
|
|
|
// Writes _statistics out as the Statistics component.
void sstable::write_statistics(const io_priority_class& pc) {
    write_simple<component_type::Statistics>(_statistics, pc);
}
|
|
|
|
void sstable::rewrite_statistics(const io_priority_class& pc) {
|
|
auto file_path = filename(component_type::TemporaryStatistics);
|
|
sstlog.debug("Rewriting statistics component of sstable {}", get_filename());
|
|
file f = new_sstable_component_file(sstable_write_error, file_path, open_flags::wo | open_flags::create | open_flags::truncate).get0();
|
|
|
|
file_output_stream_options options;
|
|
options.buffer_size = sstable_buffer_size;
|
|
options.io_priority_class = pc;
|
|
auto w = file_writer(std::move(f), std::move(options));
|
|
write(w, _statistics);
|
|
w.flush().get();
|
|
w.close().get();
|
|
// rename() guarantees atomicity when renaming a file into place.
|
|
sstable_write_io_check(rename_file, file_path, filename(component_type::Statistics)).get();
|
|
}
|
|
|
|
// Makes sure _summary is populated.
// Does nothing if the summary is already loaded. Otherwise reads the TOC and
// then either loads the Summary component or, when the component is absent or
// reading it fails, regenerates the summary via generate_summary().
future<> sstable::read_summary(const io_priority_class& pc) {
    if (_summary) {
        return make_ready_future<>();
    }

    return read_toc().then([this, &pc] {
        // We'll try to keep the main code path exception free, but if an exception does happen
        // we can try to regenerate the Summary.
        if (has_component(sstable::component_type::Summary)) {
            return read_simple<component_type::Summary>(_summary, pc).handle_exception([this, &pc] (auto ep) {
                // The logger takes {}-style placeholders (as every other sstlog
                // call in this file does); printf-style %s would be printed
                // literally and the arguments dropped from the message.
                sstlog.warn("Couldn't read summary file {}: {}. Recreating it.", this->filename(component_type::Summary), ep);
                return this->generate_summary(pc);
            });
        } else {
            return generate_summary(pc);
        }
    });
}
|
|
|
|
// Opens the Index and Data components read-only and records their sizes.
// The data file size goes to _compression.update() when the sstable has a
// CompressionInfo component and to _data_file_size otherwise. Finally
// _bytes_on_disk is recomputed as the sum of the sizes of all components.
future<> sstable::open_data() {
    auto index_opened = open_checked_file_dma(sstable_read_error, filename(component_type::Index), open_flags::ro);
    auto data_opened = open_checked_file_dma(sstable_read_error, filename(component_type::Data), open_flags::ro);

    return when_all(std::move(index_opened), std::move(data_opened)).then([this] (auto files) {
        _index_file = std::get<file>(std::get<0>(files).get());
        _data_file = std::get<file>(std::get<1>(files).get());

        return _data_file.size().then([this] (auto size) {
            if (!this->has_component(sstable::component_type::CompressionInfo)) {
                _data_file_size = size;
            } else {
                _compression.update(size);
            }
        }).then([this] {
            return _index_file.size();
        }).then([this] (auto size) {
            _index_file_size = size;
        }).then([this] {
            // Get disk usage for this sstable (includes all components).
            _bytes_on_disk = 0;
            return do_for_each(_components, [this] (component_type c) {
                auto component_size = sstable_write_io_check([&] {
                    return engine().file_size(this->filename(c));
                });
                return component_size.then([this] (uint64_t bytes) {
                    _bytes_on_disk += bytes;
                });
            });
        });
    });
}
|
|
|
|
// Creates the Index and Data component files (exclusive creation) and stores
// the resulting handles in _index_file and _data_file. The data file is opened
// with a 32MB extent allocation size hint.
future<> sstable::create_data() {
    const auto create_flags = open_flags::wo | open_flags::create | open_flags::exclusive;

    file_open_options data_options;
    data_options.extent_allocation_size_hint = 32 << 20;

    auto index_created = new_sstable_component_file(sstable_write_error, filename(component_type::Index), create_flags);
    auto data_created = new_sstable_component_file(sstable_write_error, filename(component_type::Data), create_flags, data_options);

    return when_all(std::move(index_created), std::move(data_created)).then([this] (auto files) {
        // FIXME: If both files could not be created, the first get below will
        // throw an exception, and second get() will not be attempted, and
        // we'll get a warning about the second future being destructed
        // without its exception being examined.
        _index_file = std::get<file>(std::get<0>(files).get());
        _data_file = std::get<file>(std::get<1>(files).get());
    });
}
|
|
|
|
// This interface is only used during tests, snapshot loading and early initialization.
|
|
// No need to set tunable priorities for it.
|
|
future<> sstable::load() {
|
|
return read_toc().then([this] {
|
|
return read_statistics(default_priority_class());
|
|
}).then([this] {
|
|
return read_compression(default_priority_class());
|
|
}).then([this] {
|
|
return read_filter(default_priority_class());
|
|
}).then([this] {;
|
|
return read_summary(default_priority_class());
|
|
}).then([this] {
|
|
return open_data();
|
|
});
|
|
}
|
|
|
|
// Appends one promoted-index block descriptor to `promoted_index`:
//   [u16 BE length][first column name]
//   [u16 BE length][last column name]
//   [u64 BE offset][u64 BE width]
static void output_promoted_index_entry(bytes_ostream& promoted_index,
        const bytes& first_col,
        const bytes& last_col,
        uint64_t offset, uint64_t width) {
    // Emits a column name prefixed by its 16-bit big-endian length.
    auto put_name = [&promoted_index] (const bytes& name) {
        char len_be[2];
        write_be(len_be, uint16_t(name.size()));
        promoted_index.write(len_be, 2);
        promoted_index.write(name);
    };
    // Emits a 64-bit big-endian integer.
    auto put_u64 = [&promoted_index] (uint64_t value) {
        char value_be[8];
        write_be(value_be, uint64_t(value));
        promoted_index.write(value_be, 8);
    };

    put_name(first_col);
    put_name(last_col);
    put_u64(offset);
    put_u64(width);
}
|
|
|
|
// FIXME: use this in write_column_name() instead of repeating the code
|
|
static bytes serialize_colname(const composite& clustering_key,
|
|
const std::vector<bytes_view>& column_names, composite::eoc marker) {
|
|
auto c = composite::from_exploded(column_names, marker);
|
|
auto ck_bview = bytes_view(clustering_key);
|
|
// The marker is not a component, so if the last component is empty (IOW,
|
|
// only serializes to the marker), then we just replace the key's last byte
|
|
// with the marker. If the component however it is not empty, then the
|
|
// marker should be in the end of it, and we just join them together as we
|
|
// do for any normal component
|
|
if (c.size() == 1) {
|
|
ck_bview.remove_suffix(1);
|
|
}
|
|
size_t sz = ck_bview.size() + c.size();
|
|
if (sz > std::numeric_limits<uint16_t>::max()) {
|
|
throw std::runtime_error(sprint("Column name too large (%d > %d)", sz, std::numeric_limits<uint16_t>::max()));
|
|
}
|
|
bytes colname(bytes::initialized_later(), sz);
|
|
std::copy(ck_bview.begin(), ck_bview.end(), colname.begin());
|
|
std::copy(c.get_bytes().begin(), c.get_bytes().end(), colname.begin() + ck_bview.size());
|
|
return colname;
|
|
}
|
|
|
|
// Call maybe_flush_pi_block() before writing the given sstable atom to the
|
|
// output. This may start a new promoted-index block depending on how much
|
|
// data we've already written since the start of the current block. Starting
|
|
// a new block involves both outputting the range of the old block to the
|
|
// index file, and outputting again the currently-open range tombstones to
|
|
// the data file.
|
|
// TODO: currently, maybe_flush_pi_block serializes the column name on every
|
|
// call, saving it in _pi_write.block_last_colname which we need for closing
|
|
// each block, as well as for closing the last block. We could instead save
|
|
// just the unprocessed arguments, and serialize them only when needed at the
|
|
// end of the block. For this we would need this function to take rvalue
|
|
// references (so data is moved in), and need not to use vector of byte_view
|
|
// (which might be gone later).
|
|
void sstable::maybe_flush_pi_block(file_writer& out,
|
|
const composite& clustering_key,
|
|
const std::vector<bytes_view>& column_names) {
|
|
bytes colname = serialize_colname(clustering_key, column_names, composite::eoc::none);
|
|
if (_pi_write.block_first_colname.empty()) {
|
|
// This is the first column in the partition, or first column since we
|
|
// closed a promoted-index block. Remember its name and position -
|
|
// we'll need to write it to the promoted index.
|
|
_pi_write.block_start_offset = out.offset();
|
|
_pi_write.block_next_start_offset = out.offset() + _pi_write.desired_block_size;
|
|
_pi_write.block_first_colname = colname;
|
|
_pi_write.block_last_colname = std::move(colname);
|
|
} else if (out.offset() >= _pi_write.block_next_start_offset) {
|
|
// If we wrote enough bytes to the partition since we output a sample
|
|
// to the promoted index, output one now and start a new one.
|
|
output_promoted_index_entry(_pi_write.data,
|
|
_pi_write.block_first_colname,
|
|
_pi_write.block_last_colname,
|
|
_pi_write.block_start_offset - _c_stats.start_offset,
|
|
out.offset() - _pi_write.block_start_offset);
|
|
_pi_write.numblocks++;
|
|
_pi_write.block_start_offset = out.offset();
|
|
// Because the new block can be read without the previous blocks, we
|
|
// need to repeat the range tombstones which are still open.
|
|
// Note that block_start_offset is before outputting those (so the new
|
|
// block includes them), but we set block_next_start_offset after - so
|
|
// even if we wrote a lot of open tombstones, we still get a full
|
|
// block size of new data.
|
|
if (!clustering_key.empty()) {
|
|
auto& rts = _pi_write.tombstone_accumulator->range_tombstones_for_row(
|
|
clustering_key_prefix(clustering_key.values()));
|
|
for (const auto& rt : rts) {
|
|
auto start = composite::from_clustering_element(*_pi_write.schemap, rt.start);
|
|
auto end = composite::from_clustering_element(*_pi_write.schemap, rt.end);
|
|
write_range_tombstone(out,
|
|
start, rt.start_kind, end, rt.end_kind, {}, rt.tomb);
|
|
}
|
|
}
|
|
_pi_write.block_next_start_offset = out.offset() + _pi_write.desired_block_size;
|
|
_pi_write.block_first_colname = colname;
|
|
_pi_write.block_last_colname = std::move(colname);
|
|
} else {
|
|
// Keep track of the last column in the partition - we'll need it to close
|
|
// the last block in the promoted index, unfortunately.
|
|
_pi_write.block_last_colname = std::move(colname);
|
|
}
|
|
}
|
|
|
|
// @clustering_key: it's expected that clustering key is already in its composite form.
|
|
// NOTE: empty clustering key means that there is no clustering key.
|
|
void sstable::write_column_name(file_writer& out, const composite& clustering_key, const std::vector<bytes_view>& column_names, composite::eoc marker) {
|
|
// FIXME: min_components and max_components also keep track of clustering
|
|
// prefix, so we must merge clustering_key and column_names somehow and
|
|
// pass the result to the functions below.
|
|
column_name_helper::min_max_components(_c_stats.min_column_names, _c_stats.max_column_names, column_names);
|
|
|
|
// was defined in the schema, for example.
|
|
auto c = composite::from_exploded(column_names, marker);
|
|
auto ck_bview = bytes_view(clustering_key);
|
|
|
|
// The marker is not a component, so if the last component is empty (IOW,
|
|
// only serializes to the marker), then we just replace the key's last byte
|
|
// with the marker. If the component however it is not empty, then the
|
|
// marker should be in the end of it, and we just join them together as we
|
|
// do for any normal component
|
|
if (c.size() == 1) {
|
|
ck_bview.remove_suffix(1);
|
|
}
|
|
size_t sz = ck_bview.size() + c.size();
|
|
if (sz > std::numeric_limits<uint16_t>::max()) {
|
|
throw std::runtime_error(sprint("Column name too large (%d > %d)", sz, std::numeric_limits<uint16_t>::max()));
|
|
}
|
|
uint16_t sz16 = sz;
|
|
write(out, sz16, ck_bview, c);
|
|
}
|
|
|
|
// Writes a plain column name as a 16-bit length followed by the raw bytes,
// after folding the name into the min/max column name statistics.
// Throws std::runtime_error when the name does not fit a 16-bit length.
void sstable::write_column_name(file_writer& out, bytes_view column_names) {
    column_name_helper::min_max_components(_c_stats.min_column_names, _c_stats.max_column_names, { column_names });

    size_t sz = column_names.size();
    const bool fits_in_u16 = sz <= std::numeric_limits<uint16_t>::max();
    if (!fits_in_u16) {
        throw std::runtime_error(sprint("Column name too large (%d > %d)", sz, std::numeric_limits<uint16_t>::max()));
    }

    uint16_t sz16 = sz;
    write(out, sz16, column_names);
}
|
|
|
|
|
|
// Accounts for one written cell: widens the min/max timestamp range with
// `timestamp` and bumps the column counter.
static inline void update_cell_stats(column_stats& c_stats, uint64_t timestamp) {
    c_stats.update_min_timestamp(timestamp);
    c_stats.update_max_timestamp(timestamp);
    ++c_stats.column_count;
}
|
|
|
|
// Intended to write all cell components that follow column name.
|
|
void sstable::write_cell(file_writer& out, atomic_cell_view cell) {
|
|
// FIXME: counter cell isn't supported yet.
|
|
|
|
uint64_t timestamp = cell.timestamp();
|
|
|
|
update_cell_stats(_c_stats, timestamp);
|
|
|
|
if (cell.is_dead(_now)) {
|
|
// tombstone cell
|
|
|
|
column_mask mask = column_mask::deletion;
|
|
uint32_t deletion_time_size = sizeof(uint32_t);
|
|
uint32_t deletion_time = cell.deletion_time().time_since_epoch().count();
|
|
|
|
_c_stats.update_max_local_deletion_time(deletion_time);
|
|
_c_stats.tombstone_histogram.update(deletion_time);
|
|
|
|
write(out, mask, timestamp, deletion_time_size, deletion_time);
|
|
} else if (cell.is_live_and_has_ttl()) {
|
|
// expiring cell
|
|
|
|
column_mask mask = column_mask::expiration;
|
|
uint32_t ttl = cell.ttl().count();
|
|
uint32_t expiration = cell.expiry().time_since_epoch().count();
|
|
disk_string_view<uint32_t> cell_value { cell.value() };
|
|
|
|
_c_stats.update_max_local_deletion_time(expiration);
|
|
|
|
write(out, mask, ttl, expiration, timestamp, cell_value);
|
|
} else {
|
|
// regular cell
|
|
|
|
column_mask mask = column_mask::none;
|
|
disk_string_view<uint32_t> cell_value { cell.value() };
|
|
|
|
_c_stats.update_max_local_deletion_time(std::numeric_limits<int>::max());
|
|
|
|
write(out, mask, timestamp, cell_value);
|
|
}
|
|
}
|
|
|
|
// Writes the row marker of a clustered row as a cell with an empty column
// name component and an empty value. A missing marker writes nothing.
// The marker is written as a deletion, an expiring cell or a regular cell
// depending on its state, and the column statistics are updated accordingly.
void sstable::write_row_marker(file_writer& out, const row_marker& marker, const composite& clustering_key) {
    if (marker.is_missing()) {
        return;
    }

    // Write row mark cell to the beginning of clustered row.
    write_column_name(out, clustering_key, { bytes_view() });
    uint64_t timestamp = marker.timestamp();
    uint32_t value_length = 0;

    update_cell_stats(_c_stats, timestamp);

    if (marker.is_dead(_now)) {
        column_mask mask = column_mask::deletion;
        uint32_t deletion_time_size = sizeof(uint32_t);
        uint32_t deletion_time = marker.deletion_time().time_since_epoch().count();

        // Keep the statistics in line with write_cell(): a dead marker
        // contributes its deletion time to max_local_deletion_time too.
        _c_stats.update_max_local_deletion_time(deletion_time);
        _c_stats.tombstone_histogram.update(deletion_time);

        write(out, mask, timestamp, deletion_time_size, deletion_time);
    } else if (marker.is_expiring()) {
        column_mask mask = column_mask::expiration;
        uint32_t ttl = marker.ttl().count();
        uint32_t expiration = marker.expiry().time_since_epoch().count();

        // As for expiring cells, the expiration point counts towards
        // max_local_deletion_time.
        _c_stats.update_max_local_deletion_time(expiration);

        write(out, mask, ttl, expiration, timestamp, value_length);
    } else {
        column_mask mask = column_mask::none;
        write(out, mask, timestamp, value_length);
    }
}
|
|
|
|
// Writes a range tombstone atom: start name, range_tombstone mask, end name,
// then the deletion time and timestamp. An empty tombstone writes nothing.
// An exclusive start bound is encoded with eoc::end and an exclusive end bound
// with eoc::start; inclusive bounds use eoc::start and eoc::end respectively.
void sstable::write_range_tombstone(file_writer& out,
        const composite& start,
        bound_kind start_kind,
        const composite& end,
        bound_kind end_kind,
        std::vector<bytes_view> suffix,
        const tombstone t) {
    if (!t) {
        return;
    }

    composite::eoc start_marker = composite::eoc::start;
    if (start_kind == bound_kind::excl_start) {
        start_marker = composite::eoc::end;
    }
    composite::eoc end_marker = composite::eoc::end;
    if (end_kind == bound_kind::excl_end) {
        end_marker = composite::eoc::start;
    }

    write_column_name(out, start, suffix, start_marker);
    column_mask mask = column_mask::range_tombstone;
    write(out, mask);
    write_column_name(out, end, suffix, end_marker);

    uint64_t timestamp = t.timestamp;
    uint32_t deletion_time = t.deletion_time.time_since_epoch().count();

    update_cell_stats(_c_stats, timestamp);
    _c_stats.update_max_local_deletion_time(deletion_time);
    _c_stats.tombstone_histogram.update(deletion_time);

    write(out, deletion_time, timestamp);
}
|
|
|
|
// Writes a collection column: first the collection's tombstone as a range
// tombstone over the column, then every element as an individual cell named
// by (column name, element key).
void sstable::write_collection(file_writer& out, const composite& clustering_key, const column_definition& cdef, collection_mutation_view collection) {
    auto collection_type = static_pointer_cast<const collection_type_impl>(cdef.type);
    auto mview = collection_type->deserialize_mutation_form(collection);
    const bytes& column_name = cdef.name();

    write_range_tombstone(out, clustering_key, clustering_key, { bytes_view(column_name) }, mview.tomb);

    for (auto& element : mview.cells) {
        maybe_flush_pi_block(out, clustering_key, { column_name, element.first });
        write_column_name(out, clustering_key, { column_name, element.first });
        write_cell(out, element.second);
    }
}
|
|
|
|
// write_datafile_clustered_row() is about writing a clustered_row to data file according to SSTables format.
|
|
// clustered_row contains a set of cells sharing the same clustering key.
|
|
void sstable::write_clustered_row(file_writer& out, const schema& schema, const clustering_row& clustered_row) {
|
|
auto clustering_key = composite::from_clustering_element(schema, clustered_row.key());
|
|
|
|
if (schema.is_compound() && !schema.is_dense()) {
|
|
maybe_flush_pi_block(out, clustering_key, { bytes_view() });
|
|
write_row_marker(out, clustered_row.marker(), clustering_key);
|
|
}
|
|
// Before writing cells, range tombstone must be written if the row has any (deletable_row::t).
|
|
if (clustered_row.tomb()) {
|
|
maybe_flush_pi_block(out, clustering_key, {});
|
|
write_range_tombstone(out, clustering_key, clustering_key, {}, clustered_row.tomb());
|
|
// Because we currently may break a partition to promoted-index blocks
|
|
// in the middle of a clustered row, we also need to track the current
|
|
// row's tombstone - not just range tombstones - which may effect the
|
|
// beginning of a new block.
|
|
// TODO: consider starting a new block only between rows, so the
|
|
// following code can be dropped:
|
|
_pi_write.tombstone_accumulator->apply(range_tombstone(
|
|
clustered_row.key(), bound_kind::incl_start,
|
|
clustered_row.key(), bound_kind::incl_end, clustered_row.tomb()));
|
|
}
|
|
|
|
// Write all cells of a partition's row.
|
|
clustered_row.cells().for_each_cell([&] (column_id id, const atomic_cell_or_collection& c) {
|
|
auto&& column_definition = schema.regular_column_at(id);
|
|
// non atomic cell isn't supported yet. atomic cell maps to a single trift cell.
|
|
// non atomic cell maps to multiple trift cell, e.g. collection.
|
|
if (!column_definition.is_atomic()) {
|
|
write_collection(out, clustering_key, column_definition, c.as_collection_mutation());
|
|
return;
|
|
}
|
|
assert(column_definition.is_regular());
|
|
atomic_cell_view cell = c.as_atomic_cell();
|
|
const bytes& column_name = column_definition.name();
|
|
|
|
if (schema.is_compound()) {
|
|
if (schema.is_dense()) {
|
|
maybe_flush_pi_block(out, composite(), { bytes_view(clustering_key) });
|
|
write_column_name(out, bytes_view(clustering_key));
|
|
} else {
|
|
maybe_flush_pi_block(out, clustering_key, { bytes_view(column_name) });
|
|
write_column_name(out, clustering_key, { bytes_view(column_name) });
|
|
}
|
|
} else {
|
|
if (schema.is_dense()) {
|
|
maybe_flush_pi_block(out, composite(), { bytes_view(clustered_row.key().get_component(schema, 0)) });
|
|
write_column_name(out, bytes_view(clustered_row.key().get_component(schema, 0)));
|
|
} else {
|
|
maybe_flush_pi_block(out, composite(), { bytes_view(column_name) });
|
|
write_column_name(out, bytes_view(column_name));
|
|
}
|
|
}
|
|
write_cell(out, cell);
|
|
});
|
|
}
|
|
|
|
// Writes every cell of the static row. Each cell name is the schema's static
// prefix followed by the column name; collections are delegated to
// write_collection().
void sstable::write_static_row(file_writer& out, const schema& schema, const row& static_row) {
    static_row.for_each_cell([&] (column_id id, const atomic_cell_or_collection& c) {
        auto&& column_definition = schema.static_column_at(id);

        if (column_definition.is_atomic()) {
            assert(column_definition.is_static());
            atomic_cell_view cell = c.as_atomic_cell();
            auto static_prefix = composite::static_prefix(schema);
            maybe_flush_pi_block(out, static_prefix, { bytes_view(column_definition.name()) });
            write_column_name(out, static_prefix, { bytes_view(column_definition.name()) });
            write_cell(out, cell);
        } else {
            auto static_prefix = composite::static_prefix(schema);
            write_collection(out, static_prefix, column_definition, c.as_collection_mutation());
        }
    });
}
|
|
|
|
// Writes the fixed part of an index entry: the partition key followed by the
// position `pos`.
static void write_index_header(file_writer& out, disk_string_view<uint16_t>& key, uint64_t pos) {
    write(out, key, pos);
}
|
|
|
|
// Writes the promoted-index part of an index entry. An empty promoted index
// is written as a lone zero size. Otherwise the size (payload plus 16 bytes
// for deltime and numblocks) is followed by deltime, numblocks and the
// payload itself.
static void write_index_promoted(file_writer& out, bytes_ostream& promoted_index,
        deletion_time deltime, uint32_t numblocks) {
    uint32_t promoted_index_size = promoted_index.size();
    if (!promoted_index_size) {
        write(out, promoted_index_size);
        return;
    }

    promoted_index_size += 16 /* deltime + numblocks */;
    write(out, promoted_index_size, deltime, numblocks, promoted_index);
}
|
|
|
|
// Initializes the summary header before writing starts: records the minimum
// index interval, sets the base sampling level and resets the key counter and
// memory size. Throws malformed_sstable_exception when the expected number of
// summary entries would not fit in 32 bits.
static void prepare_summary(summary& s, uint64_t expected_partition_count, uint32_t min_index_interval) {
    assert(expected_partition_count >= 1);

    s.header.min_index_interval = min_index_interval;
    s.header.sampling_level = downsampling::BASE_SAMPLING_LEVEL;

    // One entry per full interval, plus one for a trailing partial interval.
    const uint64_t full_intervals = expected_partition_count / min_index_interval;
    const uint64_t partial_interval = (expected_partition_count % min_index_interval) != 0 ? 1 : 0;
    uint64_t max_expected_entries = full_intervals + partial_interval;

    // FIXME: handle case where max_expected_entries is greater than max value stored by uint32_t.
    if (max_expected_entries > std::numeric_limits<uint32_t>::max()) {
        throw malformed_sstable_exception("Current sampling level (" + to_sstring(downsampling::BASE_SAMPLING_LEVEL) + ") not enough to generate summary.");
    }

    s.keys_written = 0;
    s.header.memory_size = 0;
}
|
|
|
|
// Finalizes the summary after all entries were collected: fills in the header
// sizes, computes the position of each entry, and stores the first and last
// partition keys. When `last_key` is not engaged the first key is used for
// both.
static void seal_summary(summary& s,
        std::experimental::optional<key>&& first_key,
        std::experimental::optional<key>&& last_key) {
    auto& header = s.header;
    header.size = s.entries.size();
    header.size_at_full_sampling = header.size;

    // Entry data starts right after a table of one uint32_t per entry.
    header.memory_size = header.size * sizeof(uint32_t);
    for (auto& entry : s.entries) {
        s.positions.push_back(header.memory_size);
        header.memory_size += entry.key.size() + sizeof(entry.position);
    }

    assert(first_key); // assume non-empty sstable
    s.first_key.value = first_key->get_bytes();

    if (!last_key) {
        // A missing last key indicates we had just one partition.
        s.last_key.value = s.first_key.value;
    } else {
        s.last_key.value = last_key->get_bytes();
    }
}
|
|
|
|
// Sets up the compression metadata from the schema's compressor parameters:
// compressor, chunk length, zero data length, a fixed crc_check_chance option
// and an initialized full checksum.
static void prepare_compression(compression& c, const schema& schema) {
    const auto& params = schema.get_compressor_params();

    c.set_compressor(params.get_compressor());
    c.chunk_len = params.chunk_length();
    c.data_len = 0;

    // FIXME: crc_check_chance can be configured by the user.
    // It is the probability to verify the checksum of a compressed chunk we
    // read, and defaults to 1.0.
    c.options.elements.push_back({"crc_check_chance", "1.0"});

    c.init_full_checksum();
}
|
|
|
|
// Counts the key and, for every min_index_interval-th key (starting with the
// very first one), adds an entry to the in-memory representation of the
// summary file.
static void maybe_add_summary_entry(summary& s, bytes_view key, uint64_t offset) {
    const auto key_ordinal = s.keys_written++;
    if ((key_ordinal % s.header.min_index_interval) != 0) {
        return;
    }
    s.entries.push_back({ bytes(key.data(), key.size()), offset });
}
|
|
|
|
// In the beginning of the statistics file, there is a disk_hash used to
|
|
// map each metadata type to its correspondent position in the file.
|
|
static void seal_statistics(statistics& s, metadata_collector& collector,
|
|
const sstring partitioner, double bloom_filter_fp_chance) {
|
|
static constexpr int METADATA_TYPE_COUNT = 3;
|
|
|
|
size_t old_offset, offset = 0;
|
|
// account disk_hash size.
|
|
offset += sizeof(uint32_t);
|
|
// account disk_hash members.
|
|
offset += (METADATA_TYPE_COUNT * (sizeof(metadata_type) + sizeof(uint32_t)));
|
|
|
|
validation_metadata validation;
|
|
compaction_metadata compaction;
|
|
stats_metadata stats;
|
|
|
|
old_offset = offset;
|
|
validation.partitioner.value = to_bytes(partitioner);
|
|
validation.filter_chance = bloom_filter_fp_chance;
|
|
offset += validation.serialized_size();
|
|
s.contents[metadata_type::Validation] = std::make_unique<validation_metadata>(std::move(validation));
|
|
s.hash.map[metadata_type::Validation] = old_offset;
|
|
|
|
old_offset = offset;
|
|
collector.construct_compaction(compaction);
|
|
offset += compaction.serialized_size();
|
|
s.contents[metadata_type::Compaction] = std::make_unique<compaction_metadata>(std::move(compaction));
|
|
s.hash.map[metadata_type::Compaction] = old_offset;
|
|
|
|
collector.construct_stats(stats);
|
|
// NOTE: method serialized_size of stats_metadata must be implemented for
|
|
// a new type of compaction to get supported.
|
|
s.contents[metadata_type::Stats] = std::make_unique<stats_metadata>(std::move(stats));
|
|
s.hash.map[metadata_type::Stats] = offset;
|
|
}
|
|
|
|
// Returns offset into data component.
|
|
size_t components_writer::get_offset() {
|
|
if (_sst.has_component(sstable::component_type::CompressionInfo)) {
|
|
// Variable returned by compressed_file_length() is constantly updated by compressed output stream.
|
|
return _sst._compression.compressed_file_length();
|
|
} else {
|
|
return _out.offset();
|
|
}
|
|
}
|
|
|
|
// Builds a file_writer over the sstable's index file using the sstable's
// buffer size and the given I/O priority class.
file_writer components_writer::index_file_writer(sstable& sst, const io_priority_class& pc) {
    file_output_stream_options stream_opts;
    stream_opts.buffer_size = sst.sstable_buffer_size;
    stream_opts.io_priority_class = pc;
    return file_writer(sst._index_file, std::move(stream_opts));
}
|
|
|
|
// Get the currently loaded configuration, or the default configuration in
|
|
// case none has been loaded (this happens, for example, in unit tests).
|
|
static const db::config& get_config() {
|
|
if (service::get_storage_service().local_is_initialized() &&
|
|
service::get_local_storage_service().db().local_is_initialized()) {
|
|
return service::get_local_storage_service().db().local().get_config();
|
|
} else {
|
|
static db::config default_config;
|
|
return default_config;
|
|
}
|
|
}
|
|
|
|
// Sets up writing of an sstable's components: opens a writer over the index
// file, creates the bloom filter sized for `estimated_partitions`, takes the
// promoted-index block size from the configuration (column_index_size_in_kb)
// and prepares the summary header.
components_writer::components_writer(sstable& sst, const schema& s, file_writer& out,
        uint64_t estimated_partitions, uint64_t max_sstable_size,
        const io_priority_class& pc)
    : _sst(sst)
    , _schema(s)
    , _out(out)
    , _index(index_file_writer(sst, pc))
    , _max_sstable_size(max_sstable_size)
    , _tombstone_written(false)
{
    _sst._filter = utils::i_filter::get_filter(estimated_partitions, _schema.bloom_filter_fp_chance());

    const auto& cfg = get_config();
    _sst._pi_write.desired_block_size = cfg.column_index_size_in_kb() * 1024;

    prepare_summary(_sst._summary, estimated_partitions, _schema.min_index_interval());

    // FIXME: we may need to set repaired_at stats at this point.
}
|
|
|
|
// Starts a new partition: registers its key with the summary, the filter and
// the metadata collector, writes the index entry header, resets the
// promoted-index state and writes the partition key to the data file.
void components_writer::consume_new_partition(const dht::decorated_key& dk) {
    // Nothing is written to the data file until the key is emitted at the end,
    // so this offset stays valid for every use below.
    const auto partition_start = _out.offset();

    // Remember where the partition starts, to later compute the row size.
    _sst._c_stats.start_offset = partition_start;

    _partition_key = key::from_partition_key(_schema, dk.key());
    const auto key_view = bytes_view(*_partition_key);

    maybe_add_summary_entry(_sst._summary, key_view, _index.offset());
    _sst._filter->add(key_view);
    _sst._collector.add_key(key_view);

    disk_string_view<uint16_t> p_key { key_view };

    // Write the index entry minus the "promoted index" (sample of columns)
    // part. That part can only be written after the entire partition was
    // processed and the sample of columns collected.
    write_index_header(_index, p_key, partition_start);

    auto& pi = _sst._pi_write;
    pi.data = {};
    pi.numblocks = 0;
    pi.deltime.local_deletion_time = std::numeric_limits<int32_t>::max();
    pi.deltime.marked_for_delete_at = std::numeric_limits<int64_t>::min();
    pi.block_start_offset = partition_start;
    pi.tombstone_accumulator = range_tombstone_accumulator(_schema, false);
    pi.schemap = &_schema; // sadly we need this

    // Write partition key into data file.
    write(_out, p_key);

    _tombstone_written = false;
}
|
|
|
|
// Writes the partition-level deletion time to the data file.
//
// A set tombstone is also folded into the column statistics; an unset one is
// serialized using the "live" defaults. The deletion time written is also
// remembered for the promoted index.
void components_writer::consume(tombstone t) {
    deletion_time dt;

    if (!t) {
        // Default values for live, undeleted rows.
        dt.local_deletion_time = std::numeric_limits<int32_t>::max();
        dt.marked_for_delete_at = std::numeric_limits<int64_t>::min();
    } else {
        dt.local_deletion_time = t.deletion_time.time_since_epoch().count();
        dt.marked_for_delete_at = t.timestamp;

        auto& stats = _sst._c_stats;
        stats.tombstone_histogram.update(dt.local_deletion_time);
        stats.update_max_local_deletion_time(dt.local_deletion_time);
        stats.update_min_timestamp(dt.marked_for_delete_at);
        stats.update_max_timestamp(dt.marked_for_delete_at);
    }

    write(_out, dt);
    _tombstone_written = true;
    // TODO: need to verify we don't do this twice?
    _sst._pi_write.deltime = dt;
}
|
|
|
|
// Writes the static row of the current partition. The partition tombstone
// must precede any row data, so it is emitted first if it has not been yet.
// Never asks the producer to stop.
stop_iteration components_writer::consume(static_row&& sr) {
    ensure_tombstone_is_written();
    _sst.write_static_row(_out, _schema, sr.cells());

    return stop_iteration::no;
}
|
|
|
|
// Writes one clustering row of the current partition, emitting the partition
// tombstone first if it has not been written yet. Never asks the producer to
// stop.
stop_iteration components_writer::consume(clustering_row&& cr) {
    ensure_tombstone_is_written();
    _sst.write_clustered_row(_out, _schema, cr);

    return stop_iteration::no;
}
|
|
|
|
// Writes a range tombstone of the current partition.
//
// The tombstone is first recorded in the promoted-index accumulator, then the
// current promoted-index block is possibly flushed, and finally the tombstone
// is serialized to the data file. Never asks the producer to stop.
stop_iteration components_writer::consume(range_tombstone&& rt) {
    ensure_tombstone_is_written();

    // Keep track of the range tombstone: when a new promoted index block is
    // opened we must know which ranges are still open so they can be repeated
    // in the data file. apply() also discards ranges already closed by
    // rt.start, which keeps the accumulator from growing without bound.
    _sst._pi_write.tombstone_accumulator->apply(rt);

    auto first_bound = composite::from_clustering_element(_schema, std::move(rt.start));
    auto last_bound = composite::from_clustering_element(_schema, std::move(rt.end));

    _sst.maybe_flush_pi_block(_out, first_bound, {});
    _sst.write_range_tombstone(_out, std::move(first_bound), rt.start_kind, std::move(last_bound), rt.end_kind, {}, rt.tomb);

    return stop_iteration::no;
}
|
|
|
|
// Finishes the current partition.
//
// Completes the promoted index and appends it to the index entry, writes the
// end-of-row marker to the data file, merges the partition's column
// statistics into the collector and updates the first/last key bookkeeping.
//
// Returns stop_iteration::yes once the output has reached the configured
// maximum sstable size, stop_iteration::no otherwise.
stop_iteration components_writer::consume_end_of_partition() {
    auto& pi = _sst._pi_write;

    // An incomplete trailing block is written out too - unless the promoted
    // index is still empty. A promoted index made of a single chunk is not
    // worth having, so in that case none is emitted at all.
    const bool flush_open_block = !(pi.data.empty() || pi.block_first_colname.empty());
    if (flush_open_block) {
        output_promoted_index_entry(pi.data,
                pi.block_first_colname,
                pi.block_last_colname,
                pi.block_start_offset - _sst._c_stats.start_offset,
                _out.offset() - pi.block_start_offset);
        pi.numblocks++;
    }
    write_index_promoted(_index, pi.data, pi.deltime, pi.numblocks);

    // Reset unconditionally, so the next partition starts with a cleared
    // promoted index buffer and a cleared first column name.
    pi.data = {};
    pi.block_first_colname = {};

    ensure_tombstone_is_written();
    int16_t end_of_row = 0;
    write(_out, end_of_row);

    // Size of the row (partition) that was just completed.
    _sst._c_stats.row_size = _out.offset() - _sst._c_stats.start_offset;
    // update() merges these column_stats into what the collector holds.
    _sst._collector.update(std::move(_sst._c_stats));
    _sst._c_stats.reset();

    if (!_first_key) {
        _first_key = *_partition_key;
    }
    _last_key = std::move(*_partition_key);

    if (get_offset() < _max_sstable_size) {
        return stop_iteration::no;
    }
    return stop_iteration::yes;
}
|
|
|
|
void components_writer::consume_end_of_stream() {
|
|
seal_summary(_sst._summary, std::move(_first_key), std::move(_last_key)); // what if there is only one partition? what if it is empty?
|
|
|
|
_index.close().get();
|
|
_sst._index_file = file(); // index->close() closed _index_file
|
|
|
|
if (_sst.has_component(sstable::component_type::CompressionInfo)) {
|
|
_sst._collector.add_compression_ratio(_sst._compression.compressed_file_length(), _sst._compression.uncompressed_file_length());
|
|
}
|
|
|
|
// NOTE: Cassandra gets partition name by calling getClass().getCanonicalName() on
|
|
// partition class.
|
|
seal_statistics(_sst._statistics, _sst._collector, dht::global_partitioner().name(), _schema.bloom_filter_fp_chance());
|
|
}
|
|
|
|
// Writes the contents of a memtable as this sstable.
//
// Records the memtable's replay position in the metadata collector and then
// delegates to the mutation_reader overload, with no limit on sstable size.
future<> sstable::write_components(memtable& mt, bool backup, const io_priority_class& pc, bool leave_unsealed) {
    _collector.set_replay_position(mt.replay_position());

    const auto unlimited_size = std::numeric_limits<uint64_t>::max();
    return write_components(mt.make_reader(mt.schema()),
            mt.partition_count(), mt.schema(), unlimited_size, backup, pc, leave_unsealed);
}
|
|
|
|
void sstable_writer::prepare_file_writer()
|
|
{
|
|
file_output_stream_options options;
|
|
options.io_priority_class = _pc;
|
|
options.buffer_size = _sst.sstable_buffer_size;
|
|
|
|
if (!_compression_enabled) {
|
|
_writer = make_shared<checksummed_file_writer>(_sst._data_file, std::move(options), true);
|
|
} else {
|
|
prepare_compression(_sst._compression, _schema);
|
|
_writer = make_shared<file_writer>(make_compressed_file_output_stream(_sst._data_file, std::move(options), &_sst._compression));
|
|
}
|
|
}
|
|
|
|
void sstable_writer::finish_file_writer()
|
|
{
|
|
_writer->close().get();
|
|
_sst._data_file = file(); // w->close() closed _data_file
|
|
|
|
if (!_compression_enabled) {
|
|
auto chksum_wr = static_pointer_cast<checksummed_file_writer>(_writer);
|
|
write_digest(_sst.filename(sstable::component_type::Digest), chksum_wr->full_checksum());
|
|
write_crc(_sst.filename(sstable::component_type::CRC), chksum_wr->finalize_checksum());
|
|
} else {
|
|
write_digest(_sst.filename(sstable::component_type::Digest), _sst._compression.full_checksum());
|
|
}
|
|
}
|
|
|
|
// Prepares `sst` for writing: generates and writes the TOC, creates the data
// file, sets up the data file writer and constructs the components writer
// that consumes the mutation fragments.
//
// Blocks on futures with get().
sstable_writer::sstable_writer(sstable& sst, const schema& s, uint64_t estimated_partitions,
                               uint64_t max_sstable_size, bool backup, bool leave_unsealed, const io_priority_class& pc)
    : _sst(sst)
    , _schema(s)
    , _pc(pc)
    , _backup(backup)
    , _leave_unsealed(leave_unsealed)
{
    _sst.generate_toc(_schema.get_compressor_params().get_compressor(), _schema.bloom_filter_fp_chance());
    _sst.write_toc(_pc);
    _sst.create_data().get();

    // Compression is considered enabled when the TOC has no CRC component.
    _compression_enabled = !_sst.has_component(sstable::component_type::CRC);
    prepare_file_writer();

    _components_writer.emplace(_sst, _schema, *_writer, estimated_partitions, max_sstable_size, _pc);
}
|
|
|
|
void sstable_writer::consume_end_of_stream()
|
|
{
|
|
_components_writer->consume_end_of_stream();
|
|
_components_writer = stdx::nullopt;
|
|
finish_file_writer();
|
|
_sst.write_summary(_pc);
|
|
_sst.write_filter(_pc);
|
|
_sst.write_statistics(_pc);
|
|
// NOTE: write_compression means maybe_write_compression.
|
|
_sst.write_compression(_pc);
|
|
|
|
if (!_leave_unsealed) {
|
|
_sst.seal_sstable(_backup).get();
|
|
}
|
|
}
|
|
|
|
// Seals the sstable and, when `backup` is set, afterwards hard-links its
// components into the "backups/" subdirectory of the sstable's directory.
future<> sstable::seal_sstable(bool backup)
{
    return seal_sstable().then([this, backup] {
        if (!backup) {
            return make_ready_future<>();
        }
        auto backup_dir = get_dir() + "/backups/";
        return sstable_write_io_check(touch_directory, backup_dir).then([this, backup_dir] {
            return create_links(backup_dir);
        });
    });
}
|
|
|
|
// Creates a writer which consumes mutation fragments into this sstable.
// Note that sstable_writer takes its arguments in a different order
// (leave_unsealed before pc).
sstable_writer sstable::get_writer(const schema& s, uint64_t estimated_partitions, uint64_t max_sstable_size,
                                   bool backup, const io_priority_class& pc, bool leave_unsealed)
{
    return sstable_writer(*this, s, estimated_partitions, max_sstable_size, backup, leave_unsealed, pc);
}
|
|
|
|
// Writes everything produced by `mr` into this sstable.
//
// The work runs inside a seastar thread (seastar::async), where a writer is
// created and fed with the flattened stream of the reader.
future<> sstable::write_components(::mutation_reader mr,
        uint64_t estimated_partitions, schema_ptr schema, uint64_t max_sstable_size, bool backup, const io_priority_class& pc, bool leave_unsealed) {
    return seastar::async([this, mr = std::move(mr), estimated_partitions, schema = std::move(schema), max_sstable_size, backup, &pc, leave_unsealed] () mutable {
        auto writer = get_writer(*schema, estimated_partitions, max_sstable_size, backup, pc, leave_unsealed);
        consume_flattened_in_thread(mr, writer);
    });
}
|
|
|
|
// Rebuilds the in-memory summary from the Index component.
//
// Does nothing if a summary is already present. Otherwise the index file is
// opened read-only, every index entry is fed into the summary, the summary is
// sealed with the first/last keys seen, and the index file is closed.
future<> sstable::generate_summary(const io_priority_class& pc) {
    if (_summary) {
        return make_ready_future<>();
    }

    sstlog.info("Summary file {} not found. Generating Summary...", filename(sstable::component_type::Summary));

    // Consumer of index entries: offers each entry to the summary and tracks
    // the first key seen and the most recent key seen after it.
    class summary_generator {
        summary& _summary;
    public:
        std::experimental::optional<key> first_key, last_key;

        summary_generator(summary& s) : _summary(s) {}
        bool should_continue() {
            return true;
        }
        void consume_entry(index_entry&& ie) {
            maybe_add_summary_entry(_summary, ie.get_key_bytes(), ie.position());
            if (first_key) {
                last_key = key(to_bytes(ie.get_key_bytes()));
            } else {
                first_key = key(to_bytes(ie.get_key_bytes()));
            }
        }
    };

    return open_checked_file_dma(sstable_read_error, filename(component_type::Index), open_flags::ro).then([this, &pc] (file index_file) {
        return do_with(std::move(index_file), [this, &pc] (file index_file) {
            return index_file.size().then([this, &pc, index_file] (auto index_size) {
                // Merely an upper bound; the real partition count is surely lower.
                auto estimated_partitions = index_size / sizeof(uint64_t);
                // There is no summary to take min_index_interval from, so a
                // default is used; resampling can happen later if needed.
                prepare_summary(_summary, estimated_partitions, 0x80);

                file_input_stream_options options;
                options.buffer_size = sstable_buffer_size;
                options.io_priority_class = pc;
                auto stream = make_file_input_stream(index_file, 0, index_size, std::move(options));
                return do_with(summary_generator(_summary), [this, &pc, stream = std::move(stream), index_size] (summary_generator& s) mutable {
                    auto ctx = make_lw_shared<index_consume_entry_context<summary_generator>>(s, std::move(stream), index_size);
                    return ctx->consume_input(*ctx).finally([ctx] {
                        return ctx->close();
                    }).then([this, ctx, &s] {
                        seal_summary(_summary, std::move(s.first_key), std::move(s.last_key));
                    });
                });
            }).then([index_file] () mutable {
                return index_file.close().handle_exception([] (auto ep) {
                    sstlog.warn("sstable close index_file failed: {}", ep);
                    general_disk_error();
                });
            });
        });
    });
}
|
|
|
|
uint64_t sstable::data_size() const {
|
|
if (has_component(sstable::component_type::CompressionInfo)) {
|
|
return _compression.data_len;
|
|
}
|
|
return _data_file_size;
|
|
}
|
|
|
|
// Returns the stored on-disk size of the sstable. Asserts that the stored
// value is greater than zero.
uint64_t sstable::bytes_on_disk() {
    assert(_bytes_on_disk > 0);

    return _bytes_on_disk;
}
|
|
|
|
// Tells whether component `f` is in this sstable's set of components.
const bool sstable::has_component(component_type f) const {
    return _components.count(f);
}
|
|
|
|
// Returns the file name of component `f` of this sstable, built from the
// sstable's own directory, keyspace, table, version, generation and format.
const sstring sstable::filename(component_type f) const {
    return filename(_dir, _ks, _cf, _version, _generation, _format, f);
}
|
|
|
|
std::vector<sstring> sstable::component_filenames() const {
|
|
std::vector<sstring> res;
|
|
for (auto c : _component_map | boost::adaptors::map_keys) {
|
|
if (has_component(c)) {
|
|
res.emplace_back(filename(c));
|
|
}
|
|
}
|
|
return res;
|
|
}
|
|
|
|
// Returns the file name of this sstable's TOC component.
sstring sstable::toc_filename() const {
    return filename(component_type::TOC);
}
|
|
|
|
// Builds the full path of an sstable component file.
//
// The file name layout depends on the sstable version:
//   ka: <ks>-<cf>-<version>-<generation>-<component>
//   la: <version>-<generation>-<format>-<component>
// and the result is "<dir>/<file name>".
//
// Throws std::out_of_range if `version` has no known naming scheme, or if
// the version, format or component has no string mapping.
const sstring sstable::filename(sstring dir, sstring ks, sstring cf, version_types version, int64_t generation,
                                format_types format, component_type component) {

    // The table is const and is looked up with at(): operator[] would insert
    // an empty std::function for an unmapped version, mutating a
    // function-static object that is not thread_local, and then fail with
    // std::bad_function_call.
    static const std::unordered_map<version_types, std::function<sstring (entry_descriptor d)>, enum_hash<version_types>> strmap = {
        { sstable::version_types::ka, [] (entry_descriptor d) {
            return d.ks + "-" + d.cf + "-" + _version_string.at(d.version) + "-" + to_sstring(d.generation) + "-" + _component_map.at(d.component); }
        },
        { sstable::version_types::la, [] (entry_descriptor d) {
            return _version_string.at(d.version) + "-" + to_sstring(d.generation) + "-" + _format_string.at(d.format) + "-" + _component_map.at(d.component); }
        }
    };

    return dir + "/" + strmap.at(version)(entry_descriptor(ks, cf, version, generation, format, component));
}
|
|
|
|
// Hard-links all components of this sstable into `dir` under `generation`.
//
// The TOC is linked first, under the TemporaryTOC name; then every other
// component is linked; finally the temporary TOC is renamed to the real TOC.
// The directory is synced after each of these three steps.
future<> sstable::create_links(sstring dir, int64_t generation) const {
    // TemporaryTOC is always first, TOC is always last
    auto temp_toc_link = sstable::filename(dir, _ks, _cf, _version, generation, _format, component_type::TemporaryTOC);
    return sstable_write_io_check(::link_file, filename(component_type::TOC), temp_toc_link).then([dir] {
        return sstable_write_io_check(sync_directory, dir);
    }).then([this, dir, generation] {
        // FIXME: Should clean already-created links if we failed midway.
        return parallel_for_each(_components, [this, dir, generation] (auto comp) {
            if (comp == component_type::TOC) {
                // The TOC was already linked, under its temporary name.
                return make_ready_future<>();
            }
            auto link_name = sstable::filename(dir, _ks, _cf, _version, generation, _format, comp);
            return sstable_write_io_check(::link_file, this->filename(comp), link_name);
        });
    }).then([dir] {
        return sstable_write_io_check(sync_directory, dir);
    }).then([dir, this, generation] {
        auto temp_toc = sstable::filename(dir, _ks, _cf, _version, generation, _format, component_type::TemporaryTOC);
        auto final_toc = sstable::filename(dir, _ks, _cf, _version, generation, _format, component_type::TOC);
        return sstable_write_io_check([&] {
            return engine().rename_file(temp_toc, final_toc);
        });
    }).then([dir] {
        return sstable_write_io_check(sync_directory, dir);
    });
}
|
|
|
|
// Moves the sstable to a new generation number within its directory.
//
// Links are created under the new generation first. The old TOC is then
// removed, followed by the remaining old components, with directory syncs in
// between, and only at the end is _generation updated.
future<> sstable::set_generation(int64_t new_generation) {
    return create_links(_dir, new_generation).then([this] {
        // The old TOC goes first, before any other old component.
        return remove_file(filename(component_type::TOC)).then([this] {
            return sstable_write_io_check(sync_directory, _dir);
        }).then([this] {
            return parallel_for_each(_components, [this] (auto comp) {
                if (comp == component_type::TOC) {
                    // Already removed above.
                    return make_ready_future<>();
                }
                return remove_file(this->filename(comp));
            });
        });
    }).then([this, new_generation] {
        return sync_directory(_dir).then([this, new_generation] {
            _generation = new_generation;
        });
    });
}
|
|
|
|
// Parses an sstable component file name into an entry_descriptor.
//
// Two naming schemes are recognized:
//   la: la-<generation>-<format>-<component>
//       (keyspace and table are not part of the name and stay empty)
//   ka: <keyspace>-<table>-ka-<generation>-<component>
//       (the format is taken to be "big")
//
// Throws malformed_sstable_exception when the name matches neither scheme.
entry_descriptor entry_descriptor::make_descriptor(sstring fname) {
    static std::regex la("la-(\\d+)-(\\w+)-(.*)");
    static std::regex ka("(\\w+)-(\\w+)-ka-(\\d+)-(.*)");

    std::smatch match;

    sstable::version_types version;

    sstring generation;
    sstring format;
    sstring component;
    // Stay empty for the la scheme. The original code re-declared both inside
    // the la branch, which only shadowed these and had no effect.
    sstring ks;
    sstring cf;

    std::string s(fname);
    if (std::regex_match(s, match, la)) {
        version = sstable::version_types::la;
        generation = match[1].str();
        format = sstring(match[2].str());
        component = sstring(match[3].str());
    } else if (std::regex_match(s, match, ka)) {
        ks = match[1].str();
        cf = match[2].str();
        version = sstable::version_types::ka;
        format = sstring("big");
        generation = match[3].str();
        component = sstring(match[4].str());
    } else {
        throw malformed_sstable_exception(sprint("invalid version for file %s. Name doesn't match any known version.", fname));
    }
    return entry_descriptor(ks, cf, version, boost::lexical_cast<unsigned long>(generation), sstable::format_from_sstring(format), sstable::component_from_sstring(component));
}
|
|
|
|
// Maps a version string back to its version_types value by a reverse lookup
// in _version_string.
sstable::version_types sstable::version_from_sstring(sstring &s) {
    return reverse_map(s, _version_string);
}
|
|
|
|
// Maps a format string back to its format_types value by a reverse lookup
// in _format_string.
sstable::format_types sstable::format_from_sstring(sstring &s) {
    return reverse_map(s, _format_string);
}
|
|
|
|
// Maps a component string back to its component_type value by a reverse
// lookup in _component_map.
sstable::component_type sstable::component_from_sstring(sstring &s) {
    return reverse_map(s, _component_map);
}
|
|
|
|
// Creates an input stream over `len` bytes of the data file starting at
// `pos`, using a read-ahead of 4. The stream goes through the compressed
// reader when compression metadata is present, and is a plain file stream
// otherwise.
input_stream<char> sstable::data_stream(uint64_t pos, size_t len, const io_priority_class& pc) {
    file_input_stream_options options;
    options.buffer_size = sstable_buffer_size;
    options.io_priority_class = pc;
    options.read_ahead = 4;

    if (!_compression) {
        return make_file_input_stream(_data_file, pos, len, std::move(options));
    }
    return make_compressed_file_input_stream(_data_file, &_compression,
                                             pos, len, std::move(options));
}
|
|
|
|
// Reads exactly `len` bytes of data starting at `pos`. The stream used for
// reading is closed whether or not the read succeeds.
future<temporary_buffer<char>> sstable::data_read(uint64_t pos, size_t len, const io_priority_class& pc) {
    return do_with(data_stream(pos, len, pc), [len] (auto& in) {
        return in.read_exactly(len).finally([&in] {
            return in.close();
        });
    });
}
|
|
|
|
// Returns the first partition key recorded in the summary.
// Throws std::runtime_error when the summary holds no first key.
partition_key
sstable::get_first_partition_key(const schema& s) const {
    const auto& first = _summary.first_key.value;
    if (first.empty()) {
        throw std::runtime_error("first key of summary is empty");
    }
    return key::from_bytes(first).to_partition_key(s);
}
|
|
|
|
// Returns the last partition key recorded in the summary.
// Throws std::runtime_error when the summary holds no last key.
partition_key
sstable::get_last_partition_key(const schema& s) const {
    const auto& last = _summary.last_key.value;
    if (last.empty()) {
        throw std::runtime_error("last key of summary is empty");
    }
    return key::from_bytes(last).to_partition_key(s);
}
|
|
|
|
// Returns the first partition key of the sstable, decorated by the global
// partitioner.
dht::decorated_key sstable::get_first_decorated_key(const schema& s) const {
    // FIXME: the decorated key is regenerated on every call; it could be
    // stored in the sstable object instead. The same applies to last().
    auto first_pk = get_first_partition_key(s);
    return dht::global_partitioner().decorate_key(s, std::move(first_pk));
}
|
|
|
|
// Returns the last partition key of the sstable, decorated by the global
// partitioner.
dht::decorated_key sstable::get_last_decorated_key(const schema& s) const {
    auto last_pk = get_last_partition_key(s);
    return dht::global_partitioner().decorate_key(s, std::move(last_pk));
}
|
|
|
|
int sstable::compare_by_first_key(const schema& s, const sstable& other) const {
|
|
return get_first_decorated_key(s).tri_compare(s, other.get_first_decorated_key(s));
|
|
}
|
|
|
|
double sstable::get_compression_ratio() const {
|
|
if (this->has_component(sstable::component_type::CompressionInfo)) {
|
|
return (double) _compression.compressed_file_length() / _compression.uncompressed_file_length();
|
|
} else {
|
|
return metadata_collector::NO_COMPRESSION_RATIO;
|
|
}
|
|
}
|
|
|
|
void sstable::set_sstable_level(uint32_t new_level) {
|
|
auto entry = _statistics.contents.find(metadata_type::Stats);
|
|
if (entry == _statistics.contents.end()) {
|
|
return;
|
|
}
|
|
auto& p = entry->second;
|
|
if (!p) {
|
|
throw std::runtime_error("Statistics is malformed");
|
|
}
|
|
stats_metadata& s = *static_cast<stats_metadata *>(p.get());
|
|
sstlog.debug("set level of {} with generation {} from {} to {}", get_filename(), _generation, s.sstable_level, new_level);
|
|
s.sstable_level = new_level;
|
|
}
|
|
|
|
// Changes the sstable level and rewrites the Statistics component.
//
// Resolves immediately when the sstable has no Statistics component, has no
// Stats metadata entry, or is already at `new_level`.
// Throws std::runtime_error when the Stats entry exists but is null.
future<> sstable::mutate_sstable_level(uint32_t new_level) {
    if (!has_component(component_type::Statistics)) {
        return make_ready_future<>();
    }

    auto it = _statistics.contents.find(metadata_type::Stats);
    if (it == _statistics.contents.end()) {
        return make_ready_future<>();
    }

    auto& metadata = it->second;
    if (!metadata) {
        throw std::runtime_error("Statistics is malformed");
    }

    stats_metadata& stats = *static_cast<stats_metadata *>(metadata.get());
    if (stats.sstable_level == new_level) {
        return make_ready_future<>();
    }

    stats.sstable_level = new_level;
    // Strictly speaking the whole file need not be written again. However,
    // the assumption that we always write sequentially is a powerful one,
    // and this case does not merit an exception.
    return seastar::async([this] {
        // This is outside the standard memtable flush path, but a dedicated
        // priority class just for it is not justified. It is used by the
        // snapshot/restore mechanism - which is mostly hard link creation
        // with this operation at the end - and (eventually) by some
        // compaction strategy. Neither is high priority enough, so the
        // default priority class is used.
        rewrite_statistics(default_priority_class());
    });
}
|
|
|
|
int sstable::compare_by_max_timestamp(const sstable& other) const {
|
|
auto ts1 = get_stats_metadata().max_timestamp;
|
|
auto ts2 = other.get_stats_metadata().max_timestamp;
|
|
return (ts1 > ts2 ? 1 : (ts1 == ts2 ? 0 : -1));
|
|
}
|
|
|
|
// Closes the index and data files if they are still open and, when the
// sstable was marked for deletion, starts deleting its files from disk.
//
// The futures started here are not waited for. Each close failure handler
// holds a copy of the file plus a background_jobs() operation; the deletion
// failure handler holds a background_jobs() operation.
sstable::~sstable() {
    if (_index_file) {
        _index_file.close().handle_exception([save = _index_file, op = background_jobs().start()] (auto ep) {
            sstlog.warn("sstable close index_file failed: {}", ep);
            general_disk_error();
        });
    }
    if (_data_file) {
        _data_file.close().handle_exception([save = _data_file, op = background_jobs().start()] (auto ep) {
            sstlog.warn("sstable close data_file failed: {}", ep);
            general_disk_error();
        });
    }

    if (!_marked_for_deletion) {
        return;
    }

    // The on-disk files of this table have to go. Being in a destructor we
    // can neither wait for completion nor report errors, so this is strictly
    // best effort. A failed deletion is logged and otherwise ignored: unused
    // sstables are cleaned up again on startup, and the same generation
    // number is never reused anyway.
    try {
        delete_atomically({sstable_to_delete(filename(component_type::TOC), _shared)}).handle_exception(
                    [op = background_jobs().start()] (std::exception_ptr eptr) {
                        try {
                            std::rethrow_exception(eptr);
                        } catch (atomic_deletion_cancelled&) {
                            sstlog.debug("Exception when deleting sstable file: {}", eptr);
                        } catch (...) {
                            sstlog.warn("Exception when deleting sstable file: {}", eptr);
                        }
                    });
    } catch (...) {
        sstlog.warn("Exception when deleting sstable file: {}", std::current_exception());
    }
}
|
|
|
|
// Returns the parent directory of the canonical form of path `fname`.
sstring
dirname(sstring fname) {
    const auto canonical_path = boost::filesystem::canonical(std::string(fname));
    return canonical_path.parent_path().string();
}
|
|
|
|
// Flushes the directory containing `fname` to stable storage.
//
// The directory is opened, flushed, and then explicitly closed. The close is
// chained with finally() so it also runs when the flush fails, instead of
// leaving the handle to be released whenever the file object is destroyed.
future<>
fsync_directory(sstring fname) {
    return sstable_write_io_check([&] {
        return open_checked_directory(sstable_write_error, dirname(fname)).then([] (file f) {
            // do_with keeps the file alive until flush and close complete.
            return do_with(std::move(f), [] (file& f) {
                return f.flush().finally([&f] {
                    return f.close();
                });
            });
        });
    });
}
|
|
|
|
// Deletes all components of the sstable identified by its TOC file name.
//
// Runs in a seastar thread. If the TOC exists it is first renamed to the
// temporary TOC name and the directory is synced. The temporary TOC is then
// read, each component it lists is removed (a missing file is tolerated),
// the directory is synced and the temporary TOC itself is removed last.
future<>
remove_by_toc_name(sstring sstable_toc_name) {
    return seastar::async([sstable_toc_name] {
        sstring prefix = sstable_toc_name.substr(0, sstable_toc_name.size() - TOC_SUFFIX.size());
        auto temp_toc_name = prefix + TEMPORARY_TOC_SUFFIX;
        sstring dir;

        if (sstable_write_io_check(file_exists, sstable_toc_name).get0()) {
            dir = dirname(sstable_toc_name);
            sstable_write_io_check(rename_file, sstable_toc_name, temp_toc_name).get();
            sstable_write_io_check(fsync_directory, dir).get();
        } else {
            dir = dirname(temp_toc_name);
        }

        // Read the list of components from the (now temporary) TOC.
        auto toc_file = open_checked_file_dma(sstable_read_error, temp_toc_name, open_flags::ro).get0();
        auto in = make_file_input_stream(toc_file);
        auto size = toc_file.size().get0();
        auto text = in.read_exactly(size).get0();
        in.close().get();

        std::vector<sstring> components;
        sstring all(text.begin(), text.end());
        boost::split(components, all, boost::is_any_of("\n"));

        parallel_for_each(components, [prefix] (sstring component) {
            if (component.empty()) {
                // eof
                return make_ready_future<>();
            }
            if (component == TOC_SUFFIX) {
                // already deleted
                return make_ready_future<>();
            }
            auto fname = prefix + component;
            return sstable_write_io_check(remove_file, prefix + component).then_wrapped([fname = std::move(fname)] (future<> f) {
                // ENOENT is forgiven: the component may never have been written.
                try {
                    f.get();
                } catch (std::system_error& e) {
                    if (!is_system_error_errno(ENOENT)) {
                        throw;
                    }
                    sstlog.debug("Forgiving ENOENT when deleting file {}", fname);
                }
                return make_ready_future<>();
            });
        }).get();

        sstable_write_io_check(fsync_directory, dir).get();
        sstable_write_io_check(remove_file, temp_toc_name).get();
    });
}
|
|
|
|
// Marks the sstable for deletion and renames its TOC on disk to the
// temporary TOC name.
//
// The rename is submitted to a shard chosen by hashing the TOC name. That
// shard keeps a thread-local set of names being renamed and skips a name
// that is already in progress.
future<>
sstable::mark_for_deletion_on_disk() {
    mark_for_deletion();

    auto toc_name = filename(component_type::TOC);
    auto shard = std::hash<sstring>()(toc_name) % smp::count;

    return smp::submit_to(shard, [toc_name] {
        static thread_local std::unordered_set<sstring> renaming;

        if (renaming.count(toc_name) > 0) {
            return make_ready_future<>();
        }

        renaming.emplace(toc_name);

        return seastar::async([toc_name] {
            if (!sstable_write_io_check(file_exists, toc_name).get0()) {
                return; // already gone
            }

            auto dir = dirname(toc_name);
            // NOTE(review): the opened file is not used below - confirm
            // whether opening it here is intentional.
            auto toc_file = open_checked_file_dma(sstable_read_error, toc_name, open_flags::ro).get0();
            sstring prefix = toc_name.substr(0, toc_name.size() - TOC_SUFFIX.size());
            auto temp_toc_name = prefix + TEMPORARY_TOC_SUFFIX;
            sstable_write_io_check(rename_file, toc_name, temp_toc_name).get();
            sstable_write_io_check(fsync_directory, dir).get();
        }).finally([toc_name] {
            renaming.erase(toc_name);
        });
    });
}
|
|
|
|
// Removes every component of an sstable that was left with a temporary TOC.
//
// Runs in a seastar thread. Asserts that the regular TOC is absent and the
// temporary TOC is present. All other existing components are removed first;
// the temporary TOC is removed last, with a directory sync before and after.
future<>
sstable::remove_sstable_with_temp_toc(sstring ks, sstring cf, sstring dir, int64_t generation, version_types v, format_types f) {
    return seastar::async([ks, cf, dir, generation, v, f] {
        const auto toc_path = filename(dir, ks, cf, v, generation, f, component_type::TOC);
        auto toc = sstable_write_io_check(file_exists, toc_path).get0();
        // An sstable with a temporary TOC must not have a TOC.
        assert(toc == false);

        const auto temp_toc_path = filename(dir, ks, cf, v, generation, f, component_type::TemporaryTOC);
        auto tmptoc = sstable_write_io_check(file_exists, temp_toc_path).get0();
        // The temporary TOC must exist for this sstable.
        assert(tmptoc == true);

        sstlog.warn("Deleting components of sstable from {}.{} of generation {} that has a temporary TOC", ks, cf, generation);

        for (auto& entry : sstable::_component_map) {
            // TemporaryTOC is skipped here since it has to be the last
            // component deleted, and an unordered map gives no ordering
            // guarantee. A power failure may leave a partial delete behind;
            // if TemporaryTOC were deleted prematurely and scylla crashed,
            // the next boot would fail because that generation would be
            // missing a TOC.
            if (entry.first == component_type::TemporaryTOC) {
                continue;
            }

            auto file_path = filename(dir, ks, cf, v, generation, f, entry.first);
            // Components that do not exist are skipped.
            if (!sstable_write_io_check(file_exists, file_path).get0()) {
                continue;
            }
            sstable_write_io_check(remove_file, file_path).get();
        }
        sstable_write_io_check(fsync_directory, dir).get();
        // Remove the temporary TOC itself.
        sstable_write_io_check(remove_file, filename(dir, ks, cf, v, generation, f, component_type::TemporaryTOC)).get();
        // Sync the column family dir to guarantee that deletion completed.
        sstable_write_io_check(fsync_directory, dir).get();
    });
}
|
|
|
|
// Reads the summary and returns the range spanning the sstable's first and
// last partition keys. `s` is captured by reference in the continuation.
future<range<partition_key>>
sstable::get_sstable_key_range(const schema& s) {
    return read_summary(default_priority_class()).then([this, &s] () mutable {
        auto first_pk = get_first_partition_key(s);
        auto last_pk = get_last_partition_key(s);
        return make_ready_future<range<partition_key>>(range<partition_key>::make(first_pk, last_pk));
    });
}
|
|
|
|
// Builds a local sstable object for the given identity and marks it for
// deletion; the local object is destroyed when this function returns.
void sstable::mark_sstable_for_deletion(sstring ks, sstring cf, sstring dir, int64_t generation, version_types v, format_types f) {
    auto doomed = sstable(ks, cf, dir, generation, v, f);
    doomed.mark_for_deletion();
}
|
|
|
|
std::ostream&
|
|
operator<<(std::ostream& os, const sstable_to_delete& std) {
|
|
return os << std.name << "(" << (std.shared ? "shared" : "unshared") << ")";
|
|
}
|
|
|
|
using shards_agreeing_to_delete_sstable_type = std::unordered_set<shard_id>;
|
|
using sstables_to_delete_atomically_type = std::set<sstring>;
|
|
struct pending_deletion {
|
|
sstables_to_delete_atomically_type names;
|
|
std::vector<lw_shared_ptr<promise<>>> completions;
|
|
};
|
|
|
|
static thread_local bool g_atomic_deletions_cancelled = false;
|
|
static thread_local std::list<lw_shared_ptr<pending_deletion>> g_atomic_deletion_sets;
|
|
static thread_local std::unordered_map<sstring, shards_agreeing_to_delete_sstable_type> g_shards_agreeing_to_delete_sstable;
|
|
|
|
static logging::logger deletion_logger("sstable-deletion");
|
|
|
|
static
|
|
future<>
|
|
do_delete_atomically(std::vector<sstable_to_delete> atomic_deletion_set, unsigned deleting_shard) {
|
|
// runs on shard 0 only
|
|
deletion_logger.debug("shard {} atomically deleting {}", deleting_shard, atomic_deletion_set);
|
|
|
|
if (g_atomic_deletions_cancelled) {
|
|
deletion_logger.debug("atomic deletions disabled, erroring out");
|
|
using boost::adaptors::transformed;
|
|
throw atomic_deletion_cancelled(atomic_deletion_set
|
|
| transformed(std::mem_fn(&sstable_to_delete::name)));
|
|
}
|
|
|
|
// Insert atomic_deletion_set into the list of sets pending deletion. If the new set
|
|
// overlaps with an existing set, merge them (the merged set will be deleted atomically).
|
|
std::list<lw_shared_ptr<pending_deletion>> new_atomic_deletion_sets;
|
|
auto merged_set = make_lw_shared(pending_deletion());
|
|
for (auto&& sst_to_delete : atomic_deletion_set) {
|
|
merged_set->names.insert(sst_to_delete.name);
|
|
if (!sst_to_delete.shared) {
|
|
for (auto shard : boost::irange<shard_id>(0, smp::count)) {
|
|
g_shards_agreeing_to_delete_sstable[sst_to_delete.name].insert(shard);
|
|
}
|
|
}
|
|
}
|
|
merged_set->completions.push_back(make_lw_shared<promise<>>());
|
|
auto ret = merged_set->completions.back()->get_future();
|
|
for (auto&& old_set : g_atomic_deletion_sets) {
|
|
auto intersection = sstables_to_delete_atomically_type();
|
|
boost::set_intersection(merged_set->names, old_set->names, std::inserter(intersection, intersection.end()));
|
|
if (intersection.empty()) {
|
|
// We copy old_set to avoid corrupting g_atomic_deletion_sets if we fail
|
|
// further on.
|
|
new_atomic_deletion_sets.push_back(old_set);
|
|
} else {
|
|
deletion_logger.debug("merging with {}", old_set->names);
|
|
merged_set->names.insert(old_set->names.begin(), old_set->names.end());
|
|
boost::push_back(merged_set->completions, old_set->completions);
|
|
}
|
|
}
|
|
deletion_logger.debug("new atomic set: {}", merged_set->names);
|
|
new_atomic_deletion_sets.push_back(merged_set);
|
|
// can now exception-safely commit:
|
|
g_atomic_deletion_sets = std::move(new_atomic_deletion_sets);
|
|
|
|
// Mark each sstable as being deleted from deleting_shard. We have to do
|
|
// this in a separate pass, so the consideration whether we can delete or not
|
|
// sees all the data from this pass.
|
|
for (auto&& sst : atomic_deletion_set) {
|
|
g_shards_agreeing_to_delete_sstable[sst.name].insert(deleting_shard);
|
|
}
|
|
|
|
// Figure out if the (possibly merged) set can be deleted
|
|
for (auto&& sst : merged_set->names) {
|
|
if (g_shards_agreeing_to_delete_sstable[sst].size() != smp::count) {
|
|
// Not everyone agrees, leave the set pending
|
|
deletion_logger.debug("deferring deletion until all shards agree");
|
|
return ret;
|
|
}
|
|
}
|
|
|
|
// Cannot recover from a failed deletion
|
|
g_atomic_deletion_sets.pop_back();
|
|
for (auto&& name : merged_set->names) {
|
|
g_shards_agreeing_to_delete_sstable.erase(name);
|
|
}
|
|
|
|
// Everyone agrees, let's delete
|
|
// FIXME: this needs to be done atomically (using a log file of sstables we intend to delete)
|
|
parallel_for_each(merged_set->names, [] (sstring name) {
|
|
deletion_logger.debug("deleting {}", name);
|
|
return remove_by_toc_name(name);
|
|
}).then_wrapped([merged_set] (future<> result) {
|
|
deletion_logger.debug("atomic deletion completed: {}", merged_set->names);
|
|
shared_future<> sf(std::move(result));
|
|
for (auto&& comp : merged_set->completions) {
|
|
sf.get_future().forward_to(std::move(*comp));
|
|
}
|
|
});
|
|
|
|
return ret;
|
|
}
|
|
|
|
// Requests atomic deletion of `ssts` on behalf of the calling shard. The
// request is submitted to shard 0, where do_delete_atomically() handles it.
future<>
delete_atomically(std::vector<sstable_to_delete> ssts) {
    auto requesting_shard = engine().cpu_id();
    return smp::submit_to(0, [=] {
        return do_delete_atomically(ssts, requesting_shard);
    });
}
|
|
|
|
// Convenience overload: converts each shared_sstable into an
// sstable_to_delete (TOC file name plus shared flag) and forwards the list
// to the sstable_to_delete overload.
future<>
delete_atomically(std::vector<shared_sstable> ssts) {
    std::vector<sstable_to_delete> to_delete;
    for (auto&& sst : ssts) {
        to_delete.push_back({sst->toc_filename(), sst->is_shared()});
    }
    return delete_atomically(std::move(to_delete));
}
|
|
|
|
void
|
|
cancel_atomic_deletions() {
|
|
g_atomic_deletions_cancelled = true;
|
|
for (auto&& pd : g_atomic_deletion_sets) {
|
|
for (auto&& c : pd->completions) {
|
|
c->set_exception(atomic_deletion_cancelled(pd->names));
|
|
}
|
|
}
|
|
g_atomic_deletion_sets.clear();
|
|
g_shards_agreeing_to_delete_sstable.clear();
|
|
}
|
|
|
|
// Builds the exception message from the names of the sstables that will not
// be deleted.
atomic_deletion_cancelled::atomic_deletion_cancelled(std::vector<sstring> names)
        : _msg(sprint("atomic deletions cancelled; not deleting %s", names))
{
}
|
|
|
|
const char*
|
|
atomic_deletion_cancelled::what() const noexcept {
|
|
return _msg.c_str();
|
|
}
|
|
|
|
}
|