mirror of
https://github.com/scylladb/scylladb.git
synced 2026-04-25 19:10:42 +00:00
For efficiency, the cardinality of the bloom filter (i.e. the number of partition keys which will be written into the sstable) has to be known before elements are inserted into the filter. In some cases (e.g. memtable flush) this number is known exactly. But in others (e.g. repair) it can only be estimated, and the estimation might be very wrong, leading to an oversized filter. Because of that, some time ago we added a piece of logic (run after the sstable is written, but before it's sealed) which looks at the actual number of written partitions, compares it to the initial estimate (on which the size of the bloom filter was based), and if the difference is unacceptably large, it rewrites the bloom filter from partition keys contained in Index.db. But the idea to rebuild the bloom filters from index files isn't going to work with BTI indexes, because they don't store whole partition keys. If we want sstables which don't have Index.db files, we need some other way to deal with oversized filters. Partition keys can be recovered from Data.db, but that would often be way too expensive. This patch adds another way. We introduce a new component file, TemporaryHashes. This component, if written at all, contains the 16-byte murmur hash for every partition key, in order, and can be used in place of Index to reconstruct the bloom filter. (Our bloom filters are actually built from the set of murmur hashes of partition keys. The first step of inserting a partition key into a filter is hashing the key. Remembering the hashes is sufficient to build the filter later, without looking at partition keys again.) As of this patch, if the Index component is not being written, we don't allocate and populate a bloom filter during the Data.db write. Instead, we write the murmur hashes to TemporaryHashes, and only later, after the Data write finishes, we allocate the optimal-size bloom filter, we read the hashes back from TemporaryHashes, and we populate the filter with them.
That is suboptimal. Writing the hashes to disk (or worse, to S3) and reading them back is more expensive than building the bloom filter during the main Data pass. So ideally it should be avoided in cases where we know in advance that the partition key count estimate is good enough. (Which should be the case in flushes and compactions). But we defer that to a future patch. (Such a change would involve passing a flag to the sstable writer indicating whether the cardinality estimate is trustworthy, and not creating TemporaryHashes when it is).
131 lines
5.1 KiB
C++
131 lines
5.1 KiB
C++
/*
 * Copyright (C) 2015-present ScyllaDB
 *
 */

/*
 * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
 */
|
|
|
|
#pragma once
|
|
|
|
#include "utils/assert.hh"
|
|
#include <filesystem>
|
|
|
|
#include <seastar/core/file.hh>
|
|
#include <seastar/core/fstream.hh>
|
|
#include <seastar/core/future.hh>
|
|
#include <seastar/core/reactor.hh>
|
|
|
|
#include "data_dictionary/storage_options.hh"
|
|
#include "seastarx.hh"
|
|
#include "sstables/shared_sstable.hh"
|
|
#include "sstables/component_type.hh"
|
|
#include "sstables/generation_type.hh"
|
|
#include "utils/disk-error-handler.hh"
|
|
|
|
// Forward declarations of types defined elsewhere.
class schema;

namespace data_dictionary {
class storage_options;
}

namespace db { class config; }
|
|
|
|
namespace sstables {
|
|
|
|
// Forward declarations of the sstables types referenced by the declarations below.
enum class sstable_state;
class delayed_commit_changes;
class sstable;
class sstables_manager;
class entry_descriptor;
|
|
|
|
struct atomic_delete_context {
|
|
sstring pending_delete_log;
|
|
std::unordered_set<sstring> prefixes;
|
|
};
|
|
|
|
class opened_directory final {
|
|
std::filesystem::path _pathname;
|
|
file _file;
|
|
|
|
public:
|
|
explicit opened_directory(std::filesystem::path pathname) : _pathname(std::move(pathname)) {};
|
|
explicit opened_directory(const sstring &dir) : _pathname(std::string_view(dir)) {};
|
|
opened_directory(const opened_directory&) = delete;
|
|
opened_directory& operator=(const opened_directory&) = delete;
|
|
opened_directory(opened_directory&&) = default;
|
|
opened_directory& operator=(opened_directory&&) = default;
|
|
~opened_directory() = default;
|
|
|
|
const std::filesystem::path::string_type& native() const noexcept {
|
|
return _pathname.native();
|
|
}
|
|
|
|
const std::filesystem::path& path() const noexcept {
|
|
return _pathname;
|
|
}
|
|
|
|
future<> sync(io_error_handler error_handler) {
|
|
if (!_file) {
|
|
_file = co_await do_io_check(error_handler, open_directory, _pathname.native());
|
|
}
|
|
co_await do_io_check(error_handler, std::mem_fn(&file::flush), _file);
|
|
};
|
|
|
|
future<> close() {
|
|
return _file ? _file.close() : make_ready_future<>();
|
|
}
|
|
};
|
|
|
|
class storage {
|
|
friend class test;
|
|
|
|
// Internal, but can also be used by tests
|
|
virtual future<> change_dir_for_test(sstring nd) {
|
|
SCYLLA_ASSERT(false && "Changing directory not implemented");
|
|
}
|
|
virtual future<> create_links(const sstable& sst, const std::filesystem::path& dir) const {
|
|
SCYLLA_ASSERT(false && "Direct links creation not implemented");
|
|
}
|
|
virtual future<> move(const sstable& sst, sstring new_dir, generation_type generation, delayed_commit_changes* delay) {
|
|
SCYLLA_ASSERT(false && "Direct move not implemented");
|
|
}
|
|
|
|
public:
|
|
virtual ~storage() {}
|
|
|
|
using absolute_path = bool_class<class absolute_path_tag>; // FIXME -- should go away eventually
|
|
using sync_dir = bool_class<struct sync_dir_tag>; // meaningful only to filesystem storage
|
|
|
|
virtual future<> seal(const sstable& sst) = 0;
|
|
virtual future<> snapshot(const sstable& sst, sstring dir, absolute_path abs, std::optional<generation_type> gen = {}) const = 0;
|
|
virtual future<> change_state(const sstable& sst, sstable_state to, generation_type generation, delayed_commit_changes* delay) = 0;
|
|
// runs in async context
|
|
virtual void open(sstable& sst) = 0;
|
|
virtual future<> wipe(const sstable& sst, sync_dir) noexcept = 0;
|
|
virtual future<file> open_component(const sstable& sst, component_type type, open_flags flags, file_open_options options, bool check_integrity) = 0;
|
|
virtual future<data_sink> make_data_or_index_sink(sstable& sst, component_type type) = 0;
|
|
virtual future<data_source> make_data_or_index_source(sstable& sst, component_type type, file f, uint64_t offset, uint64_t len, file_input_stream_options opt) const = 0;
|
|
virtual future<data_sink> make_component_sink(sstable& sst, component_type type, open_flags oflags, file_output_stream_options options) = 0;
|
|
virtual future<> destroy(const sstable& sst) = 0;
|
|
virtual future<atomic_delete_context> atomic_delete_prepare(const std::vector<shared_sstable>&) const = 0;
|
|
virtual future<> atomic_delete_complete(atomic_delete_context ctx) const = 0;
|
|
virtual future<> remove_by_registry_entry(entry_descriptor desc) = 0;
|
|
// Free space available in the underlying storage.
|
|
virtual future<uint64_t> free_space() const = 0;
|
|
virtual future<> unlink_component(const sstable& sst, component_type) noexcept = 0;
|
|
|
|
virtual sstring prefix() const = 0;
|
|
};
|
|
|
|
std::unique_ptr<sstables::storage> make_storage(sstables_manager& manager, const data_dictionary::storage_options& s_opts, sstable_state state);
|
|
future<lw_shared_ptr<const data_dictionary::storage_options>> init_table_storage(const sstables_manager&, const schema&, const data_dictionary::storage_options& so);
|
|
future<> destroy_table_storage(const data_dictionary::storage_options& so);
|
|
future<> init_keyspace_storage(const sstables_manager&, const data_dictionary::storage_options& so, sstring ks_name);
|
|
|
|
std::vector<std::filesystem::path> get_local_directories(const db::config& db, const data_dictionary::storage_options::local& so);
|
|
|
|
using data_source_creator_fn = std::function<data_source(uint64_t, uint64_t)>;
|
|
} // namespace sstables
|