/* * Copyright (C) 2016-present ScyllaDB */ /* * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0 */ #define ZSTD_STATIC_LINKING_ONLY #include #include #include #include #include #include #include #include #include #include #include "utils/reusable_buffer.hh" #include "sstables/compress.hh" #include "sstables/exceptions.hh" #include "utils/hashers.hh" #include "sstables/sstable_compressor_factory.hh" #include "compress.hh" #include "exceptions/exceptions.hh" #include "utils/class_registrator.hh" #include "gms/feature_service.hh" // SHA256 using dict_id = std::array; class dictionary_holder; static seastar::logger compressor_factory_logger("sstable_compressor_factory"); template <> struct fmt::formatter : fmt::formatter { auto format(const compression_parameters::algorithm& alg, fmt::format_context& ctx) const { return fmt::format_to(ctx.out(), "{}", compression_parameters::algorithm_to_name(alg)); } }; // Holds a raw dictionary blob (without algorithm-specific hash tables). // raw dicts might be used (and kept alive) directly by compressors (in particular, lz4 decompressor) // or referenced by algorithm-specific dicts. class raw_dict : public enable_lw_shared_from_this { weak_ptr _owner; dict_id _id; std::vector _dict; public: raw_dict(dictionary_holder& owner, dict_id key, std::span dict); ~raw_dict(); const std::span raw() const { return _dict; } dict_id id() const { return _id; } }; // A custom allocator for zstd, so that we can track its memory usage. 
struct zstd_callback_allocator {
    // Invoked with the (signed) change in allocated bytes on every alloc/free.
    using callback_type = std::function;
    callback_type _callback;
    using self = zstd_callback_allocator;
    zstd_callback_allocator(callback_type cb) : _callback(std::move(cb)) {}
    // Not movable: as_zstd_custommem() hands out `this` as the opaque pointer.
    zstd_callback_allocator(self&&) = delete;
    // Builds a ZSTD_customMem whose hooks wrap malloc/free and report the
    // usable size of each block to `_callback`. Callable on lvalues only.
    ZSTD_customMem as_zstd_custommem() & {
        return ZSTD_customMem{
            .customAlloc = [] (void* opaque, size_t n) -> void* {
                auto addr = malloc(n);
                // Report what malloc actually reserved, not the requested size.
                static_cast(opaque)->_callback(static_cast(malloc_usable_size(addr)));
                return addr;
            },
            .customFree = [] (void* opaque, void* addr) {
                // Report the negated usable size before the block is released.
                static_cast(opaque)->_callback(-malloc_usable_size(addr));
                free(addr);
                return;
            },
            .opaque = static_cast(this),
        };
    }
};

// Holds a zstd-specific decompression dictionary
// (which internally holds a pointer to the raw dictionary blob
// and parsed entropy tables).
class zstd_ddict : public enable_lw_shared_from_this {
    weak_ptr _owner;
    lw_shared_ptr _raw;
    size_t _used_memory = 0;
    zstd_callback_allocator _alloc;
    std::unique_ptr _dict;
public:
    zstd_ddict(dictionary_holder& owner, lw_shared_ptr raw);
    ~zstd_ddict();
    // Non-owning pointer to the underlying zstd dictionary object.
    auto dict() const { return _dict.get(); }
    // Raw bytes of the dictionary this object was built from.
    auto raw() const { return _raw->raw(); }
    // Identifier of the underlying raw dictionary.
    dict_id id() const { return _raw->id(); }
};

// Holds a zstd-specific compression dictionary
// (which internally holds a pointer to the raw dictionary blob,
// indices over the blob, and entropy tables).
//
// Note that the index stored inside this dict is level-specific,
// so the level of compression is decided at the time of construction
// of this dict.
class zstd_cdict : public enable_lw_shared_from_this {
    weak_ptr _owner;
    lw_shared_ptr _raw;
    int _level;
    size_t _used_memory = 0;
    zstd_callback_allocator _alloc;
    std::unique_ptr _dict;
public:
    zstd_cdict(dictionary_holder& owner, lw_shared_ptr raw, int level);
    ~zstd_cdict();
    // Non-owning pointer to the underlying zstd dictionary object.
    auto dict() const { return _dict.get(); }
    // Raw bytes of the dictionary this object was built from.
    auto raw() const { return _raw->raw(); }
    // Identifier of the underlying raw dictionary.
    dict_id id() const {return _raw->id(); }
};

// Holds a lz4-specific compression dictionary
// (which internally holds a pointer to the raw dictionary blob,
// and a hash index over the substrings of the blob).
//
class lz4_cdict : public enable_lw_shared_from_this {
    weak_ptr _owner;
    lw_shared_ptr _raw;
    std::unique_ptr _dict;
public:
    lz4_cdict(dictionary_holder& owner, lw_shared_ptr raw);
    ~lz4_cdict();
    // Non-owning pointer to the underlying lz4 dictionary object.
    auto dict() const { return _dict.get(); }
    // Raw bytes of the dictionary this object was built from.
    auto raw() const { return _raw->raw(); }
    // Identifier of the underlying raw dictionary.
    auto id() const { return _raw->id(); }
};

// A lz4 compressor for SSTables.
//
// Compression and decompression dicts can be passed to it via the constructor,
// and they will be used for compression and decompression respectively.
//
// If only the decompression dict is passed, calling `compress()` is illegal.
// If only the compression dict is passed, calling `uncompress()` is illegal.
// If both dicts or none are passed, both `compress()` and `uncompress()` are legal.
//
// (The reason we want to allow passing only one dict is that we want to discard
// compression dicts after the SSTable is written. They are much bigger than decompression
// dicts, and they won't be useful anymore, so it makes sense to free them.)
class lz4_processor: public compressor {
public:
    using cdict_ptr = foreign_ptr>;
    using ddict_ptr = foreign_ptr>;
private:
    cdict_ptr _cdict;
    ddict_ptr _ddict;
    // Returns a lazily created, per-thread LZ4 stream that is reused across calls.
    static LZ4_stream_t* get_cctx() {
        static thread_local auto cctx = std::unique_ptr{LZ4_createStream(), LZ4_freeStream};
        return cctx.get();
    }
public:
    lz4_processor(cdict_ptr = nullptr, ddict_ptr = nullptr);
    // Legal if `_ddict || !_cdict`.
    size_t uncompress(const char* input, size_t input_len, char* output, size_t output_len) const override;
    // Legal if `_cdict || !_ddict`.
    size_t compress(const char* input, size_t input_len, char* output, size_t output_len) const override;
    size_t compress_max_size(size_t input_len) const override;
    std::map options() const override;
    algorithm get_algorithm() const override;
    std::optional get_dict_owner_for_test() const override;
};

// Snappy compressor for SSTables (no dictionary support).
class snappy_processor: public compressor {
public:
    size_t uncompress(const char* input, size_t input_len, char* output, size_t output_len) const override;
    size_t compress(const char* input, size_t input_len, char* output, size_t output_len) const override;
    size_t compress_max_size(size_t input_len) const override;
    algorithm get_algorithm() const override { return algorithm::snappy; }
};

// Deflate (zlib) compressor for SSTables (no dictionary support).
class deflate_processor: public compressor {
public:
    size_t uncompress(const char* input, size_t input_len, char* output, size_t output_len) const override;
    size_t compress(const char* input, size_t input_len, char* output, size_t output_len) const override;
    size_t compress_max_size(size_t input_len) const override;
    algorithm get_algorithm() const override { return algorithm::deflate; }
};

// Name of the option that carries the zstd compression level.
static const sstring COMPRESSION_LEVEL = "compression_level";

// A Zstd compressor for SSTables.
//
// Compression and decompression dicts can be passed to it via the constructor,
// and they will be used for compression and decompression respectively.
//
// If only the decompression dict is passed, calling `compress()` is illegal.
// If only the compression dict is passed, calling `uncompress()` is illegal.
// If both dicts or none are passed, both `compress()` and `uncompress()` are legal.
//
// (The reason we want to allow passing only one dict is that we want to discard
// compression dicts after the SSTable is written. They are much bigger than decompression
// dicts, and they won't be useful anymore, so it makes sense to free them.)
class zstd_processor : public compressor {
    int _compression_level = 3;
    size_t _cctx_size;
    using cdict_ptr = foreign_ptr>;
    using ddict_ptr = foreign_ptr>;
    cdict_ptr _cdict;
    ddict_ptr _ddict;

    // Runs `f` with the per-thread zstd decompression context.
    static auto with_dctx(std::invocable auto f) {
        static const size_t DCTX_SIZE = ZSTD_estimateDCtxSize();
        // The decompression context has a fixed size of ~128 KiB,
        // so we don't bother ever resizing it the way we do with
        // the compression context.
        static thread_local std::unique_ptr buf = std::invoke([&] {
            auto ptr = std::unique_ptr(new char[DCTX_SIZE]);
            auto dctx = ZSTD_initStaticDCtx(ptr.get(), DCTX_SIZE);
            if (!dctx) {
                // Barring a bug, this should never happen.
                throw std::runtime_error("Unable to initialize ZSTD decompression context");
            }
            return ptr;
        });
        return f(reinterpret_cast(buf.get()));
    }

    // Runs `f` with a per-thread zstd compression context of at least `cctx_size` bytes.
    static auto with_cctx(size_t cctx_size, std::invocable auto f) {
        // See the comments to reusable_buffer for a rationale of using it for compression.
        static thread_local utils::reusable_buffer buf(std::chrono::seconds(600));
        static thread_local size_t last_seen_reallocs = buf.reallocs();
        auto guard = utils::reusable_buffer_guard(buf);
        // Note that the compression context isn't initialized with a particular
        // compression config, but only with a particular size. As long as
        // it is big enough, we can reuse a context initialized by an
        // unrelated instance of zstd_processor without reinitializing it.
        //
        // If the existing context isn't big enough, the reusable buffer will
        // be resized by the next line, and the following `if` will notice that
        // and reinitialize the context.
        auto view = guard.get_temporary_buffer(cctx_size);
        if (last_seen_reallocs != buf.reallocs()) {
            // Either the buffer just grew because we requested a buffer bigger
            // than its last capacity, or it was shrunk some time ago by a timer.
            // Either way, the resize destroyed the contents of the buffer and
            // we have to initialize the context anew.
            auto cctx = ZSTD_initStaticCCtx(view.data(), buf.size());
            if (!cctx) {
                // Barring a bug, this should never happen.
                throw std::runtime_error("Unable to initialize ZSTD compression context");
            }
            last_seen_reallocs = buf.reallocs();
        }
        return f(reinterpret_cast(view.data()));
    }
public:
    zstd_processor(const compression_parameters&, cdict_ptr, ddict_ptr);
    // Legal if `(_ddict || !_cdict)`.
    size_t uncompress(const char* input, size_t input_len, char* output, size_t output_len) const override;
    // Legal if `(_cdict || !_ddict)`.
    size_t compress(const char* input, size_t input_len, char* output, size_t output_len) const override;
    size_t compress_max_size(size_t input_len) const override;
    algorithm get_algorithm() const override;
    std::map options() const override;
    std::optional get_dict_owner_for_test() const override;
};

// Takes ownership of the dicts, picks up the configured level (if any),
// and precomputes the compression context size needed for this configuration.
zstd_processor::zstd_processor(const compression_parameters& opts, cdict_ptr cdict, ddict_ptr ddict) {
    _cdict = std::move(cdict);
    _ddict = std::move(ddict);
    if (auto level = opts.zstd_compression_level()) {
        _compression_level = *level;
    }
    // The memory needed by the compression context depends both on the input
    // size and on the dictionary size.
    size_t dict_len = _cdict ? _cdict->raw().size() : 0;
    // We assume that the uncompressed input length is always <= chunk_len.
    auto chunk_len = opts.chunk_length();
    auto cparams = ZSTD_getCParams(_compression_level, chunk_len, dict_len);
    _cctx_size = ZSTD_estimateCCtxSize_usingCParams(cparams);
}

// Decompresses `input` into `output`; returns the number of bytes written.
// Throws std::runtime_error if zstd reports an error.
size_t zstd_processor::uncompress(const char* input, size_t input_len, char* output, size_t output_len) const {
    auto ret = with_dctx([&] (ZSTD_DCtx* dctx) {
        if (_ddict) {
            return ZSTD_decompress_usingDDict(dctx, output, output_len, input, input_len, _ddict->dict());
        } else {
            SCYLLA_ASSERT(!_cdict && "Write-only compressor used for reading");
            return ZSTD_decompressDCtx(dctx, output, output_len, input, input_len);
        }
    });
    if (ZSTD_isError(ret)) {
        throw std::runtime_error(
            format("ZSTD decompression failure: {}", ZSTD_getErrorName(ret)));
    }
    return ret;
}

// Compresses `input` into `output`; returns the number of bytes written.
// Throws std::runtime_error if zstd reports an error.
size_t zstd_processor::compress(const char* input, size_t input_len, char* output, size_t output_len) const {
    auto ret = with_cctx(_cctx_size, [&] (ZSTD_CCtx* cctx) {
        if (_cdict) {
            return ZSTD_compress_usingCDict(cctx, output, output_len, input, input_len, _cdict->dict());
        } else {
            SCYLLA_ASSERT(!_ddict && "Read-only compressor used for writing");
            return ZSTD_compressCCtx(cctx, output, output_len, input, input_len, _compression_level);
        }
    });
    if (ZSTD_isError(ret)) {
        throw std::runtime_error(
            format("ZSTD compression failure: {}", ZSTD_getErrorName(ret)));
    }
    return ret;
}

size_t zstd_processor::compress_max_size(size_t input_len) const {
    return ZSTD_compressBound(input_len);
}

// Reports the dict-aware algorithm variant iff any dict is attached.
auto zstd_processor::get_algorithm() const -> algorithm {
    return (_cdict || _ddict) ? algorithm::zstd_with_dicts : algorithm::zstd;
}

// Owner shard of the attached dict (compression dict preferred), if any.
std::optional zstd_processor::get_dict_owner_for_test() const {
    if (_cdict) {
        return _cdict.get_owner_shard();
    } else if (_ddict) {
        return _ddict.get_owner_shard();
    } else {
        return std::nullopt;
    }
}

// Prefix of the hidden option names which carry the dictionary parts.
const std::string_view DICTIONARY_OPTION = ".dictionary.";

// Splits the dictionary blob into consecutive parts and returns them as
// options named "<DICTIONARY_OPTION><zero-padded 8-digit index>".
static std::map dict_as_options(std::span d) {
    std::map result;
    const size_t max_part_size = std::numeric_limits::max() - 1;
    while (!d.empty()) {
        auto this_part_size = std::min(max_part_size, d.size());
        auto part_name = fmt::format("{}{:08}", DICTIONARY_OPTION, result.size());
        auto part = d.subspan(0, this_part_size);
        auto part_as_string = std::string_view(reinterpret_cast(part.data()), part.size());
        result.emplace(part_name, part_as_string);
        d = d.subspan(this_part_size);
    }
    return result;
}

// Reassembles the dictionary blob from the options written by dict_as_options().
// Throws malformed_sstable_exception if a part index can't be parsed
// or if the part indices aren't the contiguous sequence 0, 1, 2, ...
static std::optional> dict_from_options(const sstables::compression& c) {
    std::map parts;
    for (const auto& [k, v] : c.options.elements) {
        auto k_str = sstring(k.value.begin(), k.value.end());
        if (k_str.starts_with(DICTIONARY_OPTION)) {
            try {
                auto i = std::stoi(k_str.substr(DICTIONARY_OPTION.size()));
                parts.emplace(i, v.value);
            } catch (const std::exception& e) {
                throw sstables::malformed_sstable_exception(fmt::format("Corrupted dictionary option: {}", k_str));
            }
        }
    }
    std::vector result;
    int i = 0;
    // `parts` is ordered by index, so any gap shows up as a mismatch with `i`.
    for (const auto& [k, v] : parts) {
        if (k != i) {
            throw sstables::malformed_sstable_exception(fmt::format("Missing dictionary part: expected {}, got {}", i, k));
        }
        ++i;
        auto s = std::as_bytes(std::span(v));
        result.insert(result.end(), s.begin(), s.end());
    }
    return result;
}

// Compression level plus (if a dict is attached) the dictionary parts.
std::map zstd_processor::options() const {
    std::map result = {{COMPRESSION_LEVEL, std::to_string(_compression_level)}};
    std::optional> dict_blob;
    if (_cdict) {
        dict_blob = _cdict->raw();
    } else if (_ddict) {
        dict_blob = _ddict->raw();
    }
    if (dict_blob) {
        result.merge(dict_as_options(*dict_blob));
    }
    return result;
}

// Default: a compressor has no options.
std::map compressor::options() const {
    return {};
}

// Default: a compressor has no dict, hence no dict owner.
std::optional compressor::get_dict_owner_for_test() const {
    return std::nullopt;
}

std::string compressor::name() const {
    return compression_parameters::algorithm_to_qualified_name(get_algorithm());
}

// Option names starting with '.' are considered hidden.
bool compressor::is_hidden_option_name(std::string_view sv) {
    return sv.starts_with('.');
}

const sstring compression_parameters::SSTABLE_COMPRESSION = "sstable_compression";
const sstring compression_parameters::CHUNK_LENGTH_KB = "chunk_length_in_kb";
const sstring compression_parameters::CHUNK_LENGTH_KB_ERR = "chunk_length_kb";
const sstring compression_parameters::CRC_CHECK_CHANCE = "crc_check_chance";

// The default-constructed parameters select lz4.
compression_parameters::compression_parameters()
    : compression_parameters(algorithm::lz4)
{}

compression_parameters::~compression_parameters()
{}

// Delegates to the options-map constructor; algorithm::none maps to an empty map.
compression_parameters::compression_parameters(algorithm alg)
    : compression_parameters::compression_parameters(
        alg == algorithm::none
            ? std::map{}
            : std::map{{sstring(SSTABLE_COMPRESSION), sstring(algorithm_to_name(alg))}}
    )
{}

// Maps a (possibly qualified) compressor name to its algorithm.
// An empty name means algorithm::none; an unknown name throws std::runtime_error.
auto compression_parameters::name_to_algorithm(std::string_view name) -> algorithm {
    if (name.empty()) {
        return algorithm::none;
    }
    auto unqualified = sstring(unqualified_name(name_prefix, name));
    // Try every enumerator below algorithm::none.
    for (int i = 0; i < static_cast(algorithm::none); ++i) {
        auto alg = static_cast(i);
        if (std::string_view(unqualified) == algorithm_to_name(alg)) {
            return alg;
        }
    }
    throw std::runtime_error(std::format("Unknown sstable_compression: {}", name));
}

// Short (unqualified) name of the algorithm. Must not be called with algorithm::none.
std::string_view compression_parameters::algorithm_to_name(algorithm alg) {
    switch (alg) {
        case algorithm::lz4: return "LZ4Compressor";
        case algorithm::lz4_with_dicts: return "LZ4WithDictsCompressor";
        case algorithm::deflate: return "DeflateCompressor";
        case algorithm::snappy: return "SnappyCompressor";
        case algorithm::zstd: return "ZstdCompressor";
        case algorithm::zstd_with_dicts: return "ZstdWithDictsCompressor";
        case algorithm::none: on_internal_error(compressor_factory_logger, "algorithm_to_name(): called with algorithm::none");
    }
    abort();
}

std::string compression_parameters::algorithm_to_qualified_name(algorithm alg) {
    auto short_name = compression_parameters::algorithm_to_name(alg);
    // For the cassandra-compatible compressors, we return the long name
    // ("org.apache.cassandra.io.compress.LZ4Compressor") for compatibility.
    //
    // For incompatible compressors, we only return the short name.
    // It wouldn't make sense to pretend they are a Java class.
    switch (alg) {
        case algorithm::lz4_with_dicts:
        case algorithm::zstd_with_dicts:
            return std::string(short_name);
        default: {
            auto result = std::string(name_prefix);
            result.append(short_name);
            return result;
        }
    }
}

// Parses the user-supplied options map.
// Throws syntax_exception / configuration_exception on unparsable values,
// and configuration_exception on any option that wasn't recognized.
compression_parameters::compression_parameters(const std::map& options) {
    std::set used_options;
    // Looks up an option and remembers that it is a known one,
    // so that unknown options can be rejected at the end.
    auto get_option = [&options, &used_options] (const sstring& x) -> const sstring* {
        used_options.insert(x);
        if (auto it = options.find(x); it != options.end()) {
            return &it->second;
        }
        return nullptr;
    };

    if (auto v = get_option(SSTABLE_COMPRESSION)) {
        _algorithm = name_to_algorithm(*v);
    } else {
        _algorithm = algorithm::none;
    }

    // Both spellings are accepted; CHUNK_LENGTH_KB wins if both are present.
    const sstring* chunk_length = nullptr;
    if (auto v = get_option(CHUNK_LENGTH_KB_ERR)) {
        chunk_length = v;
    }
    if (auto v = get_option(CHUNK_LENGTH_KB)) {
        chunk_length = v;
    }
    if (chunk_length) {
        try {
            _chunk_length = std::stoi(*chunk_length) * 1024;
        } catch (const std::exception& e) {
            throw exceptions::syntax_exception(sstring("Invalid integer value ") + *chunk_length + " for " + CHUNK_LENGTH_KB);
        }
    }

    if (auto v = get_option(CRC_CHECK_CHANCE)) {
        try {
            _crc_check_chance = std::stod(*v);
        } catch (const std::exception& e) {
            throw exceptions::syntax_exception(sstring("Invalid double value ") + *v + " for " + CRC_CHECK_CHANCE);
        }
    }

    // The compression level is only a known option for the zstd variants.
    switch (_algorithm) {
    case algorithm::zstd_with_dicts:
    case algorithm::zstd:
        if (auto v = get_option(COMPRESSION_LEVEL)) {
            try {
                _zstd_compression_level = std::stoi(*v);
            } catch (const std::exception&) {
                throw exceptions::configuration_exception(format("Invalid integer value {} for {}", *v, COMPRESSION_LEVEL));
            }
        }
        break;
    default:
        break;
    }

    for (const auto& o : options) {
        if (!used_options.contains(o.first)) {
            throw exceptions::configuration_exception(format("Unknown compression option '{}'.", o.first));
        }
    }
}

// Checks the parsed parameters for semantic validity.
// Throws std::runtime_error if a dict-based algorithm is selected before the
// cluster feature is enabled, and configuration_exception for out-of-range values.
void compression_parameters::validate(const gms::feature_service& fs) {
    if (!fs.sstable_compression_dicts) {
        if (_algorithm == algorithm::zstd_with_dicts || _algorithm == algorithm::lz4_with_dicts) {
            throw std::runtime_error(std::format("sstable_compression {} can't be used before "
                                                 "all nodes are upgraded to a versions which supports it",
                                                 algorithm_to_name(_algorithm)));
        }
    }
    if (_chunk_length) {
        auto chunk_length = _chunk_length.value();
        if (chunk_length <= 0) {
            throw exceptions::configuration_exception(
                fmt::format("Invalid negative or null for {}/{}", CHUNK_LENGTH_KB, CHUNK_LENGTH_KB_ERR));
        }
        // _chunk_length must be a power of two
        if (chunk_length & (chunk_length - 1)) {
            throw exceptions::configuration_exception(
                fmt::format("{}/{} must be a power of 2.", CHUNK_LENGTH_KB, CHUNK_LENGTH_KB_ERR));
        }
        // Excessive _chunk_length is pointless and can lead to allocation
        // failures (see issue #9933)
        if (chunk_length > 128 * 1024) {
            throw exceptions::configuration_exception(
                fmt::format("{}/{} must be 128 or less.", CHUNK_LENGTH_KB, CHUNK_LENGTH_KB_ERR));
        }
    }
    if (_crc_check_chance && (_crc_check_chance.value() < 0.0 || _crc_check_chance.value() > 1.0)) {
        throw exceptions::configuration_exception(sstring(CRC_CHECK_CHANCE) + " must be between 0.0 and 1.0.");
    }
    if (_zstd_compression_level) {
        if (*_zstd_compression_level != std::clamp(*_zstd_compression_level, ZSTD_minCLevel(), ZSTD_maxCLevel())) {
            // Argument order must follow the placeholders: name, min, max, actual value.
            throw exceptions::configuration_exception(fmt::format("{} must be between {} and {}, got {}",
                COMPRESSION_LEVEL, ZSTD_minCLevel(), ZSTD_maxCLevel(), *_zstd_compression_level));
        }
    }
}

// Serializes the explicitly set parameters back into an options map.
std::map compression_parameters::get_options() const {
    auto opts = std::map();
    if (_algorithm != algorithm::none) {
        opts.emplace(compression_parameters::SSTABLE_COMPRESSION, algorithm_to_qualified_name(_algorithm));
    }
    if (_zstd_compression_level) {
        opts.emplace(COMPRESSION_LEVEL, std::to_string(_zstd_compression_level.value()));
    }
    if (_chunk_length) {
        opts.emplace(sstring(CHUNK_LENGTH_KB), std::to_string(_chunk_length.value() / 1024));
    }
    if (_crc_check_chance) {
        opts.emplace(sstring(CRC_CHECK_CHANCE), std::to_string(_crc_check_chance.value()));
    }
    return opts;
}

lz4_processor::lz4_processor(cdict_ptr cdict, ddict_ptr ddict)
    : _cdict(std::move(cdict))
    , _ddict(std::move(ddict))
{}

// Decompresses an LZ4 chunk (4-byte length prefix + LZ4 block) into `output`;
// returns the number of bytes written. Throws std::runtime_error on failure.
size_t lz4_processor::uncompress(const char* input, size_t input_len,
                char* output, size_t output_len) const {
    // We use LZ4_decompress_safe(). According to the documentation, the
    // function LZ4_decompress_fast() is slightly faster, but maliciously
    // crafted compressed data can cause it to overflow the output buffer.
    // Theoretically, our compressed data is created by us so is not malicious
    // (and accidental corruption is avoided by the compressed-data checksum),
    // but let's not take that chance for now, until we've actually measured
    // the performance benefit that LZ4_decompress_fast() would bring.

    // Cassandra's LZ4Compressor prepends to the chunk its uncompressed length
    // in 4 bytes little-endian (!) order. We don't need this information -
    // we already know the uncompressed data is at most the given chunk size
    // (and usually is exactly that, except in the last chunk). The advance
    // knowledge of the uncompressed size could be useful if we used
    // LZ4_decompress_fast(), but we prefer LZ4_decompress_safe() anyway...

    // A chunk shorter than the length prefix is corrupt. Reject it here,
    // otherwise the unsigned subtraction below would wrap around.
    if (input_len < 4) {
        throw std::runtime_error("LZ4 uncompression failure: input is too short");
    }
    input += 4;
    input_len -= 4;

    int ret;
    if (_ddict) {
        ret = LZ4_decompress_safe_usingDict(input, output, input_len, output_len, reinterpret_cast(_ddict->raw().data()), _ddict->raw().size());
    } else {
        SCYLLA_ASSERT(!_cdict && "Write-only compressor used for reading");
        ret = LZ4_decompress_safe(input, output, input_len, output_len);
    }
    if (ret < 0) {
        throw std::runtime_error("LZ4 uncompression failure");
    }
    return ret;
}

// Compresses `input` into `output` (4-byte little-endian length prefix + LZ4 block);
// returns the total number of bytes written. Throws std::runtime_error on failure.
size_t lz4_processor::compress(const char* input, size_t input_len,
                char* output, size_t output_len) const {
    if (output_len < LZ4_COMPRESSBOUND(input_len) + 4) {
        throw std::runtime_error("LZ4 compression failure: length of output is too small");
    }
    // Write input_len (32-bit data) to beginning of output in little-endian representation.
    output[0] = input_len & 0xFF;
    output[1] = (input_len >> 8) & 0xFF;
    output[2] = (input_len >> 16) & 0xFF;
    output[3] = (input_len >> 24) & 0xFF;
    int ret;
    if (_cdict) {
        auto* ctx = get_cctx();
        LZ4_attach_dictionary(ctx, _cdict->dict());
        ret = LZ4_compress_fast_continue(ctx, input, output + 4, input_len, LZ4_compressBound(input_len), 1);
        // The stream is shared per thread: fully reinitialize it after a failure,
        // otherwise only do the cheap reset.
        if (ret == 0) {
            LZ4_initStream(ctx, sizeof(*ctx));
        } else {
            LZ4_resetStream_fast(ctx);
        }
    } else {
        SCYLLA_ASSERT(!_ddict && "Read-only compressor used for writing");
        ret = LZ4_compress_default(input, output + 4, input_len, LZ4_compressBound(input_len));
    }
    if (ret == 0) {
        throw std::runtime_error("LZ4 compression failure: LZ4_compress() failed");
    }
    return ret + 4;
}

// Worst-case compressed size, including the 4-byte length prefix.
size_t lz4_processor::compress_max_size(size_t input_len) const {
    return LZ4_COMPRESSBOUND(input_len) + 4;
}

// Reports the dict-aware algorithm variant iff any dict is attached.
auto lz4_processor::get_algorithm() const -> algorithm {
    return (_cdict || _ddict) ? algorithm::lz4_with_dicts : algorithm::lz4;
}

// The dictionary parts if a dict is attached, otherwise no options.
std::map lz4_processor::options() const {
    std::optional> dict_blob;
    if (_cdict) {
        dict_blob = _cdict->raw();
    } else if (_ddict) {
        dict_blob = _ddict->raw();
    }
    if (dict_blob) {
        return dict_as_options(*dict_blob);
    } else {
        return {};
    }
}

// Owner shard of the attached dict (compression dict preferred), if any.
std::optional lz4_processor::get_dict_owner_for_test() const {
    if (_cdict) {
        return _cdict.get_owner_shard();
    } else if (_ddict) {
        return _ddict.get_owner_shard();
    } else {
        return std::nullopt;
    }
}

compressor_ptr make_lz4_sstable_compressor_for_tests() {
    return std::make_unique();
}

// Inflates `input` into `output` in one shot; returns the number of bytes written.
// Throws std::runtime_error unless the whole stream was consumed successfully.
size_t deflate_processor::uncompress(const char* input,
                size_t input_len, char* output, size_t output_len) const {
    z_stream zs;
    zs.zalloc = Z_NULL;
    zs.zfree = Z_NULL;
    zs.opaque = Z_NULL;
    zs.avail_in = 0;
    zs.next_in = Z_NULL;
    if (inflateInit(&zs) != Z_OK) {
        throw std::runtime_error("deflate uncompression init failure");
    }
    // yuck, zlib is not const-correct, and also uses unsigned char while we use char :-(
    zs.next_in = reinterpret_cast(const_cast(input));
    zs.avail_in = input_len;
    zs.next_out = reinterpret_cast(output);
    zs.avail_out = output_len;
    auto res = inflate(&zs, Z_FINISH);
    inflateEnd(&zs);
    if (res == Z_STREAM_END) {
        return output_len - zs.avail_out;
    } else {
        throw std::runtime_error("deflate uncompression failure");
    }
}

// Deflates `input` into `output` in one shot; returns the number of bytes written.
// Throws std::runtime_error unless the whole input was compressed successfully.
size_t deflate_processor::compress(const char* input,
                size_t input_len, char* output, size_t output_len) const {
    z_stream zs;
    zs.zalloc = Z_NULL;
    zs.zfree = Z_NULL;
    zs.opaque = Z_NULL;
    zs.avail_in = 0;
    zs.next_in = Z_NULL;
    if (deflateInit(&zs, Z_DEFAULT_COMPRESSION) != Z_OK) {
        throw std::runtime_error("deflate compression init failure");
    }
    zs.next_in = reinterpret_cast(const_cast(input));
    zs.avail_in = input_len;
    zs.next_out = reinterpret_cast(output);
    zs.avail_out = output_len;
    auto res = ::deflate(&zs, Z_FINISH);
    deflateEnd(&zs);
    if (res == Z_STREAM_END) {
        return output_len - zs.avail_out;
    } else {
        throw std::runtime_error("deflate compression failure");
    }
}

// Worst-case compressed size as reported by deflateBound().
size_t deflate_processor::compress_max_size(size_t input_len) const {
    z_stream zs;
    zs.zalloc = Z_NULL;
    zs.zfree = Z_NULL;
    zs.opaque = Z_NULL;
    zs.avail_in = 0;
    zs.next_in = Z_NULL;
    if (deflateInit(&zs, Z_DEFAULT_COMPRESSION) != Z_OK) {
        throw std::runtime_error("deflate compression init failure");
    }
    auto res = deflateBound(&zs, input_len);
    deflateEnd(&zs);
    return res;
}

// Decompresses `input` into `output`; returns the number of bytes written.
size_t snappy_processor::uncompress(const char* input, size_t input_len,
                char* output, size_t output_len) const {
    if (snappy_uncompress(input, input_len, output, &output_len)
            == SNAPPY_OK) {
        return output_len;
    } else {
        throw std::runtime_error("snappy uncompression failure");
    }
}

// Compresses `input` into `output`; returns the number of bytes written.
size_t snappy_processor::compress(const char* input, size_t input_len,
                char* output, size_t output_len) const {
    auto ret = snappy_compress(input, input_len, output, &output_len);
    if (ret != SNAPPY_OK) {
        throw std::runtime_error("snappy compression failure: snappy_compress() failed");
    }
    return output_len;
}

size_t snappy_processor::compress_max_size(size_t input_len) const {
    return snappy_max_compressed_length(input_len);
}

// Holds weak pointers to all live dictionaries
// (so that they can be cheaply shared with new SSTables if an identical dict is requested),
// and shared (lifetime-extending) pointers to the current writer ("recommended")
// dict for each table (so that they can be shared with new SSTables without consulting
// `system.dicts`).
//
// Whenever a dictionary dies (because its refcount reaches 0), its weak pointer
// is removed from the factory.
//
// Tracks the total memory usage of existing dicts.
//
// Has a configurable memory budget for live dicts. If the budget is exceeded,
// will return null dicts to new writers (to avoid making the memory usage even worse)
// and print warnings.
class dictionary_holder : public weakly_referencable { mutable logger::rate_limit budget_warning_rate_limit{std::chrono::minutes(10)}; using config = default_sstable_compressor_factory::config; const config& _cfg; uint64_t _total_live_dict_memory = 0; metrics::metric_groups _metrics; struct zstd_cdict_id { dict_id id; int level; std::strong_ordering operator<=>(const zstd_cdict_id&) const = default; }; std::map _raw_dicts; std::map _zstd_cdicts; std::map _zstd_ddicts; std::map _lz4_cdicts; std::map>> _recommended; size_t memory_budget() const { return _cfg.memory_fraction_starting_at_which_we_stop_writing_dicts() * seastar::memory::stats().total_memory(); } bool memory_budget_exceeded() const { return _total_live_dict_memory >= memory_budget(); } void warn_budget_exceeded() const { compressor_factory_logger.log( log_level::warn, budget_warning_rate_limit, "Memory usage by live compression dicts ({} bytes) exceeds configured memory budget ({} bytes). Some new SSTables will fall back to compression without dictionaries.", _total_live_dict_memory, memory_budget() ); } public: lw_shared_ptr get_canonical_ptr(std::span dict) { if (dict.empty()) { return nullptr; } auto id = get_sha256(dict); if (auto it = _raw_dicts.find(id); it != _raw_dicts.end()) { return it->second->shared_from_this(); } else { auto p = make_lw_shared(*this, id, dict); _raw_dicts.emplace(id, p.get()); return p; } } using foreign_zstd_ddict = foreign_ptr>; foreign_zstd_ddict get_zstd_dict_for_reading(lw_shared_ptr raw, int level) { if (!raw) { return nullptr; } lw_shared_ptr ddict; // Fo reading, we must allocate a new dict, even if memory budget is exceeded. We have no other choice. // In any case, if the budget is exceeded after we print a rate-limited warning about it. 
if (auto it = _zstd_ddicts.find(raw->id()); it != _zstd_ddicts.end()) { ddict = it->second->shared_from_this(); } else { ddict = make_lw_shared(*this, raw); _zstd_ddicts.emplace(raw->id(), ddict.get()); } if (memory_budget_exceeded()) { warn_budget_exceeded(); compressor_factory_logger.debug("make_compressor_for_writing: falling back to no dict"); } return make_foreign(std::move(ddict)); } using foreign_zstd_cdict = foreign_ptr>; foreign_zstd_cdict get_zstd_dict_for_writing(lw_shared_ptr raw, int level) { if (!_cfg.enable_writing_dictionaries() || !raw) { return nullptr; } lw_shared_ptr cdict; // If we can share an already-allocated dict, we do that regardless of memory budget. // If we would have to allocate a new dict for writing, we only do that if we haven't exceeded // the budget yet. Otherwise we return null. if (auto it = _zstd_cdicts.find({raw->id(), level}); it != _zstd_cdicts.end()) { cdict = it->second->shared_from_this(); } else if (memory_budget_exceeded()) { warn_budget_exceeded(); compressor_factory_logger.debug("make_compressor_for_writing: falling back to no dict"); } else { cdict = make_lw_shared(*this, raw, level); _zstd_cdicts.emplace(zstd_cdict_id{raw->id(), level}, cdict.get()); } return make_foreign(std::move(cdict)); } using lz4_dicts = std::pair< foreign_ptr>, foreign_ptr> >; using foreign_lz4_ddict = foreign_ptr>; using foreign_lz4_cdict = foreign_ptr>; foreign_lz4_ddict get_lz4_dict_for_reading(lw_shared_ptr raw) { return make_foreign(std::move(raw)); } foreign_lz4_cdict get_lz4_dict_for_writing(lw_shared_ptr raw) { if (!_cfg.enable_writing_dictionaries() || !raw) { return nullptr; } lw_shared_ptr cdict; // If we can share an already-allocated dict, we do that regardless of memory budget. // If we would have to allocate a new dict for writing, we only do that if we haven't exceeded // the budget yet. Otherwise we return null. 
if (auto it = _lz4_cdicts.find(raw->id()); it != _lz4_cdicts.end()) { cdict = it->second->shared_from_this(); } else if (memory_budget_exceeded()) { warn_budget_exceeded(); } else { cdict = make_lw_shared(*this, raw); _lz4_cdicts.emplace(raw->id(), cdict.get()); } return make_foreign(std::move(cdict)); } public: dictionary_holder(const config& cfg) : _cfg(cfg) { if (_cfg.register_metrics) { namespace sm = seastar::metrics; _metrics.add_group("sstable_compression_dicts", { sm::make_counter("total_live_memory_bytes", _total_live_dict_memory, sm::description("Total amount of memory consumed by SSTable compression dictionaries in RAM")), }); } } dictionary_holder(dictionary_holder&&) = delete; ~dictionary_holder() { // Note: `_recommended` might be the only thing keeping some dicts alive, // so clearing it will destroy them. // // In the destructor, they will call back into us to erase themselves from the `std::map` // which map `table_id`s to dicts. // // Erasing from already-destroyed maps would be illegal, so the `_recommended` // must be cleared before the maps are destroyed. // // We could just rely on the member field destruction order for that, // but let's be explicit and clear it manually before any fields are destroyed. // (Calling back into the partially-destroyed factory would be iffy anyway, even if it was legal). 
_recommended.clear(); } void forget_raw_dict(dict_id id) { _raw_dicts.erase(id); } void forget_zstd_cdict(dict_id id, int level) { _zstd_cdicts.erase({id, level}); } void forget_zstd_ddict(dict_id id) { _zstd_ddicts.erase(id); } void forget_lz4_cdict(dict_id id) { _lz4_cdicts.erase(id); } void set_recommended_dict(table_id t, foreign_ptr> dict) { _recommended.erase(t); if (dict) { compressor_factory_logger.debug("set_recommended_dict: table={} size={} id={}", t, dict->raw().size(), fmt_hex(dict->id())); _recommended.emplace(t, std::move(dict)); } else { compressor_factory_logger.debug("set_recommended_dict: table={} size=0", t); } } future>> get_recommended_dict(table_id t) { auto rec_it = _recommended.find(t); if (rec_it == _recommended.end()) { co_return nullptr; } co_return co_await rec_it->second.copy(); } void account_memory_delta(ssize_t n) { if (static_cast(_total_live_dict_memory) + n < 0) { compressor_factory_logger.error( "Error in dictionary memory accounting: delta {} brings live memory {} below 0", n, _total_live_dict_memory); } _total_live_dict_memory += n; } }; default_sstable_compressor_factory::default_sstable_compressor_factory(config cfg) : _cfg(std::move(cfg)) , _holder(std::make_unique(_cfg)) { for (shard_id i = 0; i < smp::count; ++i) { auto numa_id = _cfg.numa_config[i]; _numa_groups.resize(std::max(_numa_groups.size(), numa_id + 1)); _numa_groups[numa_id].push_back(i); } } default_sstable_compressor_factory::~default_sstable_compressor_factory() { } std::vector default_sstable_compressor_factory_config::get_default_shard_to_numa_node_mapping() { auto sp = local_engine->smp().shard_to_numa_node_mapping(); return std::vector(sp.begin(), sp.end()); } unsigned default_sstable_compressor_factory::local_numa_id() { return _cfg.numa_config[this_shard_id()]; } shard_id default_sstable_compressor_factory::get_dict_owner(unsigned numa_id, const sha256_type& sha) { auto hash = read_unaligned(sha.data()); const auto& group = _numa_groups[numa_id]; if 
(group.empty()) { // condition of the `if` that ends the preceding line (get_dict_owner())
        on_internal_error(compressor_factory_logger, "get_dict_owner called on an empty NUMA group");
    }
    // Map the hash onto one member of the group.
    return group[hash % group.size()];
}

// Publishes `dict` as the recommended dict of table `t` in every non-empty NUMA group.
//
// Must run on `_leader_shard`; any other shard is reported via on_internal_error().
// For each group, the canonical dict is obtained on the shard picked by
// get_dict_owner(), and the resulting pointer is then stored by the group's
// first shard (`group[0]`).
//
// `dict` is a span captured by value and read on other shards after suspension
// points, so the bytes it refers to have to outlive the returned future.
future<> default_sstable_compressor_factory::set_recommended_dict_local(table_id t, std::span dict) {
    if (_leader_shard != this_shard_id()) {
        on_internal_error(compressor_factory_logger,
            fmt::format("set_recommended_dict_local called on wrong shard. Expected: {}, got {}", _leader_shard, this_shard_id()));
    }
    // `units` is held until the coroutine finishes. Presumably this serialises
    // concurrent updates -- the semaphore's initial count is declared elsewhere.
    auto units = co_await get_units(_recommendation_setting_sem, 1);
    auto sha = get_sha256(dict);
    for (unsigned numa_id = 0; numa_id < _numa_groups.size(); ++numa_id) {
        const auto& group = _numa_groups[numa_id];
        if (group.empty()) {
            // get_dict_owner() treats an empty group as an internal error; skip it.
            continue;
        }
        auto r = get_dict_owner(numa_id, sha);
        auto d = co_await container().invoke_on(r, [dict](self& local) {
            return make_foreign(local._holder->get_canonical_ptr(dict));
        });
        // get_recommended_dict() looks recommendations up on the first shard of the
        // caller's group, so that is where the entry is stored.
        auto local_coordinator = group[0];
        co_await container().invoke_on(local_coordinator, coroutine::lambda([t, d = std::move(d)](self& local) mutable {
            local._holder->set_recommended_dict(t, std::move(d));
        }));
    }
}

// Forwards the request to set_recommended_dict_local() on `_leader_shard`.
// `dict` is passed on as a span, so the caller keeps ownership of the bytes.
future<> default_sstable_compressor_factory::set_recommended_dict(table_id t, std::span dict) {
    return container().invoke_on(_leader_shard, &self::set_recommended_dict_local, t, dict);
}

// Asks the first shard of the local NUMA group for the dict recommended for `t`.
future>> default_sstable_compressor_factory::get_recommended_dict(table_id t) {
    const auto local_coordinator = _numa_groups[local_numa_id()][0];
    return container().invoke_on(local_coordinator, [t](self& local) {
        return local._holder->get_recommended_dict(t);
    });
}

// Builds the compressor used for writing SSTables of table `id` with `params`.
// The algorithm selected by `params` decides which processor is created.
// The `*_with_dicts` algorithms additionally look up the table's recommended dict.
future default_sstable_compressor_factory::make_compressor_for_writing_impl(const compression_parameters& params, table_id id) {
    using algorithm = compression_parameters::algorithm;
    const auto algo = params.get_algorithm();
    compressor_factory_logger.debug("make_compressor_for_writing: table={} algo={}", id, algo);
    switch (algo) {
    case algorithm::lz4:
        co_return std::make_unique(nullptr, nullptr);
    case algorithm::lz4_with_dicts: {
        holder::foreign_lz4_cdict cdict;
if (auto recommended = co_await get_recommended_dict(id)) { cdict = co_await container().invoke_on(recommended.get_owner_shard(), [recommended = std::move(recommended)] (self& local) mutable { return local._holder->get_lz4_dict_for_writing(recommended.release()); }); } if (cdict) { compressor_factory_logger.debug("make_compressor_for_writing: using dict id={}", fmt_hex(cdict->id())); } co_return std::make_unique(std::move(cdict), nullptr); } case algorithm::deflate: co_return std::make_unique(); case algorithm::snappy: co_return std::make_unique(); case algorithm::zstd: co_return std::make_unique(params, nullptr, nullptr); case algorithm::zstd_with_dicts: { holder::foreign_zstd_cdict cdict; if (auto recommended = co_await get_recommended_dict(id)) { auto level = params.zstd_compression_level().value_or(ZSTD_defaultCLevel()); cdict = co_await container().invoke_on(recommended.get_owner_shard(), [level, recommended = std::move(recommended)] (self& local) mutable { return local._holder->get_zstd_dict_for_writing(recommended.release(), level); }); } if (cdict) { compressor_factory_logger.debug("make_compressor_for_writing: using dict id={}", fmt_hex(cdict->id())); } co_return std::make_unique(params, std::move(cdict), nullptr); } case algorithm::none: co_return nullptr; } abort(); } future default_sstable_compressor_factory::make_compressor_for_writing(schema_ptr s) { return make_compressor_for_writing_impl(s->get_compressor_params(), s->id()); } future default_sstable_compressor_factory::make_compressor_for_writing_for_tests(const compression_parameters& params, table_id id) { return make_compressor_for_writing_impl(params, id); } future default_sstable_compressor_factory::make_compressor_for_reading_impl(const compression_parameters& params, std::span dict) { using algorithm = compression_parameters::algorithm; const auto algo = params.get_algorithm(); switch (algo) { case algorithm::lz4: co_return std::make_unique(nullptr, nullptr); case algorithm::lz4_with_dicts: { 
// (continuation of the `lz4_with_dicts` case of make_compressor_for_reading_impl();
// the function and its `switch` open before these lines)
//
// NOTE(review): template argument lists look stripped in this text
// (e.g. `std::make_unique(...)`); the code is left exactly as found.
        // The dict's SHA-256 selects the shard (within the local NUMA group) that
        // owns it; that shard resolves the canonical dict and the lz4 reading dict.
        auto dict_span = dict;
        auto sha = get_sha256(dict_span);
        auto dict_owner = get_dict_owner(local_numa_id(), sha);
        auto ddict = co_await container().invoke_on(dict_owner, [dict_span] (self& local) mutable {
            auto d = local._holder->get_canonical_ptr(dict_span);
            return local._holder->get_lz4_dict_for_reading(std::move(d));
        });
        if (ddict) {
            compressor_factory_logger.debug("make_compressor_for_reading: using dict id={}", fmt_hex(ddict->id()));
        }
        co_return std::make_unique(nullptr, std::move(ddict));
    }
    case algorithm::deflate:
        co_return std::make_unique();
    case algorithm::snappy:
        co_return std::make_unique();
    case algorithm::zstd: {
        co_return std::make_unique(params, nullptr, nullptr);
    }
    case algorithm::zstd_with_dicts: {
        // Same lookup as the lz4 case; the compression level (configured, or
        // zstd's default) is passed along to the holder as well.
        auto level = params.zstd_compression_level().value_or(ZSTD_defaultCLevel());
        auto dict_span = dict;
        auto sha = get_sha256(dict_span);
        auto dict_owner = get_dict_owner(local_numa_id(), sha);
        auto ddict = co_await container().invoke_on(dict_owner, [level, dict_span] (self& local) mutable {
            auto d = local._holder->get_canonical_ptr(dict_span);
            return local._holder->get_zstd_dict_for_reading(std::move(d), level);
        });
        if (ddict) {
            compressor_factory_logger.debug("make_compressor_for_reading: using dict id={}", fmt_hex(ddict->id()));
        }
        co_return std::make_unique(params, nullptr, std::move(ddict));
    }
    case algorithm::none:
        co_return nullptr;
    }
    // Every case above returns; reaching this point means `algo` matched no case.
    abort();
}

// Builds the reading compressor for an SSTable from its compression metadata `c`:
// the params come from options_from_compression(c), the dict bytes from
// dict_from_options(c).
//
// NOTE(review): `*dict` is dereferenced unconditionally -- confirm that
// dict_from_options() always yields a dereferenceable value, including for
// algorithms that do not use a dict.
future default_sstable_compressor_factory::make_compressor_for_reading(sstables::compression& c) {
    const auto params = compression_parameters(sstables::options_from_compression(c));
    auto dict = dict_from_options(c);
    const auto algo = params.get_algorithm();
    compressor_factory_logger.debug("make_compressor_for_reading: compression={} algo={}", fmt::ptr(&c), algo);
    co_return co_await make_compressor_for_reading_impl(params, std::as_bytes(std::span(*dict)));
}

// Test entry point: takes the compression params and dict bytes directly.
future default_sstable_compressor_factory::make_compressor_for_reading_for_tests(const compression_parameters& params, std::span dict) {
    return
make_compressor_for_reading_impl(params, dict); } raw_dict::raw_dict(dictionary_holder& owner, dict_id key, std::span dict) : _owner(owner.weak_from_this()) , _id(key) , _dict(dict.begin(), dict.end()) { _owner->account_memory_delta(malloc_usable_size(const_cast(_dict.data()))); } raw_dict::~raw_dict() { if (_owner) { _owner->forget_raw_dict(_id); _owner->account_memory_delta(-malloc_usable_size(const_cast(_dict.data()))); } } zstd_cdict::zstd_cdict(dictionary_holder& owner, lw_shared_ptr raw, int level) : _owner(owner.weak_from_this()) , _raw(raw) , _level(level) , _alloc([this] (ssize_t n) { _used_memory += n; if (_owner) { _owner->account_memory_delta(n); }}) , _dict( ZSTD_createCDict_advanced( _raw->raw().data(), _raw->raw().size(), ZSTD_dlm_byRef, ZSTD_dct_auto, ZSTD_getCParams(level, 4096, _raw->raw().size()), _alloc.as_zstd_custommem()), ZSTD_freeCDict) { if (!_dict) { throw std::bad_alloc(); } } zstd_cdict::~zstd_cdict() { if (_owner) { _owner->forget_zstd_cdict(_raw->id(), _level); // Note: memory delta will be accounted by the destruction of the allocated dict. } } zstd_ddict::zstd_ddict(dictionary_holder& owner, lw_shared_ptr raw) : _owner(owner.weak_from_this()) , _raw(raw) , _alloc([this] (ssize_t n) { _used_memory += n; if (_owner) { _owner->account_memory_delta(n); }}) , _dict( ZSTD_createDDict_advanced( _raw->raw().data(), _raw->raw().size(), ZSTD_dlm_byRef, ZSTD_dct_auto, _alloc.as_zstd_custommem()), ZSTD_freeDDict) { if (!_dict) { throw std::bad_alloc(); } } zstd_ddict::~zstd_ddict() { if (_owner) { _owner->forget_zstd_ddict(_raw->id()); // Note: memory delta will be accounted by the destruction of the allocated dict. 
} } // close `if (_owner)` and zstd_ddict::~zstd_ddict(), both opened on the preceding line

// NOTE(review): template argument lists look stripped in this text
// (e.g. `lw_shared_ptr raw`, `reinterpret_cast(...)`, `sharded _impl`,
// `std::make_unique()`); the code is left exactly as found.

// Builds an lz4 compression dict on top of `raw`: creates an LZ4 stream,
// loads the raw dict bytes into it, and reports the stream's allocation size
// to the owner.
//
// Throws std::bad_alloc if the stream cannot be created.
lz4_cdict::lz4_cdict(dictionary_holder& owner, lw_shared_ptr raw)
    : _owner(owner.weak_from_this())
    , _raw(raw)
    , _dict(LZ4_createStream(), LZ4_freeStream)
{
    if (!_dict) {
        throw std::bad_alloc();
    }
    LZ4_loadDictSlow(_dict.get(), reinterpret_cast(_raw->raw().data()), _raw->raw().size());
    _owner->account_memory_delta(malloc_usable_size(_dict.get()));
}

// If the owner still exists: gives back the memory accounted for by the
// constructor and removes this dict from the owner's index.
lz4_cdict::~lz4_cdict() {
    if (_owner) {
        _owner->account_memory_delta(-malloc_usable_size(_dict.get()));
        _owner->forget_lz4_cdict(_raw->id());
    }
}

// Creates a self-contained compressor factory for tests.
//
// The returned object owns a sharded default_sstable_compressor_factory.
// It starts it in its constructor and stops it in its destructor, each time
// waiting with `.get()`. Hence the assertion that the caller runs inside a
// seastar thread, which also applies when the returned object is destroyed.
// All interface calls are forwarded to the local shard's instance.
std::unique_ptr make_sstable_compressor_factory_for_tests_in_thread() {
    SCYLLA_ASSERT(thread::running_in_thread());
    struct wrapper : sstable_compressor_factory {
        using impl = default_sstable_compressor_factory;
        sharded _impl;
        future make_compressor_for_writing(schema_ptr s) override {
            return _impl.local().make_compressor_for_writing(s);
        }
        future make_compressor_for_reading(sstables::compression& c) override {
            return _impl.local().make_compressor_for_reading(c);
        }
        future<> set_recommended_dict(table_id t, std::span d) override {
            return _impl.local().set_recommended_dict(t, d);
        };
        // Not movable.
        wrapper(wrapper&&) = delete;
        wrapper() {
            _impl.start().get();
        }
        ~wrapper() {
            _impl.stop().get();
        }
    };
    return std::make_unique();
}