diff --git a/docs/dev/sstable-scylla-format.md b/docs/dev/sstable-scylla-format.md index d3bc598218..1b20c324a0 100644 --- a/docs/dev/sstable-scylla-format.md +++ b/docs/dev/sstable-scylla-format.md @@ -33,6 +33,7 @@ in individual sections | ext_timestamp_stats | schema | components_digests + | large_data_records `sharding_metadata` (tag 1): describes what token sub-ranges are included in this sstable. This is used, when loading the sstable, to determine which shard(s) @@ -80,6 +81,11 @@ all SSTable component files that are checksummed during write. Each entry maps a type (e.g., Data, Index, Filter, Statistics, etc.) to its CRC32 checksum. This allows verifying the integrity of individual component files. +`large_data_records` (tag 13): an `array` with the top-N individual large +data entries (partitions, rows, cells) found during the sstable write. Unlike `large_data_stats` +which only stores aggregate statistics, this records the actual keys and sizes so they survive +tablet/shard migration. + The [scylla sstable dump-scylla-metadata](https://github.com/scylladb/scylladb/blob/master/docs/operating-scylla/admin-tools/scylla-sstable.rst#dump-scylla-metadata) tool can be used to dump the scylla metadata in JSON format. @@ -203,3 +209,35 @@ in the statistics component, which lacks column names and other metadata. Unlike the full schema stored in the system schema tables, it is not intended to be comprehensive, but it contains enough information for tools like scylla-sstable to parse an sstable in a self-sufficient manner. 
+
+## large_data_records subcomponent
+
+    large_data_records = record_count large_data_record*
+    record_count = be32
+    large_data_record = large_data_type partition_key clustering_key column_name value elements_count range_tombstones dead_rows
+    large_data_type = be32    // same enum as in large_data_stats
+    partition_key = string32  // binary serialized partition key (sstables::key::get_bytes())
+    clustering_key = string32 // binary serialized clustering key (clustering_key_prefix::representation()), empty if N/A
+    column_name = string32    // column name as text, empty for partition/row entries
+    value = be64              // size in bytes (partition, row, or cell size depending on type)
+    elements_count = be64     // type-dependent element count (see below)
+    range_tombstones = be64   // number of range tombstones (partition_size records only, 0 otherwise)
+    dead_rows = be64          // number of dead rows (partition_size records only, 0 otherwise)
+    string32 = string32_size byte*
+    string32_size = be32
+
+The large_data_records component holds individual top-N large data entries
+(partitions, rows, cells) found during the sstable write. Unlike large_data_stats,
+which only stores aggregate per-type statistics (max value, threshold, count above
+threshold), large_data_records preserves the actual partition key, clustering key,
+column name, and size for each above-threshold entry. This information is embedded
+in the sstable file itself and therefore survives tablet/shard migration.
+
+The elements_count field carries a type-dependent element count:
+
+- For partition_size and rows_in_partition records: number of rows in the partition
+- For cell_size and elements_in_collection records: number of elements in the collection (0 for non-collection cells)
+- For row_size records: 0
+
+The range_tombstones and dead_rows fields are meaningful only for
+partition_size records and are zero for all other record types.
diff --git a/docs/operating-scylla/admin-tools/scylla-sstable.rst b/docs/operating-scylla/admin-tools/scylla-sstable.rst index 173fdeb3a3..57c1921bb2 100644 --- a/docs/operating-scylla/admin-tools/scylla-sstable.rst +++ b/docs/operating-scylla/admin-tools/scylla-sstable.rst @@ -524,6 +524,7 @@ The content is dumped in JSON, using the following schema: "scylla_version": String "ext_timestamp_stats": {"$key": int64, ...} "sstable_identifier": String, // UUID + "large_data_records": [$LARGE_DATA_RECORD, ...] } $SHARDING_METADATA := { @@ -548,6 +549,17 @@ The content is dumped in JSON, using the following schema: "above_threshold": Uint } + $LARGE_DATA_RECORD := { + "type": String, // large_data_type name + "partition_key": String, // human-readable partition key (decoded from binary) + "clustering_key": String, // human-readable clustering key (decoded from binary), empty if N/A + "column_name": String, // column name, empty for partition/row entries + "value": Uint64, // size in bytes (partition, row, or cell size depending on type) + "elements_count": Uint64, // rows (partition_size, rows_in_partition) or collection elements (cell_size, elements_in_collection), 0 for row_size + "range_tombstones": Uint64, // range tombstones (partition_size records only, 0 otherwise) + "dead_rows": Uint64 // dead rows (partition_size records only, 0 otherwise) + } + dump-schema ^^^^^^^^^^^ diff --git a/sstables/sstables.cc b/sstables/sstables.cc index f26420382b..b8553a540e 100644 --- a/sstables/sstables.cc +++ b/sstables/sstables.cc @@ -1664,6 +1664,10 @@ future<> sstable::open_data(sstable_open_config cfg) noexcept { if (ld_stats) { _large_data_stats.emplace(*ld_stats); } + auto* ld_records = _components->scylla_metadata->data.get(); + if (ld_records) { + _large_data_records.emplace(*ld_records); + } auto* origin = _components->scylla_metadata->data.get(); if (origin) { _origin = sstring(to_string_view(bytes_view(origin->value))); @@ -2295,7 +2299,8 @@ static sstable_column_kind 
to_sstable_column_kind(column_kind k) { void sstable::write_scylla_metadata(shard_id shard, struct run_identifier identifier, - std::optional ld_stats, std::optional ts_stats) { + std::optional ld_stats, std::optional ts_stats, + std::optional ld_records) { auto&& first_key = get_first_decorated_key(); auto&& last_key = get_last_decorated_key(); @@ -2318,6 +2323,9 @@ sstable::write_scylla_metadata(shard_id shard, struct run_identifier identifier, if (ld_stats) { _components->scylla_metadata->data.set(std::move(*ld_stats)); } + if (ld_records) { + _components->scylla_metadata->data.set(std::move(*ld_records)); + } if (!_origin.empty()) { scylla_metadata::sstable_origin o; o.value = bytes(to_bytes_view(std::string_view(_origin))); diff --git a/sstables/sstables.hh b/sstables/sstables.hh index fe499ca7dd..17a5551374 100644 --- a/sstables/sstables.hh +++ b/sstables/sstables.hh @@ -628,6 +628,7 @@ private: // It can be disengaged normally when loading legacy sstables that do not have this // information in their scylla metadata. std::optional _large_data_stats; + std::optional _large_data_records; sstring _origin; std::optional _ext_timestamp_stats; optimized_optional _sstable_identifier; @@ -708,7 +709,8 @@ private: void write_scylla_metadata(shard_id shard, run_identifier identifier, std::optional ld_stats, - std::optional ts_stats); + std::optional ts_stats, + std::optional ld_records = std::nullopt); future<> read_filter(sstable_open_config cfg = {}); @@ -1092,6 +1094,12 @@ public: // the map. Otherwise, return a disengaged optional. std::optional get_large_data_stat(large_data_type t) const noexcept; + // Return the large_data_records stored in scylla_metadata, if present. + // Absent on legacy SSTables that predate LargeDataRecords support. + const std::optional& get_large_data_records() const noexcept { + return _large_data_records; + } + // Return the extended timestamp statistics map. 
// Some or all entries may be missing if not present in scylla_metadata scylla_metadata::ext_timestamp_stats::map_type get_ext_timestamp_stats() const noexcept; diff --git a/sstables/types.hh b/sstables/types.hh index 68140a7dec..fad5d32779 100644 --- a/sstables/types.hh +++ b/sstables/types.hh @@ -548,6 +548,7 @@ enum class scylla_metadata_type : uint32_t { SSTableIdentifier = 10, Schema = 11, ComponentsDigests = 12, + LargeDataRecords = 13, }; // UUID is used for uniqueness across nodes, such that an imported sstable @@ -595,6 +596,31 @@ struct large_data_stats_entry { auto describe_type(sstable_version_types v, Describer f) { return f(max_value, threshold, above_threshold); } }; +// A single top-N large data record stored in the SSTable's scylla metadata. +// Records are written by the sstable writer and survive tablet/shard migration +// because they live in the SSTable file itself rather than in a CQL system table. +struct large_data_record { + large_data_type type; + disk_string partition_key; // binary serialized partition key (sstables::key::get_bytes()) + disk_string clustering_key; // binary serialized CK (clustering_key_prefix::representation()), empty if N/A + disk_string column_name; // column name as text, empty for partition/row entries + uint64_t value; // size in bytes (partition, row, or cell size depending on type) + // Type-dependent element count: + // partition_size, rows_in_partition: number of rows in the partition + // cell_size, elements_in_collection: number of elements in the collection (0 for non-collection cells) + // row_size: 0 + uint64_t elements_count; + // Partition-level auxiliary fields (meaningful only for partition_size records, 0 otherwise): + uint64_t range_tombstones; // number of range tombstones in the partition + uint64_t dead_rows; // number of dead rows in the partition + + template + auto describe_type(sstable_version_types v, Describer f) { + return f(type, partition_key, clustering_key, column_name, value, + 
elements_count, range_tombstones, dead_rows); + } +}; + // Types of extended timestamp statistics. // // Note: For extensibility, never reuse an identifier, @@ -639,6 +665,7 @@ struct sstable_schema_type { struct scylla_metadata { using extension_attributes = disk_hash, disk_string>; using large_data_stats = disk_hash; + using large_data_records = disk_array; using sstable_origin = disk_string; using scylla_build_id = disk_string; using scylla_version = disk_string; @@ -659,7 +686,8 @@ struct scylla_metadata { disk_tagged_union_member, disk_tagged_union_member, disk_tagged_union_member, - disk_tagged_union_member + disk_tagged_union_member, + disk_tagged_union_member > data; std::optional digest; diff --git a/tools/scylla-sstable.cc b/tools/scylla-sstable.cc index 3b3c306a41..1674dd81ab 100644 --- a/tools/scylla-sstable.cc +++ b/tools/scylla-sstable.cc @@ -30,6 +30,7 @@ #include "db/corrupt_data_handler.hh" #include "db/object_storage_endpoint_param.hh" #include "gms/feature_service.hh" +#include "keys/keys.hh" #include "reader_concurrency_semaphore.hh" #include "readers/combined.hh" #include "readers/filtering.hh" @@ -1338,6 +1339,7 @@ const char* to_string(sstables::scylla_metadata_type t) { case sstables::scylla_metadata_type::SSTableIdentifier: return "sstable_identifier"; case sstables::scylla_metadata_type::Schema: return "schema"; case sstables::scylla_metadata_type::ComponentsDigests: return "components_digests"; + case sstables::scylla_metadata_type::LargeDataRecords: return "large_data_records"; } std::abort(); } @@ -1352,12 +1354,13 @@ const char* to_string(sstables::ext_timestamp_stats_type t) { class scylla_metadata_visitor { json_writer& _writer; + schema_ptr _schema; dht::token as_token(const sstables::disk_string& ds) const { return dht::token(dht::token::kind::key, bytes_view(ds)); } public: - scylla_metadata_visitor(json_writer& writer) : _writer(writer) { } + scylla_metadata_visitor(json_writer& writer, schema_ptr schema = nullptr) : 
_writer(writer), _schema(std::move(schema)) { } void operator()(const sstables::sharding_metadata& val) const { _writer.StartArray(); @@ -1441,6 +1444,32 @@ public: } _writer.EndObject(); } + void operator()(const sstables::large_data_record& val) const { + _writer.StartObject(); + _writer.Key("type"); + _writer.String(fmt::format("{}", val.type)); + _writer.Key("partition_key"); + auto pk = sstables::key_view(val.partition_key.value).to_partition_key(*_schema); + _writer.String(key_to_str(pk, *_schema)); + _writer.Key("clustering_key"); + if (!val.clustering_key.value.empty()) { + auto ck = clustering_key_prefix::from_bytes(val.clustering_key.value); + _writer.String(key_to_str(ck, *_schema)); + } else { + _writer.String(""); + } + _writer.Key("column_name"); + _writer.String(disk_string_to_string(val.column_name)); + _writer.Key("value"); + _writer.Uint64(val.value); + _writer.Key("elements_count"); + _writer.Uint64(val.elements_count); + _writer.Key("range_tombstones"); + _writer.Uint64(val.range_tombstones); + _writer.Key("dead_rows"); + _writer.Uint64(val.dead_rows); + _writer.EndObject(); + } void operator()(const sstables::scylla_metadata::ext_timestamp_stats& val) const { _writer.StartObject(); for (const auto& [k, v] : val.map) { @@ -1527,7 +1556,7 @@ void dump_scylla_metadata_operation(schema_ptr schema, reader_permit permit, con continue; } for (const auto& [k, v] : m->data.data) { - std::visit(scylla_metadata_visitor(writer), v); + std::visit(scylla_metadata_visitor(writer, schema), v); } if (m->digest.has_value()) { writer.Key("digest");