mirror of
https://github.com/scylladb/scylladb.git
synced 2026-04-20 00:20:47 +00:00
sstables: add LargeDataRecords metadata type (tag 13)

Add a new scylla metadata component LargeDataRecords (tag 13) that stores
per-SSTable top-N large data records. Each record carries:

- large_data_type (partition_size, row_size, cell_size, etc.)
- binary serialized partition key and clustering key
- column name (for cell records)
- value (size in bytes)
- element count (rows or collection elements, type-dependent)
- range tombstones and dead rows (partition records only)

The struct uses disk_string<uint32_t> for key/name fields and is serialized
via the existing describe_type framework into the SSTable Scylla metadata
component. Add JSON support in scylla-sstable and format documentation.
This commit is contained in:
@@ -33,6 +33,7 @@ in individual sections
|
||||
| ext_timestamp_stats
|
||||
| schema
|
||||
| components_digests
|
||||
| large_data_records
|
||||
|
||||
`sharding_metadata` (tag 1): describes what token sub-ranges are included in this
|
||||
sstable. This is used, when loading the sstable, to determine which shard(s)
|
||||
@@ -80,6 +81,11 @@ all SSTable component files that are checksummed during write. Each entry maps a
|
||||
type (e.g., Data, Index, Filter, Statistics, etc.) to its CRC32 checksum. This allows
|
||||
verifying the integrity of individual component files.
|
||||
|
||||
`large_data_records` (tag 13): an `array<large_data_record>` with the top-N individual large
|
||||
data entries (partitions, rows, cells) found during the sstable write. Unlike `large_data_stats`
|
||||
which only stores aggregate statistics, this records the actual keys and sizes so they survive
|
||||
tablet/shard migration.
|
||||
|
||||
The [scylla sstable dump-scylla-metadata](https://github.com/scylladb/scylladb/blob/master/docs/operating-scylla/admin-tools/scylla-sstable.rst#dump-scylla-metadata) tool
|
||||
can be used to dump the scylla metadata in JSON format.
|
||||
|
||||
@@ -203,3 +209,35 @@ in the statistics component, which lacks column names and other metadata. Unlike
|
||||
the full schema stored in the system schema tables, it is not intended to be
|
||||
comprehensive, but it contains enough information for tools like scylla-sstable
|
||||
to parse an sstable in a self-sufficient manner.
|
||||
|
||||
## large_data_records subcomponent
|
||||
|
||||
large_data_records = record_count large_data_record*
|
||||
record_count = be32
|
||||
large_data_record = large_data_type partition_key clustering_key column_name value elements_count range_tombstones dead_rows
|
||||
large_data_type = be32 // same enum as in large_data_stats
|
||||
partition_key = string32 // binary serialized partition key (sstables::key::get_bytes())
|
||||
clustering_key = string32 // binary serialized clustering key (clustering_key_prefix::representation()), empty if N/A
|
||||
column_name = string32 // column name as text, empty for partition/row entries
|
||||
value = be64 // size in bytes (partition, row, or cell size depending on type)
|
||||
elements_count = be64 // type-dependent element count (see below)
|
||||
range_tombstones = be64 // number of range tombstones (partition_size records only, 0 otherwise)
|
||||
dead_rows = be64 // number of dead rows (partition_size records only, 0 otherwise)
|
||||
string32 = string32_size byte*
|
||||
string32_size = be32
|
||||
|
||||
The large_data_records component holds individual top-N large data entries
|
||||
(partitions, rows, cells) found during the sstable write. Unlike large_data_stats,
|
||||
which only stores aggregate per-type statistics (max value, threshold, count above
|
||||
threshold), large_data_records preserves the actual partition key, clustering key,
|
||||
column name, and size for each above-threshold entry. This information is embedded
|
||||
in the sstable file itself and therefore survives tablet/shard migration.
|
||||
|
||||
The elements_count field carries a type-dependent element count:
|
||||
|
||||
- For partition_size and rows_in_partition records: number of rows in the partition
|
||||
- For cell_size and elements_in_collection records: number of elements in the collection (0 for non-collection cells)
|
||||
- For row_size records: 0
|
||||
|
||||
The range_tombstones and dead_rows fields are meaningful only for
|
||||
partition_size records and are zero for all other record types.
|
||||
|
||||
@@ -524,6 +524,7 @@ The content is dumped in JSON, using the following schema:
|
||||
"scylla_version": String
|
||||
"ext_timestamp_stats": {"$key": int64, ...}
|
||||
"sstable_identifier": String, // UUID
|
||||
"large_data_records": [$LARGE_DATA_RECORD, ...]
|
||||
}
|
||||
|
||||
$SHARDING_METADATA := {
|
||||
@@ -548,6 +549,17 @@ The content is dumped in JSON, using the following schema:
|
||||
"above_threshold": Uint
|
||||
}
|
||||
|
||||
$LARGE_DATA_RECORD := {
|
||||
"type": String, // large_data_type name
|
||||
"partition_key": String, // human-readable partition key (decoded from binary)
|
||||
"clustering_key": String, // human-readable clustering key (decoded from binary), empty if N/A
|
||||
"column_name": String, // column name, empty for partition/row entries
|
||||
"value": Uint64, // size in bytes (partition, row, or cell size depending on type)
|
||||
"elements_count": Uint64, // rows (partition_size, rows_in_partition) or collection elements (cell_size, elements_in_collection), 0 for row_size
|
||||
"range_tombstones": Uint64, // range tombstones (partition_size records only, 0 otherwise)
|
||||
"dead_rows": Uint64 // dead rows (partition_size records only, 0 otherwise)
|
||||
}
|
||||
|
||||
dump-schema
|
||||
^^^^^^^^^^^
|
||||
|
||||
|
||||
@@ -1664,6 +1664,10 @@ future<> sstable::open_data(sstable_open_config cfg) noexcept {
|
||||
if (ld_stats) {
|
||||
_large_data_stats.emplace(*ld_stats);
|
||||
}
|
||||
auto* ld_records = _components->scylla_metadata->data.get<scylla_metadata_type::LargeDataRecords, scylla_metadata::large_data_records>();
|
||||
if (ld_records) {
|
||||
_large_data_records.emplace(*ld_records);
|
||||
}
|
||||
auto* origin = _components->scylla_metadata->data.get<scylla_metadata_type::SSTableOrigin, scylla_metadata::sstable_origin>();
|
||||
if (origin) {
|
||||
_origin = sstring(to_string_view(bytes_view(origin->value)));
|
||||
@@ -2295,7 +2299,8 @@ static sstable_column_kind to_sstable_column_kind(column_kind k) {
|
||||
|
||||
void
|
||||
sstable::write_scylla_metadata(shard_id shard, struct run_identifier identifier,
|
||||
std::optional<scylla_metadata::large_data_stats> ld_stats, std::optional<scylla_metadata::ext_timestamp_stats> ts_stats) {
|
||||
std::optional<scylla_metadata::large_data_stats> ld_stats, std::optional<scylla_metadata::ext_timestamp_stats> ts_stats,
|
||||
std::optional<scylla_metadata::large_data_records> ld_records) {
|
||||
auto&& first_key = get_first_decorated_key();
|
||||
auto&& last_key = get_last_decorated_key();
|
||||
|
||||
@@ -2318,6 +2323,9 @@ sstable::write_scylla_metadata(shard_id shard, struct run_identifier identifier,
|
||||
if (ld_stats) {
|
||||
_components->scylla_metadata->data.set<scylla_metadata_type::LargeDataStats>(std::move(*ld_stats));
|
||||
}
|
||||
if (ld_records) {
|
||||
_components->scylla_metadata->data.set<scylla_metadata_type::LargeDataRecords>(std::move(*ld_records));
|
||||
}
|
||||
if (!_origin.empty()) {
|
||||
scylla_metadata::sstable_origin o;
|
||||
o.value = bytes(to_bytes_view(std::string_view(_origin)));
|
||||
|
||||
@@ -628,6 +628,7 @@ private:
|
||||
// It can be disengaged normally when loading legacy sstables that do not have this
|
||||
// information in their scylla metadata.
|
||||
std::optional<scylla_metadata::large_data_stats> _large_data_stats;
|
||||
std::optional<scylla_metadata::large_data_records> _large_data_records;
|
||||
sstring _origin;
|
||||
std::optional<scylla_metadata::ext_timestamp_stats> _ext_timestamp_stats;
|
||||
optimized_optional<sstable_id> _sstable_identifier;
|
||||
@@ -708,7 +709,8 @@ private:
|
||||
void write_scylla_metadata(shard_id shard,
|
||||
run_identifier identifier,
|
||||
std::optional<scylla_metadata::large_data_stats> ld_stats,
|
||||
std::optional<scylla_metadata::ext_timestamp_stats> ts_stats);
|
||||
std::optional<scylla_metadata::ext_timestamp_stats> ts_stats,
|
||||
std::optional<scylla_metadata::large_data_records> ld_records = std::nullopt);
|
||||
|
||||
future<> read_filter(sstable_open_config cfg = {});
|
||||
|
||||
@@ -1092,6 +1094,12 @@ public:
|
||||
// the map. Otherwise, return a disengaged optional.
|
||||
std::optional<large_data_stats_entry> get_large_data_stat(large_data_type t) const noexcept;
|
||||
|
||||
// Return the large_data_records stored in scylla_metadata, if present.
|
||||
// Absent on legacy SSTables that predate LargeDataRecords support.
|
||||
const std::optional<scylla_metadata::large_data_records>& get_large_data_records() const noexcept {
|
||||
return _large_data_records;
|
||||
}
|
||||
|
||||
// Return the extended timestamp statistics map.
|
||||
// Some or all entries may be missing if not present in scylla_metadata
|
||||
scylla_metadata::ext_timestamp_stats::map_type get_ext_timestamp_stats() const noexcept;
|
||||
|
||||
@@ -548,6 +548,7 @@ enum class scylla_metadata_type : uint32_t {
|
||||
SSTableIdentifier = 10,
|
||||
Schema = 11,
|
||||
ComponentsDigests = 12,
|
||||
LargeDataRecords = 13,
|
||||
};
|
||||
|
||||
// UUID is used for uniqueness across nodes, such that an imported sstable
|
||||
@@ -595,6 +596,31 @@ struct large_data_stats_entry {
|
||||
auto describe_type(sstable_version_types v, Describer f) { return f(max_value, threshold, above_threshold); }
|
||||
};
|
||||
|
||||
// A single top-N large data record stored in the SSTable's scylla metadata.
|
||||
// Records are written by the sstable writer and survive tablet/shard migration
|
||||
// because they live in the SSTable file itself rather than in a CQL system table.
|
||||
struct large_data_record {
|
||||
large_data_type type;
|
||||
disk_string<uint32_t> partition_key; // binary serialized partition key (sstables::key::get_bytes())
|
||||
disk_string<uint32_t> clustering_key; // binary serialized CK (clustering_key_prefix::representation()), empty if N/A
|
||||
disk_string<uint32_t> column_name; // column name as text, empty for partition/row entries
|
||||
uint64_t value; // size in bytes (partition, row, or cell size depending on type)
|
||||
// Type-dependent element count:
|
||||
// partition_size, rows_in_partition: number of rows in the partition
|
||||
// cell_size, elements_in_collection: number of elements in the collection (0 for non-collection cells)
|
||||
// row_size: 0
|
||||
uint64_t elements_count;
|
||||
// Partition-level auxiliary fields (meaningful only for partition_size records, 0 otherwise):
|
||||
uint64_t range_tombstones; // number of range tombstones in the partition
|
||||
uint64_t dead_rows; // number of dead rows in the partition
|
||||
|
||||
template <typename Describer>
|
||||
auto describe_type(sstable_version_types v, Describer f) {
|
||||
return f(type, partition_key, clustering_key, column_name, value,
|
||||
elements_count, range_tombstones, dead_rows);
|
||||
}
|
||||
};
|
||||
|
||||
// Types of extended timestamp statistics.
|
||||
//
|
||||
// Note: For extensibility, never reuse an identifier,
|
||||
@@ -639,6 +665,7 @@ struct sstable_schema_type {
|
||||
struct scylla_metadata {
|
||||
using extension_attributes = disk_hash<uint32_t, disk_string<uint32_t>, disk_string<uint32_t>>;
|
||||
using large_data_stats = disk_hash<uint32_t, large_data_type, large_data_stats_entry>;
|
||||
using large_data_records = disk_array<uint32_t, large_data_record>;
|
||||
using sstable_origin = disk_string<uint32_t>;
|
||||
using scylla_build_id = disk_string<uint32_t>;
|
||||
using scylla_version = disk_string<uint32_t>;
|
||||
@@ -659,7 +686,8 @@ struct scylla_metadata {
|
||||
disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::ExtTimestampStats, ext_timestamp_stats>,
|
||||
disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::SSTableIdentifier, sstable_identifier>,
|
||||
disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::Schema, sstable_schema>,
|
||||
disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::ComponentsDigests, components_digests>
|
||||
disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::ComponentsDigests, components_digests>,
|
||||
disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::LargeDataRecords, large_data_records>
|
||||
> data;
|
||||
std::optional<uint32_t> digest;
|
||||
|
||||
|
||||
@@ -30,6 +30,7 @@
|
||||
#include "db/corrupt_data_handler.hh"
|
||||
#include "db/object_storage_endpoint_param.hh"
|
||||
#include "gms/feature_service.hh"
|
||||
#include "keys/keys.hh"
|
||||
#include "reader_concurrency_semaphore.hh"
|
||||
#include "readers/combined.hh"
|
||||
#include "readers/filtering.hh"
|
||||
@@ -1338,6 +1339,7 @@ const char* to_string(sstables::scylla_metadata_type t) {
|
||||
case sstables::scylla_metadata_type::SSTableIdentifier: return "sstable_identifier";
|
||||
case sstables::scylla_metadata_type::Schema: return "schema";
|
||||
case sstables::scylla_metadata_type::ComponentsDigests: return "components_digests";
|
||||
case sstables::scylla_metadata_type::LargeDataRecords: return "large_data_records";
|
||||
}
|
||||
std::abort();
|
||||
}
|
||||
@@ -1352,12 +1354,13 @@ const char* to_string(sstables::ext_timestamp_stats_type t) {
|
||||
|
||||
class scylla_metadata_visitor {
|
||||
json_writer& _writer;
|
||||
schema_ptr _schema;
|
||||
|
||||
dht::token as_token(const sstables::disk_string<uint16_t>& ds) const {
|
||||
return dht::token(dht::token::kind::key, bytes_view(ds));
|
||||
}
|
||||
public:
|
||||
scylla_metadata_visitor(json_writer& writer) : _writer(writer) { }
|
||||
scylla_metadata_visitor(json_writer& writer, schema_ptr schema = nullptr) : _writer(writer), _schema(std::move(schema)) { }
|
||||
|
||||
void operator()(const sstables::sharding_metadata& val) const {
|
||||
_writer.StartArray();
|
||||
@@ -1441,6 +1444,32 @@ public:
|
||||
}
|
||||
_writer.EndObject();
|
||||
}
|
||||
// Dump one large_data_record as a JSON object, decoding the binary keys
// into human-readable form via the table schema.
// NOTE(review): dereferences _schema unconditionally; the visitor can also be
// constructed without a schema (defaults to nullptr) — confirm all callers that
// can encounter LargeDataRecords pass a schema.
void operator()(const sstables::large_data_record& val) const {
    _writer.StartObject();

    _writer.Key("type");
    _writer.String(fmt::format("{}", val.type));

    // Decode the binary-serialized partition key for display.
    _writer.Key("partition_key");
    auto pk = sstables::key_view(val.partition_key.value).to_partition_key(*_schema);
    _writer.String(key_to_str(pk, *_schema));

    // The clustering key is empty for partition-level records.
    _writer.Key("clustering_key");
    if (val.clustering_key.value.empty()) {
        _writer.String("");
    } else {
        auto ck = clustering_key_prefix::from_bytes(val.clustering_key.value);
        _writer.String(key_to_str(ck, *_schema));
    }

    // Column name is stored as plain text; empty for partition/row entries.
    _writer.Key("column_name");
    _writer.String(disk_string_to_string(val.column_name));

    _writer.Key("value");
    _writer.Uint64(val.value);
    _writer.Key("elements_count");
    _writer.Uint64(val.elements_count);
    _writer.Key("range_tombstones");
    _writer.Uint64(val.range_tombstones);
    _writer.Key("dead_rows");
    _writer.Uint64(val.dead_rows);

    _writer.EndObject();
}
|
||||
void operator()(const sstables::scylla_metadata::ext_timestamp_stats& val) const {
|
||||
_writer.StartObject();
|
||||
for (const auto& [k, v] : val.map) {
|
||||
@@ -1527,7 +1556,7 @@ void dump_scylla_metadata_operation(schema_ptr schema, reader_permit permit, con
|
||||
continue;
|
||||
}
|
||||
for (const auto& [k, v] : m->data.data) {
|
||||
std::visit(scylla_metadata_visitor(writer), v);
|
||||
std::visit(scylla_metadata_visitor(writer, schema), v);
|
||||
}
|
||||
if (m->digest.has_value()) {
|
||||
writer.Key("digest");
|
||||
|
||||
Reference in New Issue
Block a user