Files
scylladb/sstables/metadata_collector.hh
Avi Kivity 0ae22a09d4 LICENSE: Update to version 1.1
Updated terms of non-commercial use (must be a never-customer).
2026-04-12 19:46:33 +03:00

272 lines
10 KiB
C++

/*
* Copyright (C) 2015-present ScyllaDB
*
* Modified by ScyllaDB
*/
/*
* SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.1 and Apache-2.0)
*/
#pragma once
#include "sstables/types.hh"
#include "sstables/component_type.hh"
#include "mutation/timestamp.hh"
#include "utils/extremum_tracking.hh"
#include "utils/murmur_hash.hh"
#include "hyperloglog.hh"
#include "db/commitlog/replay_position.hh"
#include "mutation/position_in_partition.hh"
#include "locator/host_id.hh"
namespace sstables {
// Constructor argument for the utils::streaming_histogram instances that record
// tombstone drop times (column_stats::tombstone_histogram and
// metadata_collector::_estimated_tombstone_drop_time).
// NOTE(review): presumably the maximum number of bins the histogram keeps —
// confirm against utils::streaming_histogram.
static constexpr int TOMBSTONE_HISTOGRAM_BIN_SIZE = 100;
/**
 * column_stats holds the statistics gathered for a single partition of an sstable.
 * A default-constructed object (and an object after reset()) has all counters at
 * zero and all trackers/histograms in their initial state.
 */
struct column_stats {
    /** number of atomic cells in the partition (each cell of a collection counts separately) */
    uint64_t cells_count = 0;
    /** number of columns in the partition */
    uint64_t column_count = 0;
    /** number of rows in the partition, range tombstone markers included */
    uint64_t rows_count = 0;
    /** number of range tombstones in the partition */
    uint64_t range_tombstones_count = 0;
    /** number of dead rows in the partition */
    uint64_t dead_rows_count = 0;
    // NOTE(review): not documented in this header; presumably the offset at which
    // the partition starts in the data file — confirm against the writer.
    uint64_t start_offset = 0;
    /** size of the partition; fed into the sstable-wide partition size histogram */
    uint64_t partition_size = 0;

    /** smallest and largest (client-supplied) timestamp seen in the partition */
    min_max_tracker<api::timestamp_type> timestamp_tracker;
    /** smallest (client-supplied) timestamp of live data in the partition,
     ** used for tombstone garbage collection **/
    min_tracker<api::timestamp_type> min_live_timestamp_tracker{api::max_timestamp};
    /** smallest (client-supplied) timestamp of live row markers, i.e. of data that would
     ** shadow shadowable tombstones in the partition; used for garbage collection of
     ** shadowable tombstones **/
    min_tracker<api::timestamp_type> min_live_row_marker_timestamp_tracker{api::max_timestamp};
    /** smallest and largest local deletion time seen in the partition */
    min_max_tracker<int32_t> local_deletion_time_tracker;
    /** smallest and largest TTL seen in the partition */
    min_max_tracker<int32_t> ttl_tracker;
    /** histogram of tombstone drop time */
    utils::streaming_histogram tombstone_histogram{TOMBSTONE_HISTOGRAM_BIN_SIZE};
    bool has_legacy_counter_shards = false;
    /** set once any local deletion time had to be capped while being adjusted */
    bool capped_local_deletion_time = false;

    column_stats() = default;

    /** Discards everything collected so far by assigning a fresh object. */
    void reset() {
        *this = column_stats{};
    }

    /**
     * Records a timestamp. Every value goes into timestamp_tracker; values
     * flagged as live additionally go into min_live_timestamp_tracker.
     */
    void update_timestamp(api::timestamp_type value, is_live is_live) {
        timestamp_tracker.update(value);
        if (!is_live) {
            return;
        }
        min_live_timestamp_tracker.update(value);
    }

    /** Records the timestamp of a live row marker. */
    void update_live_row_marker_timestamp(api::timestamp_type value) {
        min_live_row_marker_timestamp_tracker.update(value);
    }

    /** Records a local deletion time without touching the tombstone histogram. */
    void update_local_deletion_time(int32_t value) {
        local_deletion_time_tracker.update(value);
    }

    /**
     * Adjusts the given deletion time to an int32_t, then records it in both the
     * local deletion time tracker and the tombstone histogram. Remembers whether
     * the adjustment reported capping.
     */
    void update_local_deletion_time_and_tombstone_histogram(gc_clock::time_point value) {
        // Out-parameter of adjusted_local_deletion_time();
        // NOTE(review): presumably always assigned by the callee — confirm.
        bool was_capped;
        const int32_t adjusted = adjusted_local_deletion_time(value, was_capped);
        local_deletion_time_tracker.update(adjusted);
        tombstone_histogram.update(adjusted);
        capped_local_deletion_time = capped_local_deletion_time | was_capped;
    }

    /** Records a TTL given as a raw 32-bit value. */
    void update_ttl(int32_t value) {
        ttl_tracker.update(value);
    }

    /** Records a TTL given as a gc_clock duration, converted via gc_clock::as_int32(). */
    void update_ttl(gc_clock::duration value) {
        const int32_t as_seconds = gc_clock::as_int32(value);
        update_ttl(as_seconds);
    }

    /** Unconditionally records the tombstone's timestamp (as non-live) and its deletion time. */
    void do_update(const tombstone& t) {
        update_timestamp(t.timestamp, is_live::no);
        update_local_deletion_time_and_tombstone_histogram(t.deletion_time);
    }

    /** Records the tombstone only if it evaluates to true; otherwise does nothing. */
    void update(const tombstone& t) {
        if (!t) {
            return;
        }
        do_update(t);
    }
};
class metadata_collector {
public:
static constexpr double NO_COMPRESSION_RATIO = -1.0;
static hll::HyperLogLog hyperloglog(int p, int sp) {
// FIXME: hll::HyperLogLog doesn't support sparse format, so ignoring parameters by the time being.
return hll::HyperLogLog();
}
private:
const schema& _schema;
component_name _name;
locator::host_id _host_id;
// EH of 150 can track a max value of 1697806495183, i.e., > 1.5PB
utils::estimated_histogram _estimated_partition_size{150};
// EH of 114 can track a max value of 2395318855, i.e., > 2B cells
utils::estimated_histogram _estimated_cells_count{114};
db::replay_position _replay_position;
min_max_tracker<api::timestamp_type> _timestamp_tracker;
min_tracker<api::timestamp_type> _min_live_timestamp_tracker;
min_tracker<api::timestamp_type> _min_live_row_marker_timestamp_tracker;
min_max_tracker<int32_t> _local_deletion_time_tracker{std::numeric_limits<int32_t>::max(), std::numeric_limits<int32_t>::max()};
min_max_tracker<int32_t> _ttl_tracker{0, 0};
double _compression_ratio = NO_COMPRESSION_RATIO;
utils::streaming_histogram _estimated_tombstone_drop_time{TOMBSTONE_HISTOGRAM_BIN_SIZE};
int _sstable_level = 0;
int64_t _repaired_at = 0;
std::optional<position_in_partition> _min_clustering_pos;
std::optional<position_in_partition> _max_clustering_pos;
bool _has_legacy_counter_shards = false;
uint64_t _columns_count = 0;
uint64_t _rows_count = 0;
/**
* Default cardinality estimation method is to use HyperLogLog++.
* Parameter here(p=13, sp=25) should give reasonable estimation
* while lowering bytes required to hold information.
* See CASSANDRA-5906 for detail.
*/
hll::HyperLogLog _cardinality = hyperloglog(13, 25);
private:
void convert(disk_array<uint32_t, disk_string<uint16_t>>&to, const std::optional<position_in_partition>& from);
public:
explicit metadata_collector(const schema& schema, component_name name, const locator::host_id& host_id)
: _schema(schema)
, _name(name)
, _host_id(host_id)
, _min_live_timestamp_tracker(api::max_timestamp)
, _min_live_row_marker_timestamp_tracker(api::max_timestamp)
{
if (!schema.clustering_key_size()) {
_min_clustering_pos.emplace(position_in_partition_view::before_all_clustered_rows());
_max_clustering_pos.emplace(position_in_partition_view::after_all_clustered_rows());
}
}
const schema& get_schema() {
return _schema;
}
void add_key(bytes_view key) {
long hashed = utils::murmur_hash::hash2_64(key, 0);
_cardinality.offer_hashed(hashed);
}
void add_partition_size(uint64_t partition_size) {
_estimated_partition_size.add(partition_size);
}
void add_cells_count(uint64_t cells_count) {
_estimated_cells_count.add(cells_count);
}
void merge_tombstone_histogram(utils::streaming_histogram& histogram) {
_estimated_tombstone_drop_time.merge(histogram);
}
/**
* Ratio is compressed/uncompressed and it is
* if you have 1.x then compression isn't helping
*/
void add_compression_ratio(uint64_t compressed, uint64_t uncompressed) {
_compression_ratio = (double) compressed/uncompressed;
}
void set_replay_position(const db::replay_position & rp) {
_replay_position = rp;
}
void set_sstable_level(int sstable_level) {
_sstable_level = sstable_level;
}
void set_repaired_at(int64_t repaired_at) {
_repaired_at = repaired_at;
}
void update_has_legacy_counter_shards(bool has_legacy_counter_shards) {
_has_legacy_counter_shards = _has_legacy_counter_shards || has_legacy_counter_shards;
}
// pos must be in the clustered region
void update_min_max_components(position_in_partition_view pos);
void update(column_stats&& stats) {
_timestamp_tracker.update(stats.timestamp_tracker);
_min_live_timestamp_tracker.update(stats.min_live_timestamp_tracker);
_min_live_row_marker_timestamp_tracker.update(stats.min_live_row_marker_timestamp_tracker);
_local_deletion_time_tracker.update(stats.local_deletion_time_tracker);
_ttl_tracker.update(stats.ttl_tracker);
add_partition_size(stats.partition_size);
add_cells_count(stats.cells_count);
merge_tombstone_histogram(stats.tombstone_histogram);
update_has_legacy_counter_shards(stats.has_legacy_counter_shards);
_columns_count += stats.column_count;
_rows_count += stats.rows_count;
}
void construct_compaction(compaction_metadata& m) {
auto cardinality = _cardinality.get_bytes();
m.cardinality.elements = utils::chunked_vector<uint8_t>(cardinality.get(), cardinality.get() + cardinality.size());
}
void construct_stats(stats_metadata& m) {
m.estimated_partition_size = std::move(_estimated_partition_size);
m.estimated_cells_count = std::move(_estimated_cells_count);
m.position = _replay_position;
m.min_timestamp = _timestamp_tracker.min();
m.max_timestamp = _timestamp_tracker.max();
m.min_local_deletion_time = _local_deletion_time_tracker.min();
m.max_local_deletion_time = _local_deletion_time_tracker.max();
m.min_ttl = _ttl_tracker.min();
m.max_ttl = _ttl_tracker.max();
m.compression_ratio = _compression_ratio;
m.estimated_tombstone_drop_time = std::move(_estimated_tombstone_drop_time);
m.sstable_level = _sstable_level;
m.repaired_at = _repaired_at;
convert(m.min_column_names, _min_clustering_pos);
convert(m.max_column_names, _max_clustering_pos);
m.has_legacy_counter_shards = _has_legacy_counter_shards;
m.columns_count = _columns_count;
m.rows_count = _rows_count;
m.originating_host_id = _host_id;
}
scylla_metadata::ext_timestamp_stats::map_type get_ext_timestamp_stats() {
return scylla_metadata::ext_timestamp_stats::map_type{
{ ext_timestamp_stats_type::min_live_timestamp, _min_live_timestamp_tracker.get() },
{ ext_timestamp_stats_type::min_live_row_marker_timestamp, _min_live_row_marker_timestamp_tracker.get() },
};
}
};
}