From fb3682cb4f77620ffeac45d1715618ea4f9aa16f Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Thu, 19 Feb 2015 14:30:07 -0500 Subject: [PATCH] sstable statistics file Signed-off-by: Glauber Costa Reviewed-by: Nadav Har'El --- sstables/sstables.cc | 74 ++++++++++++++++++++++++++++++++++++++++++++ sstables/sstables.hh | 2 ++ sstables/types.hh | 64 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 140 insertions(+) diff --git a/sstables/sstables.cc b/sstables/sstables.cc index 4defae9118..e021e91379 100644 --- a/sstables/sstables.cc +++ b/sstables/sstables.cc @@ -242,6 +242,74 @@ future<> parse(file_input_stream& in, summary& s) { }); } +future<> parse(file_input_stream& in, struct replay_position& rp) { + return parse(in, rp.segment, rp.position); +} + +future<> parse(file_input_stream& in, estimated_histogram::eh_elem &e) { + return parse(in, e.offset, e.bucket); +} + +future<> parse(file_input_stream& in, estimated_histogram &e) { + return parse(in, e.elements); +} + +future<> parse(file_input_stream& in, streaming_histogram &h) { + return parse(in, h.max_bin_size, h.hash); +} + +future<> parse(file_input_stream& in, validation_metadata& m) { + return parse(in, m.partitioner, m.filter_chance); +} + +future<> parse(file_input_stream& in, compaction_metadata& m) { + return parse(in, m.ancestors, m.cardinality); +} + +template +future<> parse(file_input_stream& in, std::unique_ptr& p) { + p.reset(new Child); + return parse(in, *static_cast(p.get())); +} + +future<> parse(file_input_stream& in, stats_metadata& m) { + return parse(in, + m.estimated_row_size, + m.estimated_column_count, + m.position, + m.min_timestamp, + m.max_timestamp, + m.max_local_deletion_time, + m.compression_ratio, + m.estimated_tombstone_drop_time, + m.sstable_level, + m.repaired_at, + m.min_column_names, + m.max_column_names, + m.has_legacy_counter_shards + ); +} + +future<> parse(file_input_stream& in, statistics& s) { + return parse(in, s.hash).then([&in, &s] { + return do_for_each(s.hash.map.begin(), s.hash.map.end(), [&in, &s] (auto val) mutable { + in.seek(val.second); + + switch (val.first) { + case metadata_type::Validation: + return parse(in, s.contents[val.first]); + case metadata_type::Compaction: + return parse(in, s.contents[val.first]); + case metadata_type::Stats: + return parse(in, s.contents[val.first]); + default: + sstlog.warn("Invalid metadata type at Statistics file: {} ", int(val.first)); + return make_ready_future<>(); + } + }); + }); +} + // This is small enough, and well-defined. Easier to just read it all // at once future<> sstable::read_toc() { @@ -334,8 +402,14 @@ future<> sstable::read_compression() { return read_simple(); } +future<> sstable::read_statistics() { + return read_simple(); +} + future<> sstable::load() { return read_toc().then([this] { + return read_statistics(); + }).then([this] { return read_compression(); }).then([this] { return read_filter(); diff --git a/sstables/sstables.hh b/sstables/sstables.hh index 78f52c814f..2d498ab5ad 100644 --- a/sstables/sstables.hh +++ b/sstables/sstables.hh @@ -53,6 +53,7 @@ private: compression _compression; filter _filter; summary _summary; + statistics _statistics; sstring _dir; unsigned long _epoch = 0; @@ -74,6 +75,7 @@ private: future<> read_summary() { return read_simple(); } + future<> read_statistics(); public: sstable(sstring dir, unsigned long epoch, version_types v, format_types f) : _dir(dir), _epoch(epoch), _version(v), _format(f) {} diff --git a/sstables/types.hh b/sstables/types.hh index f9dd355903..a43830dbf3 100644 --- a/sstables/types.hh +++ b/sstables/types.hh @@ -82,4 +82,68 @@ struct summary_la { disk_array entries; }; using summary = summary_la; + +struct estimated_histogram { + struct eh_elem { + uint64_t offset; + uint64_t bucket; + }; + + disk_array elements; +}; + +struct replay_position { + uint64_t segment; + uint32_t position; +}; + +struct streaming_histogram { + uint32_t max_bin_size; + disk_hash hash; +}; + +struct metadata { +}; + +struct validation_metadata : public metadata { + disk_string partitioner; + double filter_chance; +}; + +struct compaction_metadata : public metadata { + disk_array ancestors; + disk_array cardinality; +}; + +struct la_stats_metadata : public metadata { + estimated_histogram estimated_row_size; + estimated_histogram estimated_column_count; + replay_position position; + uint64_t min_timestamp; + uint64_t max_timestamp; + uint32_t max_local_deletion_time; + double compression_ratio; + streaming_histogram estimated_tombstone_drop_time; + uint32_t sstable_level; + uint64_t repaired_at; + disk_array> min_column_names; + disk_array> max_column_names; + bool has_legacy_counter_shards; +}; +using stats_metadata = la_stats_metadata; + +// Numbers are found on disk, so they do matter. Also, setting their sizes of +// that of an uint32_t is a bit wasteful, but it simplifies the code a lot +// since we can now still use a strongly typed enum without introducing a +// notion of "disk-size" vs "memory-size". +enum class metadata_type : uint32_t { + Validation = 0, + Compaction = 1, + Stats = 2, +}; + +struct statistics { + disk_hash hash; + std::unordered_map> contents; +}; }