Merge 'sstables/mx/writer: handle non-full prefix row keys' from Botond Dénes

Although valid for compact tables, non-full (or empty) clustering key prefixes are not handled for row keys when writing sstables. Only the present components are written, consequently if the key is empty, it is omitted entirely.
When parsing sstables, the parsing code unconditionally parses a full prefix.
This mis-match results in parsing failures, as the parser parses part of the row content as a key resulting in a garbage key and subsequent mis-parsing of the row content and maybe even subsequent partitions.

Introduce a new system table: `system.corrupt_data` and infrastructure similar to `large_data_handler`: `corrupt_data_handler` which abstracts how corrupt data is handled. The sstable writer now passes rows such corrupt keys to the corrupt data handler. This way, we avoid corrupting the sstables beyond parsing and the rows are also kept around in system.corrupt_data for later inspection and possible recovery.

Add a full-stack test which checks that rows with bad keys are correctly handled.

Fixes: https://github.com/scylladb/scylladb/issues/24489

The bug is present in all versions, has to be backported to all supported versions.

Closes scylladb/scylladb#24492

* github.com:scylladb/scylladb:
  test/boost/sstable_datafile_test: add test for corrupt data
  sstables/mx/writer: handler rows with empty keys
  test/lib/cql_assertions: introduce columns_assertions
  sstables: add corrupt_data_handler to sstables::sstables
  tools/scylla-sstable: make large_data_handler a local
  db: introduce corrupt_data_handler
  mutation: introduce frozen_mutation_fragment_v2
  mutation/mutation_partition_view: read_{clustering,static}_row(): return row type
  mutation/mutation_partition_view: extract de-ser of {clustering,static} row
  idl-compiler.py: generate skip() definition for enums serializers
  idl: extract full_position.idl from position_in_partition.idl
  db/system_keyspace: add apply_mutation()
  db/system_keyspace: introduce the corrupt_data table
This commit is contained in:
Avi Kivity
2025-06-29 18:18:36 +03:00
31 changed files with 804 additions and 65 deletions

View File

@@ -48,10 +48,13 @@
#include "test/lib/sstable_utils.hh"
#include "test/lib/random_utils.hh"
#include "test/lib/test_utils.hh"
#include "test/lib/cql_test_env.hh"
#include "readers/from_mutations.hh"
#include "readers/from_fragments.hh"
#include "readers/combined.hh"
#include "test/lib/random_schema.hh"
#include "test/lib/exception_utils.hh"
#include "test/lib/cql_assertions.hh"
namespace fs = std::filesystem;
@@ -3282,3 +3285,130 @@ SEASTAR_TEST_CASE(sstable_identifier_correctness) {
BOOST_REQUIRE_EQUAL(sst->sstable_identifier()->uuid(), sst->generation().as_uuid());
});
}
SEASTAR_TEST_CASE(test_non_full_and_empty_row_keys) {
return do_with_cql_env_thread([] (cql_test_env& env) {
const auto seed = tests::random::get_int<uint32_t>();
auto random_spec = tests::make_random_schema_specification(
"ks",
std::uniform_int_distribution<size_t>(1, 4),
std::uniform_int_distribution<size_t>(1, 4),
std::uniform_int_distribution<size_t>(2, 8),
std::uniform_int_distribution<size_t>(2, 8));
auto random_schema = tests::random_schema(seed, *random_spec);
testlog.info("Random schema:\n{}", random_schema.cql());
random_schema.create_with_cql(env).get();
auto schema = random_schema.schema();
auto& db = env.local_db();
auto& table = db.find_column_family(schema);
auto& manager = db.get_user_sstables_manager();
const auto generated_mutations = tests::generate_random_mutations(random_schema).get();
auto mutation_description = random_schema.new_mutation(0);
auto engine = std::mt19937(seed);
random_schema.add_row(engine, mutation_description, 0, [] (std::mt19937& engine, tests::timestamp_destination destination, api::timestamp_type min_timestamp) {
switch (destination) {
case tests::timestamp_destination::partition_tombstone:
case tests::timestamp_destination::row_tombstone:
case tests::timestamp_destination::collection_tombstone:
case tests::timestamp_destination::range_tombstone:
return api::missing_timestamp;
default:
return api::timestamp_type(100);
}
});
const auto row_mutation = mutation_description.build(schema);
auto check = [&] (const clustering_key& ck) {
testlog.info("check({})", ck);
auto permit = db.obtain_reader_permit(schema, "test_non_full_and_empty_row_keys::write", db::no_timeout, {}).get();
const auto dk = generated_mutations.front().decorated_key();
const auto row_mutation_fragment = mutation_fragment_v2(*schema, permit,
clustering_row(ck, deletable_row(*schema, row_mutation.partition().clustered_rows().begin()->row())));
std::deque<mutation_fragment_v2> fragments;
fragments.emplace_back(*schema, permit, partition_start(dk, {}));
fragments.emplace_back(*schema, permit, row_mutation_fragment);
fragments.emplace_back(*schema, permit, partition_end());
const auto original_mutation_fragment = mutation_fragment_v2(*schema, permit, fragments[1]);
auto reader = make_combined_reader(schema, permit,
make_mutation_reader_from_mutations(schema, permit, generated_mutations, query::full_partition_range),
make_mutation_reader_from_fragments(schema, permit, std::move(fragments)));
auto sst = table.make_sstable();
auto& corrupt_data_handler = sst->get_corrupt_data_handler();
const auto stats_before = corrupt_data_handler.get_stats();
sst->write_components(std::move(reader), generated_mutations.size(), schema, manager.configure_writer("test"), encoding_stats{}).get();
sst->load(schema->get_sharder(), {}).get();
testlog.info("mutations written to : {}", sst->get_filename());
// The sstable should not contain the row with the bad key -- that should be passed to the corrupt_data_handler.
assert_that(sst->make_reader(schema, permit, query::full_partition_range, schema->full_slice()))
.produces(generated_mutations);
const auto stats_after = corrupt_data_handler.get_stats();
for (const uint64_t db::corrupt_data_handler::stats::* member : {
&db::corrupt_data_handler::stats::corrupt_data_reported,
&db::corrupt_data_handler::stats::corrupt_data_recorded,
&db::corrupt_data_handler::stats::corrupt_clustering_rows_reported,
&db::corrupt_data_handler::stats::corrupt_clustering_rows_recorded}) {
BOOST_REQUIRE_EQUAL(stats_after.*member, stats_before.*member + 1);
}
auto res = env.execute_cql(format("SELECT * FROM {}.{} WHERE keyspace_name = '{}' AND table_name = '{}'",
db::system_keyspace::NAME, db::system_keyspace::CORRUPT_DATA, schema->ks_name(), schema->cf_name())).get();
assert_that(res)
.is_rows()
.with_size(1)
.with_columns_of_row(0)
.with_typed_column<timeuuid_native_type>("id", [] (const timeuuid_native_type& v) { return !v.uuid.is_null(); })
.with_typed_column<bytes>("partition_key", [&] (const bytes& v) {
return partition_key::from_bytes(v).equal(*schema, dk.key());
})
.with_typed_column<bytes>("clustering_key", [&] (const bytes& v) {
return clustering_key::from_bytes(v).equal(*schema, ck);
})
.with_typed_column<sstring>("mutation_fragment_kind", "clustering row")
.with_typed_column<bytes>("frozen_mutation_fragment", [&] (const bytes& v) {
bytes_ostream fmf_bytes;
fmf_bytes.write(v);
frozen_mutation_fragment_v2 fmf(std::move(fmf_bytes));
const auto unfreezed_mutation_fragment = fmf.unfreeze(*schema, permit);
return unfreezed_mutation_fragment.equal(*schema, original_mutation_fragment);
})
.with_typed_column<sstring>("origin", "sstable-write")
.with_typed_column<sstring>("sstable_name", fmt::to_string(sst->get_filename()))
;
// Clear the corrupt data table so that it doesn't affect other checks.
env.execute_cql(format("DELETE FROM {}.{} WHERE keyspace_name = '{}' AND table_name = '{}'",
db::system_keyspace::NAME, db::system_keyspace::CORRUPT_DATA, schema->ks_name(), schema->cf_name())).get();
};
check(clustering_key::make_empty());
if (schema->clustering_key_size() > 1) {
auto full_ckey = random_schema.make_ckey(0);
full_ckey.erase(full_ckey.end() - 1);
check(clustering_key::from_exploded(*schema, full_ckey));
}
});
}

View File

@@ -21,6 +21,47 @@ static inline void fail(sstring msg) {
throw std::runtime_error(msg);
}
void columns_assertions::fail(const sstring& msg) {
::fail(msg);
}
columns_assertions& columns_assertions::do_with_raw_column(const char* name, std::function<void(data_type, managed_bytes_view)> func) {
const auto& names = _metadata.get_names();
auto it = std::ranges::find_if(names, [name] (const auto& col) {
return col->name->text() == name;
});
if (it == names.end()) {
::fail(seastar::format("Column {} not found in metadata", name));
}
const size_t index = std::distance(names.begin(), it);
const auto& value = _columns.at(index);
if (!value) {
::fail(seastar::format("Column {} is null", name));
}
func((*it)->type, *value);
return *this;
}
columns_assertions& columns_assertions::with_raw_column(const char* name, std::function<bool(managed_bytes_view)> predicate) {
return do_with_raw_column(name, [name, &predicate] (data_type, managed_bytes_view value) {
if (!predicate(value)) {
::fail(seastar::format("Column {} failed predicate check: value = {}", name, value));
}
});
}
columns_assertions& columns_assertions::with_raw_column(const char* name, managed_bytes_view value) {
return do_with_raw_column(name, [name, &value] (data_type, managed_bytes_view cell_value) {
if (cell_value != value) {
::fail(seastar::format("Expected column {} to have value {}, but got {}", name, value, cell_value));
}
});
}
rows_assertions::rows_assertions(shared_ptr<cql_transport::messages::result_message::rows> rows)
: _rows(rows)
{ }
@@ -165,6 +206,11 @@ rows_assertions::with_rows_ignore_order(std::vector<std::vector<bytes_opt>> rows
return {*this};
}
columns_assertions rows_assertions::with_columns_of_row(size_t row_index) {
const auto& rs = _rows->rs().result_set();
return columns_assertions(rs.get_metadata(), rs.rows().at(row_index));
}
result_msg_assertions::result_msg_assertions(shared_ptr<cql_transport::messages::result_message> msg)
: _msg(msg)
{ }

View File

@@ -10,6 +10,7 @@
#pragma once
#include "utils/assert.hh"
#include "utils/managed_bytes.hh"
#include "test/lib/cql_test_env.hh"
#include "transport/messages/result_message_base.hh"
#include "bytes.hh"
@@ -17,6 +18,45 @@
#include <seastar/core/shared_ptr.hh>
#include <seastar/core/future.hh>
class columns_assertions {
const cql3::metadata& _metadata;
const std::vector<managed_bytes_opt>& _columns;
columns_assertions& do_with_raw_column(const char* name, std::function<void(data_type, managed_bytes_view)> func);
void fail(const sstring& msg);
public:
columns_assertions(const cql3::metadata& metadata, const std::vector<managed_bytes_opt>& columns)
: _metadata(metadata), _columns(columns)
{ }
columns_assertions& with_raw_column(const char* name, std::function<bool(managed_bytes_view)> predicate);
columns_assertions& with_raw_column(const char* name, managed_bytes_view value);
template <typename T>
columns_assertions& with_typed_column(const char* name, std::function<bool(const T& value)> predicate) {
return do_with_raw_column(name, [this, name, predicate] (data_type type, managed_bytes_view value) {
if (type != data_type_for<T>()) {
fail(seastar::format("Column {} is not of type {}, but of type {}", name, data_type_for<T>()->name(), type->name()));
}
if (!predicate(value_cast<T>(type->deserialize(value)))) {
fail(seastar::format("Column {} failed predicate check: value = {}", name, value));
}
});
}
template <typename T>
columns_assertions& with_typed_column(const char* name, const T& value) {
return with_typed_column<T>(name, [this, name, &value] (const T& cell_value) {
if (cell_value != value) {
fail(seastar::format("Expected column {} to have value {}, but got {}", name, value, cell_value));
}
return true;
});
}
};
class rows_assertions {
shared_ptr<cql_transport::messages::result_message::rows> _rows;
public:
@@ -33,6 +73,8 @@ public:
rows_assertions with_rows_ignore_order(std::vector<std::vector<bytes_opt>> rows);
rows_assertions with_serialized_columns_count(size_t columns_count);
columns_assertions with_columns_of_row(size_t row_index);
rows_assertions is_null();
rows_assertions is_not_null();
};

View File

@@ -15,6 +15,7 @@
#include "data_dictionary/storage_options.hh"
#include "db/large_data_handler.hh"
#include "db/corrupt_data_handler.hh"
#include "sstables/version.hh"
#include "sstables/sstable_directory.hh"
#include "compaction/compaction_manager.hh"
@@ -95,6 +96,7 @@ public:
struct test_env_config {
db::large_data_handler* large_data_handler = nullptr;
db::corrupt_data_handler* corrupt_data_handler = nullptr;
data_dictionary::storage_options storage; // will be local by default
size_t available_memory = memory::stats().total_memory();
};

View File

@@ -13,6 +13,7 @@
#include "test/lib/test_utils.hh"
#include "db/config.hh"
#include "db/large_data_handler.hh"
#include "db/corrupt_data_handler.hh"
#include "dht/i_partitioner.hh"
#include "gms/feature_service.hh"
#include "repair/row_level.hh"
@@ -202,6 +203,7 @@ struct test_env::impl {
::cache_tracker cache_tracker;
gms::feature_service feature_service;
db::nop_large_data_handler nop_ld_handler;
db::nop_corrupt_data_handler nop_cd_handler;
sstable_compressor_factory& scf;
test_env_sstables_manager mgr;
std::unique_ptr<test_env_compaction_manager> cmgr;
@@ -225,10 +227,22 @@ test_env::impl::impl(test_env_config cfg, sstable_compressor_factory& scfarg, ss
, db_config(make_db_config(dir.path().native(), cfg.storage))
, dir_sem(1)
, feature_service(gms::feature_config_from_db_config(*db_config))
, nop_cd_handler(db::corrupt_data_handler::register_metrics::no)
, scf(scfarg)
, mgr("test_env", cfg.large_data_handler == nullptr ? nop_ld_handler : *cfg.large_data_handler, *db_config,
feature_service, cache_tracker, cfg.available_memory, dir_sem,
[host_id = locator::host_id::create_random_id()]{ return host_id; }, scf, abort, current_scheduling_group(), sstm)
, mgr(
"test_env",
cfg.large_data_handler == nullptr ? nop_ld_handler : *cfg.large_data_handler,
cfg.corrupt_data_handler == nullptr ? nop_cd_handler : *cfg.corrupt_data_handler,
*db_config,
feature_service,
cache_tracker,
cfg.available_memory,
dir_sem,
[host_id = locator::host_id::create_random_id()]{ return host_id; },
scf,
abort,
current_scheduling_group(),
sstm)
, semaphore(reader_concurrency_semaphore::no_limits{}, "sstables::test_env", reader_concurrency_semaphore::register_metrics::no)
, storage(std::move(cfg.storage))
{