diff --git a/database.cc b/database.cc index f4c24ca63c..338d49aed6 100644 --- a/database.cc +++ b/database.cc @@ -523,8 +523,11 @@ future<> column_family::load_sstable(sstables::sstable&& sstab, bool reset_level // shard(s). Shared sstables cannot be deleted until all // shards compacted them, so to reduce disk space usage we // want to start splitting them now. - dblog.info("Splitting {} for shard", sst->get_filename()); - _compaction_manager.submit_sstable_rewrite(this, sst); + // However, we need to delay this compaction until we read all + // the sstables belonging to this CF, because we need all of + // them to know which tombstones we can drop, and what + // generation number is free. + _sstables_need_rewrite.push_back(sst); } if (reset_level) { // When loading a migrated sstable, set level to 0 because @@ -539,6 +542,17 @@ future<> column_family::load_sstable(sstables::sstable&& sstab, bool reset_level }); } +// load_sstable() wants to start rewriting sstables which are shared between +// several shards, but we can't start any compaction before all the sstables +// of this CF were loaded. So call this function to start rewrites, if any. +void column_family::start_rewrite() { + for (auto sst : _sstables_need_rewrite) { + dblog.info("Splitting {} for shard", sst->get_filename()); + _compaction_manager.submit_sstable_rewrite(this, sst); + } + _sstables_need_rewrite.clear(); +} + future column_family::probe_file(sstring sstdir, sstring fname) { using namespace sstables; @@ -1065,6 +1079,7 @@ column_family::load_new_sstables(std::vector new_tab _schema->ks_name(), _schema->cf_name(), _config.datadir, comps.generation, comps.version, comps.format), true); }).then([this] { + start_rewrite(); // Drop entire cache for this column family because it may be populated // with stale data. return get_row_cache().clear(); @@ -1265,6 +1280,7 @@ future<> column_family::populate(sstring sstdir) { }); }); }).then([this] { + start_rewrite(); // Make sure this is called even if CF is empty mark_ready_for_writes(); }); diff --git a/database.hh b/database.hh index db71222835..024df25f66 100644 --- a/database.hh +++ b/database.hh @@ -343,6 +343,11 @@ private: // have not been deleted yet, so must not GC any tombstones in other sstables // that may delete data in these sstables: std::vector _sstables_compacted_but_not_deleted; + // sstables that are shared between several shards so we want to rewrite + // them (split the data belonging to this shard to a separate sstable), + // but for correct compaction we need to start the compaction only after + // reading all sstables. + std::vector _sstables_need_rewrite; // Control background fibers waiting for sstables to be deleted seastar::gate _sstable_deletion_gate; // There are situations in which we need to stop writing sstables. Flushers will take @@ -372,6 +377,7 @@ private: void add_sstable(sstables::sstable&& sstable); void add_sstable(lw_shared_ptr sstable); future<> load_sstable(sstables::sstable&& sstab, bool reset_level = false); + void start_rewrite(); lw_shared_ptr new_memtable(); lw_shared_ptr new_streaming_memtable(); future try_flush_memtable_to_sstable(lw_shared_ptr memt);