From faa45812b26dbed4e014d3f1cc4483de6da140a1 Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Sun, 19 Jun 2016 16:47:58 +0300 Subject: [PATCH] Rewrite shared sstables only after entire CF is read Starting in commit 721f7d1d4f, we start "rewriting" a shared sstable (i.e., splitting it into individual shards) as soon as it is loaded in each shard. However as discovered in issue #1366, this is too soon: Our compaction process relies in several places that compaction is only done after all the sstables of the same CF have been loaded. One example is that we need to know the content of the other sstables to decide which tombstones we can expire (this is issue #1366). Another example is that we use the last generation number we are aware of to decide the number of the next compaction output - and this is wrong before we saw all sstables. So with this patch, while loading sstables we only make a list of shared sstables which need to be rewritten - and the actual rewrite is only started when we finish reading all the sstables for this CF. We need to do this in two cases: reboot (when we load all the existing sstables we find on disk), and nodetool referesh (when we import a set of new sstables). Fixes #1366. Signed-off-by: Nadav Har'El Message-Id: <1466344078-31290-1-git-send-email-nyh@scylladb.com> --- database.cc | 20 ++++++++++++++++++-- database.hh | 6 ++++++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/database.cc b/database.cc index f4c24ca63c..338d49aed6 100644 --- a/database.cc +++ b/database.cc @@ -523,8 +523,11 @@ future<> column_family::load_sstable(sstables::sstable&& sstab, bool reset_level // shard(s). Shared sstables cannot be deleted until all // shards compacted them, so to reduce disk space usage we // want to start splitting them now. - dblog.info("Splitting {} for shard", sst->get_filename()); - _compaction_manager.submit_sstable_rewrite(this, sst); + // However, we need to delay this compaction until we read all + // the sstables belonging to this CF, because we need all of + // them to know which tombstones we can drop, and what + // generation number is free. + _sstables_need_rewrite.push_back(sst); } if (reset_level) { // When loading a migrated sstable, set level to 0 because @@ -539,6 +542,17 @@ future<> column_family::load_sstable(sstables::sstable&& sstab, bool reset_level }); } +// load_sstable() wants to start rewriting sstables which are shared between +// several shards, but we can't start any compaction before all the sstables +// of this CF were loaded. So call this function to start rewrites, if any. +void column_family::start_rewrite() { + for (auto sst : _sstables_need_rewrite) { + dblog.info("Splitting {} for shard", sst->get_filename()); + _compaction_manager.submit_sstable_rewrite(this, sst); + } + _sstables_need_rewrite.clear(); +} + future column_family::probe_file(sstring sstdir, sstring fname) { using namespace sstables; @@ -1065,6 +1079,7 @@ column_family::load_new_sstables(std::vector new_tab _schema->ks_name(), _schema->cf_name(), _config.datadir, comps.generation, comps.version, comps.format), true); }).then([this] { + start_rewrite(); // Drop entire cache for this column family because it may be populated // with stale data. return get_row_cache().clear(); @@ -1265,6 +1280,7 @@ future<> column_family::populate(sstring sstdir) { }); }); }).then([this] { + start_rewrite(); // Make sure this is called even if CF is empty mark_ready_for_writes(); }); diff --git a/database.hh b/database.hh index db71222835..024df25f66 100644 --- a/database.hh +++ b/database.hh @@ -343,6 +343,11 @@ private: // have not been deleted yet, so must not GC any tombstones in other sstables // that may delete data in these sstables: std::vector _sstables_compacted_but_not_deleted; + // sstables that are shared between several shards so we want to rewrite + // them (split the data belonging to this shard to a separate sstable), + // but for correct compaction we need to start the compaction only after + // reading all sstables. + std::vector _sstables_need_rewrite; // Control background fibers waiting for sstables to be deleted seastar::gate _sstable_deletion_gate; // There are situations in which we need to stop writing sstables. Flushers will take @@ -372,6 +377,7 @@ private: void add_sstable(sstables::sstable&& sstable); void add_sstable(lw_shared_ptr sstable); future<> load_sstable(sstables::sstable&& sstab, bool reset_level = false); + void start_rewrite(); lw_shared_ptr new_memtable(); lw_shared_ptr new_streaming_memtable(); future try_flush_memtable_to_sstable(lw_shared_ptr memt);