From 29ce425862b86d56bbc1f802b2b021a1aa75bd10 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 9 Aug 2015 22:00:44 +0300 Subject: [PATCH 1/3] db: introduce negative_mutation_reader concept Similar to a mutation_reader, but limited: it only returns whether a key is sure not to exist in some mutation source. Non-blocking and expected to execute fast. Corresponds to an sstable bloom filter. To avoid ambiguity, it doesn't return a bool, instead a longer but less ambiguous "definitely_doesnt_exists" or "maybe_exists". --- mutation_reader.hh | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/mutation_reader.hh b/mutation_reader.hh index d544407df3..925c44def0 100644 --- a/mutation_reader.hh +++ b/mutation_reader.hh @@ -89,3 +89,16 @@ future<> consume(mutation_reader& reader, Consumer consumer) { // can be queried multiple times and in parallel. For each query it returns // independent mutation_reader. using mutation_source = std::function<mutation_reader(const query::partition_range& range)>; + +/// A negative_mutation_reader quickly returns whether a key is known not to exist +/// in a data source (it may return false positives, but not false negatives). enum class negative_mutation_reader_result { definitely_doesnt_exists, maybe_exists }; using negative_mutation_reader = std::function<negative_mutation_reader_result (const partition_key& key)>; + +inline negative_mutation_reader make_default_negative_mutation_reader() { return [] (const partition_key& key) { return negative_mutation_reader_result::maybe_exists; }; } From fee3a9513bdb615d2dfcce956a50d322a7b561ab Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 9 Aug 2015 22:03:01 +0300 Subject: [PATCH 2/3] sstables: add yet another variant of filter_has_key() This time public, for use when preloading the cache. 
--- sstables/sstables.hh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sstables/sstables.hh b/sstables/sstables.hh index 866392e38a..92299191da 100644 --- a/sstables/sstables.hh +++ b/sstables/sstables.hh @@ -370,6 +370,9 @@ private: void write_range_tombstone(file_writer& out, const composite& clustering_prefix, std::vector<bytes_view> suffix, const tombstone t); void write_collection(file_writer& out, const composite& clustering_key, const column_definition& cdef, collection_mutation::view collection); public: + bool filter_has_key(const schema& s, const partition_key& key) { + return filter_has_key(key::from_partition_key(s, key)); + } // Allow the test cases from sstable_test.cc to test private methods. We use // a placeholder to avoid cluttering this class too much. The sstable_test class // will then re-export as public every method it needs. From 1016b21089812d973f2da8865dd8ed03234a1cf7 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 9 Aug 2015 22:05:05 +0300 Subject: [PATCH 3/3] cache: improve preloading of flushed memtable mutations If a mutation definitely doesn't exist in any of the sstables, then we can certainly load it into the cache. 
--- database.cc | 22 ++++++++++++++++++---- database.hh | 3 ++- row_cache.cc | 12 +++++++----- row_cache.hh | 2 +- 4 files changed, 28 insertions(+), 11 deletions(-) diff --git a/database.cc b/database.cc index e3ac6eaa24..d1c3f604e4 100644 --- a/database.cc +++ b/database.cc @@ -58,6 +58,17 @@ column_family::column_family(schema_ptr schema, config config, no_commitlog cl) add_memtable(); } +negative_mutation_reader +column_family::make_negative_mutation_reader(lw_shared_ptr<sstable_list> old_sstables) { + return [this, old_sstables = std::move(old_sstables)] (const partition_key& key) { + for (auto&& s : *old_sstables) { + if (s.second->filter_has_key(*_schema, key)) { + return negative_mutation_reader_result::maybe_exists; + } + } + return negative_mutation_reader_result::definitely_doesnt_exists; + }; +} mutation_source column_family::sstables_as_mutation_source() { @@ -369,9 +380,11 @@ void column_family::add_memtable() { } future<> -column_family::update_cache(memtable& m) { +column_family::update_cache(memtable& m, lw_shared_ptr<sstable_list> old_sstables) { if (_config.enable_cache) { - return _cache.update(m); + // be careful to use the old sstable list, since the new one will hit every + // mutation in m. + return _cache.update(m, make_negative_mutation_reader(std::move(old_sstables))); } else { return make_ready_future<>(); } @@ -417,8 +430,9 @@ column_family::seal_active_memtable() { dblog.debug("Flushing done"); // We must add sstable before we call update_cache(), because // memtable's data after moving to cache can be evicted at any time. 
+ auto old_sstables = _sstables; add_sstable(std::move(newtab)); - return update_cache(*old); + return update_cache(*old, std::move(old_sstables)); }).then_wrapped([this, old] (future<> ret) { try { ret.get(); @@ -1319,4 +1333,4 @@ future<> column_family::flush() { _stats.memtable_switch_count++; return make_ready_future<>(); }); -} \ No newline at end of file +} diff --git a/database.hh b/database.hh index 3f3c802464..d89e105937 100644 --- a/database.hh +++ b/database.hh @@ -123,7 +123,7 @@ private: void update_stats_for_new_sstable(uint64_t new_sstable_data_size); void add_sstable(sstables::sstable&& sstable); void add_memtable(); - future<> update_cache(memtable&); + future<> update_cache(memtable&, lw_shared_ptr<sstable_list> old_sstables); struct merge_comparator; private: // Creates a mutation reader which covers sstables. @@ -132,6 +132,7 @@ private: mutation_reader make_sstable_reader(const query::partition_range& range) const; mutation_source sstables_as_mutation_source(); + negative_mutation_reader make_negative_mutation_reader(lw_shared_ptr<sstable_list> old_sstables); public: // Creates a mutation reader which covers all data sources for this column family. // Caller needs to ensure that column_family remains live (FIXME: relax this). 
diff --git a/row_cache.cc b/row_cache.cc index ffe0dd93b9..9ace290ab6 100644 --- a/row_cache.cc +++ b/row_cache.cc @@ -145,9 +145,9 @@ void row_cache::populate(const mutation& m) { }); } -future<> row_cache::update(memtable& m) { +future<> row_cache::update(memtable& m, negative_mutation_reader underlying_negative) { _tracker.region().merge(m._region); // Now all data in memtable belongs to cache - with_allocator(_tracker.allocator(), [this, &m] { + with_allocator(_tracker.allocator(), [this, &m, underlying_negative = std::move(underlying_negative)] { auto i = m.partitions.begin(); const schema& s = *m.schema(); while (i != m.partitions.end()) { @@ -155,14 +155,16 @@ future<> row_cache::update(memtable& m) { // FIXME: Optimize knowing we lookup in-order. auto cache_i = _partitions.lower_bound(mem_e.key(), cache_entry::compare(_schema)); // If cache doesn't contain the entry we cannot insert it because the mutation may be incomplete. - // FIXME: if the bloom filters say the data isn't in any sstable yet (other than the - // one we are caching now), we can. - // Alternatively, keep a bitmap indicating which sstables we do cover, so we don't have to + // FIXME: keep a bitmap indicating which sstables we do cover, so we don't have to // search it. if (cache_i != _partitions.end() && cache_i->key().equal(s, mem_e.key())) { cache_entry& entry = *cache_i; _tracker.touch(entry); entry.partition().apply(s, std::move(mem_e.partition())); + } else if (underlying_negative(mem_e.key().key()) == negative_mutation_reader_result::definitely_doesnt_exists) { + cache_entry* entry = current_allocator().construct<cache_entry>(mem_e.key(), mem_e.partition()); + _tracker.insert(*entry); + _partitions.insert(cache_i, *entry); } i = m.partitions.erase(i); current_allocator().destroy(&mem_e); diff --git a/row_cache.hh b/row_cache.hh index a126ffac43..b216243ffe 100644 --- a/row_cache.hh +++ b/row_cache.hh @@ -148,5 +148,5 @@ public: // has just been flushed to the underlying data source. 
// The memtable can be queried during the process, but must not be written. // After the update is complete, memtable is empty. - future<> update(memtable&); + future<> update(memtable&, negative_mutation_reader underlying_negative); };