tests: add test for reading parts of a large partition

This patch adds a test that takes an sstable with one partition of 13,520 clustering rows (spanning 700 KB in the data file), and attempts to read various slices CQL rows, counting that we got back the expected number of rows. The sstable included here was generated by Cassandra, and includes a promoted index. Promoted index reading is not supported yet (we will add it in the next patch), so for now the code will always read the entire partition from disk; But still the clustering-key filtering is already functional, and will drop some of the rows as requested, so this test will pass. Later, when we add promoted index support, we should check that this test still passes - promoted index will make the reads in this test more efficient (which the test cannot verify), but the important thing to check is that it doesn't break any of these tests. Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2016-08-07 14:46:38 +03:00
parent 3d45d6579b
commit 3dd079fb7a
9 changed files with 228 additions and 0 deletions
--- a/tests/sstable_test.cc
+++ b/tests/sstable_test.cc
@@ -37,6 +37,7 @@
 #include <memory>
 #include "sstable_test.hh"
 #include "tmpdir.hh"
+#include "partition_slice_builder.hh"

 #include "disk-error-handler.hh"

@@ -947,3 +948,221 @@ SEASTAR_TEST_CASE(statistics_rewrite) {
        });
    }, "tests/sstables/generation");
 }
+
+// Tests for reading a large partition for which the index contains a
+// "promoted index", i.e., a sample of the column names inside the partition,
+// with which we can avoid reading the entire partition when we look only
+// for a specific subset of columns. The test sstable for the read test was
+// generated in Cassandra.
+
+static schema_ptr large_partition_schema() {
+    static thread_local auto s = [] {
+        schema_builder builder(make_lw_shared(schema(
+                generate_legacy_id("try1", "data"), "try1", "data",
+        // partition key
+        {{"t1", utf8_type}},
+        // clustering key
+        {{"t2", utf8_type}},
+        // regular columns
+        {{"t3", utf8_type}},
+        // static columns
+        {},
+        // regular column name type
+        utf8_type,
+        // comment
+        ""
+       )));
+       return builder.build(schema_builder::compact_storage::no);
+    }();
+    return s;
+}
+
+static future<lw_shared_ptr<sstable>> load_large_partition_sst() {
+    auto sst = make_lw_shared<sstable>(
+            "try1", "data", "tests/sstables/large_partition", 3,
+            sstables::sstable::version_types::ka, big);
+    auto fut = sst->load();
+    return std::move(fut).then([sst = std::move(sst)] {
+        return std::move(sst);
+    });
+}
+
+// This is a rudimentary test that reads an sstable exported from Cassandra
+// which contains a promoted index. It just checks that the promoted index
+// is read from disk, as an unparsed array, and doesn't actually use it to
+// search for anything.
+SEASTAR_TEST_CASE(promoted_index_read) {
+    return load_large_partition_sst().then([] (auto sstp) {
+        schema_ptr s = large_partition_schema();
+        return sstables::test(sstp).read_indexes(0).then([sstp] (index_list vec) {
+            BOOST_REQUIRE(vec.size() == 1);
+            index_entry &e = vec[0];
+            BOOST_REQUIRE(e.get_promoted_index_bytes().size() == 468);
+        });
+    });
+}
+
+// Use an empty string for ck1, ck2, or both, for unbounded ranges.
+static query::partition_slice make_partition_slice(const schema& s, sstring ck1, sstring ck2) {
+    std::experimental::optional<query::clustering_range::bound> b1;
+    if (!ck1.empty()) {
+        b1.emplace(clustering_key_prefix::from_single_value(
+                s, utf8_type->decompose(ck1)));
+    }
+    std::experimental::optional<query::clustering_range::bound> b2;
+    if (!ck2.empty()) {
+        b2.emplace(clustering_key_prefix::from_single_value(
+                s, utf8_type->decompose(ck2)));
+    }
+    return partition_slice_builder(s).
+            with_range(query::clustering_range(b1, b2)).build();
+}
+
+// Count the number of CQL rows in one partition between clustering key
+// prefix ck1 to ck2.
+static future<int> count_rows(sstable_ptr sstp, schema_ptr s, sstring key, sstring ck1, sstring ck2) {
+    return seastar::async([sstp, s, key, ck1, ck2] () mutable {
+        auto ps = make_partition_slice(*s, ck1, ck2);
+        auto row = sstp->read_row(s, sstables::key(key.c_str()),
+               query::clustering_key_filtering_context::create(s, ps)).get0();
+        if (!row) {
+            return 0;
+        }
+        int nrows = 0;
+        auto mfopt = (*row)().get0();
+        while (mfopt) {
+            if (mfopt->is_clustering_row()) {
+                nrows++;
+            }
+            mfopt = (*row)().get0();
+        }
+        return nrows;
+    });
+}
+
+// Count the number of CQL rows in one partition
+static future<int> count_rows(sstable_ptr sstp, schema_ptr s, sstring key) {
+    return seastar::async([sstp, s, key] () mutable {
+        auto row = sstp->read_row(s, sstables::key(key.c_str())).get0();
+        if (!row) {
+            return 0;
+        }
+        int nrows = 0;
+        auto mfopt = (*row)().get0();
+        while (mfopt) {
+            if (mfopt->is_clustering_row()) {
+                nrows++;
+            }
+            mfopt = (*row)().get0();
+        }
+        return nrows;
+    });
+}
+
+// Count the number of CQL rows between clustering key prefix ck1 to ck2
+// in all partitions in the sstable (using sstable::read_range_rows).
+static future<int> count_rows(sstable_ptr sstp, schema_ptr s, sstring ck1, sstring ck2) {
+    return seastar::async([sstp, s, ck1, ck2] () mutable {
+        auto ps = make_partition_slice(*s, ck1, ck2);
+        auto reader = sstp->read_range_rows(s, query::full_partition_range,
+               query::clustering_key_filtering_context::create(s, ps));
+        int nrows = 0;
+        auto smopt = reader.read().get0();
+        while (smopt) {
+            auto mfopt = (*smopt)().get0();
+            while (mfopt) {
+                if (mfopt->is_clustering_row()) {
+                    nrows++;
+                }
+                mfopt = (*smopt)().get0();
+            }
+            smopt = reader.read().get0();
+        }
+        return nrows;
+    });
+}
+
+// This test reads, using sstable::read_row(), a slice (a range of clustering
+// rows) from one large partition in an sstable written in Cassandra.
+// This large partition includes 13520 clustering rows, and spans about
+// 700 KB on disk. When we ask to read only a part of it, the promoted index
+// (included in this sstable) may be used to allow reading only a part of the
+// partition from disk. This test doesn't directly verify that the promoted
+// index is actually used - and can work even without a promoted index
+// support - but can be used to check that adding promoted index read supports
+// did not break anything.
+// To verify that the promoted index was actually used to reduce the size
+// of read from disk, add printouts to the row reading code.
+SEASTAR_TEST_CASE(sub_partition_read) {
+    schema_ptr s = large_partition_schema();
+    return load_large_partition_sst().then([s] (auto sstp) {
+        return count_rows(sstp, s, "v1", "18wX", "18xB").then([] (int nrows) {
+            // there should be 5 rows (out of 13520 = 20*26*26) in this range:
+            // 18wX, 18wY, 18wZ, 18xA, 18xB.
+            BOOST_REQUIRE(nrows == 5);
+        }).then([sstp, s] () {
+            return count_rows(sstp, s, "v1", "13aB", "15aA").then([] (int nrows) {
+                // There should be 26*26*2 rows in this range. It spans two
+                // promoted-index blocks, so we get to test that case.
+                BOOST_REQUIRE(nrows == 2*26*26);
+            });
+        }).then([sstp, s] () {
+            return count_rows(sstp, s, "v1", "10aB", "19aA").then([] (int nrows) {
+                // There should be 26*26*9 rows in this range. It spans many
+                // promoted-index blocks.
+                BOOST_REQUIRE(nrows == 9*26*26);
+            });
+        }).then([sstp, s] () {
+            return count_rows(sstp, s, "v1", "0", "z").then([] (int nrows) {
+                // All rows, 20*26*26 of them, are in this range. It spans all
+                // the promoted-index blocks, but the range is still bounded
+                // on both sides
+                BOOST_REQUIRE(nrows == 20*26*26);
+            });
+        }).then([sstp, s] () {
+            // range that is outside (after) the actual range of the data.
+            // No rows should match.
+            return count_rows(sstp, s, "v1", "y", "z").then([] (int nrows) {
+                BOOST_REQUIRE(nrows == 0);
+            });
+        }).then([sstp, s] () {
+            // range that is outside (before) the actual range of the data.
+            // No rows should match.
+            return count_rows(sstp, s, "v1", "_a", "_b").then([] (int nrows) {
+                BOOST_REQUIRE(nrows == 0);
+            });
+        }).then([sstp, s] () {
+            // half-infinite range
+            return count_rows(sstp, s, "v1", "", "10aA").then([] (int nrows) {
+                BOOST_REQUIRE(nrows == (1*26*26 + 1));
+            });
+        }).then([sstp, s] () {
+            // half-infinite range
+            return count_rows(sstp, s, "v1", "10aA", "").then([] (int nrows) {
+                BOOST_REQUIRE(nrows == 19*26*26);
+            });
+        }).then([sstp, s] () {
+            // count all rows, but giving an explicit all-encompasing filter
+            return count_rows(sstp, s, "v1", "", "").then([] (int nrows) {
+                BOOST_REQUIRE(nrows == 20*26*26);
+            });
+        }).then([sstp, s] () {
+            // count all rows, without a filter
+            return count_rows(sstp, s, "v1").then([] (int nrows) {
+                BOOST_REQUIRE(nrows == 20*26*26);
+            });
+        });
+    });
+}
+
+// Same as previous test, just using read_range_rows instead of read_row
+// to read parts of potentially more than one partition (in this particular
+// sstable, there is actually just one partition).
+SEASTAR_TEST_CASE(sub_partitions_read) {
+    schema_ptr s = large_partition_schema();
+    return load_large_partition_sst().then([s] (auto sstp) {
+        return count_rows(sstp, s, "18wX", "18xB").then([] (int nrows) {
+            BOOST_REQUIRE(nrows == 5);
+        });
+    });
+}
--- a/tests/sstables/large_partition/try1-data-ka-3-CompressionInfo.db
+++ b/tests/sstables/large_partition/try1-data-ka-3-CompressionInfo.db
--- a/tests/sstables/large_partition/try1-data-ka-3-Data.db
+++ b/tests/sstables/large_partition/try1-data-ka-3-Data.db
--- a/tests/sstables/large_partition/try1-data-ka-3-Digest.sha1
+++ b/tests/sstables/large_partition/try1-data-ka-3-Digest.sha1
@@ -0,0 +1 @@
+2833048369
--- a/tests/sstables/large_partition/try1-data-ka-3-Filter.db
+++ b/tests/sstables/large_partition/try1-data-ka-3-Filter.db
--- a/tests/sstables/large_partition/try1-data-ka-3-Index.db
+++ b/tests/sstables/large_partition/try1-data-ka-3-Index.db
--- a/tests/sstables/large_partition/try1-data-ka-3-Statistics.db
+++ b/tests/sstables/large_partition/try1-data-ka-3-Statistics.db
--- a/tests/sstables/large_partition/try1-data-ka-3-Summary.db
+++ b/tests/sstables/large_partition/try1-data-ka-3-Summary.db
--- a/tests/sstables/large_partition/try1-data-ka-3-TOC.txt
+++ b/tests/sstables/large_partition/try1-data-ka-3-TOC.txt
@@ -0,0 +1,8 @@
+Data.db
+CompressionInfo.db
+Index.db
+Summary.db
+Statistics.db
+Digest.sha1
+TOC.txt
+Filter.db