From f604269f0ac7de79e4db31c2cba84cc6d67c02ce Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@scylladb.com>
Date: Thu, 29 Jun 2023 16:12:44 +0300
Subject: [PATCH] cql3, secondary index: consistently choose index to use in a
 query

When a table has secondary indexes on *multiple* columns, and several
such columns are used for filtering in a query, Scylla chooses one
of these indexes as the main driver of the query, and the second
column's restriction is implemented as filtering.

Before this patch, the index to use was chosen fairly randomly, based on
the order of the indexes in the schema. This order may be different in
different coordinators, and may even change across restarts on the same
coordinators. This is not only inconsistent, it can cause outright wrong
results when using *paging* and switching (or restarting) coordinates
in the middle of a paged scan... One coordinator saves one index's key
in the paging state, and then the other coordinator gets this paging
state and wrongly believes it is supposed to be a key of a *different*
index.

The fix in this patch is to pick the index suitable for the first
indexed column mentioned in the query. This has two benefits over
the situation before the patch:

1. The decision of which index to use no longer changes between
   coordinators or across restarts - it just depends on the schema
   and the specific query.

2. Different indexes can have different "specificity" so using one
   or the other can change the query's performance. After this patch,
   the user is in control over which index is used by changing the
   order of terms in the query. A curious user can use tracing to
   check which index was used to implement a particular query.

An xfailing test we had for this issue no longer fails, so the "xfail"
marker is removed.

Fixes #7969

Signed-off-by: Nadav Har'El <nyh@scylladb.com>

Closes scylladb/scylladb#14450
---
 cql3/restrictions/statement_restrictions.cc | 40 +++++++++++++--------
 test/cql-pytest/test_secondary_index.py     |  3 +-
 2 files changed, 27 insertions(+), 16 deletions(-)

diff --git a/cql3/restrictions/statement_restrictions.cc b/cql3/restrictions/statement_restrictions.cc
index cd400bc0f9..7baffc5b0f 100644
--- a/cql3/restrictions/statement_restrictions.cc
+++ b/cql3/restrictions/statement_restrictions.cc
@@ -537,22 +537,32 @@ std::pair<std::optional<secondary_index::index>, expr::expression> statement_res
     int chosen_index_score = 0;
     expr::expression chosen_index_restrictions = expr::conjunction({});
 
-    for (const auto& index : sim.list_indexes()) {
-        auto cdef = _schema->get_column_definition(to_bytes(index.target_column()));
-        for (const expr::expression& restriction : index_restrictions()) {
-            if (has_partition_token(restriction, *_schema) || contains_multi_column_restriction(restriction)) {
-                continue;
-            }
-
-            expr::single_column_restrictions_map rmap = expr::get_single_column_restrictions_map(restriction);
-            const auto found = rmap.find(cdef);
-            if (found != rmap.end() && is_supported_by(found->second, index)
-                && score(index) > chosen_index_score) {
-                chosen_index = index;
-                chosen_index_score = score(index);
-                chosen_index_restrictions = restriction;
-            }
+    // Several indexes may be usable for this query. When their score is tied,
+    // let's pick one by order of the columns mentioned in the restriction
+    // expression. This specific order isn't important (and maybe in the
+    // future we could plan a better order based on the specificity of each
+    // index), but it is critical that two coordinators - or the same
+    // coordinator over time - must choose the same index for the same query.
+    // Otherwise, paging can break (see issue #7969).
+    for (const expr::expression& restriction : index_restrictions()) {
+        if (has_partition_token(restriction, *_schema) || contains_multi_column_restriction(restriction)) {
+            continue;
         }
+        expr::for_each_expression<expr::column_value>(restriction, [&](const expr::column_value& cval) {
+            auto& cdef = cval.col;
+            expr::expression col_restrictions = expr::conjunction {
+                .children = expr::extract_single_column_restrictions_for_column(restriction, *cdef)
+            };
+            for (const auto& index : sim.list_indexes()) {
+                if (cdef->name_as_text() == index.target_column() &&
+                        expr::is_supported_by(col_restrictions, index) &&
+                        score(index) > chosen_index_score) {
+                    chosen_index = index;
+                    chosen_index_score = score(index);
+                    chosen_index_restrictions = restriction;
+                }
+            }
+        });
     }
     return {chosen_index, chosen_index_restrictions};
 }
diff --git a/test/cql-pytest/test_secondary_index.py b/test/cql-pytest/test_secondary_index.py
index 4bb62cc57d..8220dbae94 100644
--- a/test/cql-pytest/test_secondary_index.py
+++ b/test/cql-pytest/test_secondary_index.py
@@ -66,8 +66,9 @@ def test_partition_order_with_si(cql, test_keyspace):
 # The order tested in this case was decided as a good first step in issue
 # #7969, but it's possible that it will eventually be implemented another
 # way, e.g. dynamically based on estimated query selectivity statistics.
+# In any case, the order must be consistent across coordinators and time
+# (and upgrades...) to allow paged queries that use an index to continue.
 # Ref: #7969
-@pytest.mark.xfail(reason="The order of picking indexes is currently arbitrary. Issue #7969")
 def test_order_of_indexes(scylla_only, cql, test_keyspace):
     schema = 'p int primary key, v1 int, v2 int, v3 int'
     with new_test_table(cql, test_keyspace, schema) as table: