From 63d58022a6cfcb5cd372d30bc58a2f3d5ea35f56 Mon Sep 17 00:00:00 2001 From: Sergey Zolotukhin Date: Wed, 18 Dec 2024 15:50:03 +0100 Subject: [PATCH] test: Add test case for checking read repair diff calculation when having conflicting keys. The test updates two rows with keys that result in a Murmur3 hash collision, which is used to generate Scylla tokens. These tokens are involved in read repair diff calculations. Due to the identical token values, a hash map key collision occurs. Consequently, an incorrect value from the second row (with a different primary key) is then sent for writing as 'repaired', causing data corruption. (cherry picked from commit e577f1d1414d640a68fd5adaef8b3220be72bdf7) --- .../test_conflicting_keys_read_repair.py | 82 +++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 test/topology_custom/test_conflicting_keys_read_repair.py diff --git a/test/topology_custom/test_conflicting_keys_read_repair.py b/test/topology_custom/test_conflicting_keys_read_repair.py new file mode 100644 index 0000000000..2c792dfee3 --- /dev/null +++ b/test/topology_custom/test_conflicting_keys_read_repair.py @@ -0,0 +1,82 @@ +# +# Copyright (C) 2024-present ScyllaDB +# +# SPDX-License-Identifier: AGPL-3.0-or-later +# +import logging +import pytest +import asyncio +import time + +from cassandra import ConsistencyLevel # type: ignore +from cassandra.query import SimpleStatement # type: ignore +from test.pylib.manager_client import ManagerClient +from test.pylib.util import wait_for_cql_and_get_hosts + + +logger = logging.getLogger(__name__) + + +@pytest.mark.xfail(reason="https://github.com/scylladb/scylladb/issues/19101") +@pytest.mark.asyncio +async def test_read_repair_with_conflicting_hash_keys(request: pytest.FixtureRequest, manager: ManagerClient) -> None: + """ + Test that conflicting hash keys are handled correctly during read repair. + Issue https://github.com/scylladb/scylladb/issues/19101 + + 1. Create a new cluster with 3 nodes. + 2. Create a keyspace and a table with replication factor = 3. + 3. Stop one of the nodes. + 4. Add 2 rows that have primary keys causing a hash collision. + 5. Start the offline node. + 6. Run a SELECT query with ALL consistency level, forcing reading from all 3 nodes. + The node that's been offline will not have a value, causing a read repair. + Since difference calculation logic is using a token for it's hashmap key and the + token value is the same for both keys, this causes an incorrect diff calculation + and propagation to the node that was offline. + 7. Run the same SELECT query with ALL consistency level, forcing reading from all 3 nodes. + now there is also a conflict, since the node that was reset got an incorrect value as a + result of and prev step read repair. This incorrect value is newer than others, thus it + will be the result of reconciliation in case the diff calculation algorithm is using a + token as a key. + + """ + logger.info("Creating a new cluster") + srvs = await manager.servers_add(3) + cql, _ = await manager.get_ready_cql(srvs) + + await cql.run_async("CREATE KEYSPACE ks WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 3};") + await cql.run_async("CREATE TABLE ks.t (pk bigint PRIMARY KEY, c int);") + + # Stop one of the nodes. + await manager.server_stop_gracefully(srvs[0].server_id) + + # Add rows with partition kays that cause murmur3 hash collision, token value [6874760189787677834]. + pk1 = -4818441857111425024 + pk2 = -8686612841249112064 + await cql.run_async(SimpleStatement(f"INSERT INTO ks.t (pk, c) VALUES ({pk1}, 111)", consistency_level=ConsistencyLevel.ONE)) + await cql.run_async(SimpleStatement(f"INSERT INTO ks.t (pk, c) VALUES ({pk2}, 222)", consistency_level=ConsistencyLevel.ONE)) + + # Start the offline node. + await manager.server_start(srvs[0].server_id, wait_others=2) + + # Run a SELECT query with ALL consistency level, forcing reading from all 3 nodes. + res = await cql.run_async(SimpleStatement("SELECT * FROM ks.t", consistency_level=ConsistencyLevel.ALL)) + + # Validate the results (should be OK). + assert len(res) == 2 + for row in res: + if (row.pk == pk1): + assert row.c == 111 + elif (row.pk == pk2): + assert row.c == 222 + + res = await cql.run_async(SimpleStatement("SELECT * FROM ks.t", consistency_level=ConsistencyLevel.ALL)) + + # Validate the results (will be wrong in case the diff calculation hash map uses tokens as keys). + assert len(res) == 2 + for row in res: + if (row.pk == pk1): + assert row.c == 111 + elif (row.pk == pk2): + assert row.c == 222