test: Add test case for checking read repair diff calculation when having

conflicting keys.

The test writes two rows whose partition keys produce a Murmur3 hash collision.
Murmur3 hashes are used to generate Scylla tokens, and these tokens are involved
in read repair diff calculations. Because the two token values are identical, a
hash map key collision occurs. Consequently, an incorrect value from the second
row (with a different primary key) is sent for writing as 'repaired', causing
data corruption.

(cherry picked from commit e577f1d141)
This commit is contained in:
Sergey Zolotukhin
2024-12-18 15:50:03 +01:00
committed by GitHub Action
parent 52a09a2f2d
commit 63d58022a6

View File

@@ -0,0 +1,82 @@
#
# Copyright (C) 2024-present ScyllaDB
#
# SPDX-License-Identifier: AGPL-3.0-or-later
#
import logging
import pytest
import asyncio
import time
from cassandra import ConsistencyLevel # type: ignore
from cassandra.query import SimpleStatement # type: ignore
from test.pylib.manager_client import ManagerClient
from test.pylib.util import wait_for_cql_and_get_hosts
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
@pytest.mark.xfail(reason="https://github.com/scylladb/scylladb/issues/19101")
@pytest.mark.asyncio
async def test_read_repair_with_conflicting_hash_keys(request: pytest.FixtureRequest, manager: ManagerClient) -> None:
    """
    Test that conflicting hash keys are handled correctly during read repair.

    Issue https://github.com/scylladb/scylladb/issues/19101

    1. Create a new cluster with 3 nodes.
    2. Create a keyspace and a table with replication factor = 3.
    3. Stop one of the nodes.
    4. Add 2 rows that have primary keys causing a hash collision.
    5. Start the offline node.
    6. Run a SELECT query with ALL consistency level, forcing reading from all 3 nodes.
       The node that's been offline will not have a value, causing a read repair.
       Since the difference calculation logic is using a token for its hashmap key and
       the token value is the same for both keys, this causes an incorrect diff
       calculation and propagation to the node that was offline.
    7. Run the same SELECT query with ALL consistency level, forcing reading from all
       3 nodes. Now there is also a conflict, since the node that was offline got an
       incorrect value as a result of the read repair in the previous step. This
       incorrect value is newer than the others, thus it will be the result of
       reconciliation in case the diff calculation algorithm is using a token as a key.

    Args:
        request: pytest fixture request; not referenced by the test body.
        manager: cluster manager used to add the servers, obtain the CQL session,
            and stop/start a server.
    """
    logger.info("Creating a new cluster")
    srvs = await manager.servers_add(3)
    cql, _ = await manager.get_ready_cql(srvs)

    await cql.run_async("CREATE KEYSPACE ks WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 3};")
    await cql.run_async("CREATE TABLE ks.t (pk bigint PRIMARY KEY, c int);")

    # Stop one of the nodes.
    await manager.server_stop_gracefully(srvs[0].server_id)

    # Add rows with partition keys that cause murmur3 hash collision, token value [6874760189787677834].
    pk1 = -4818441857111425024
    pk2 = -8686612841249112064
    # Exact pk -> c mapping that every read below must return.
    expected = {pk1: 111, pk2: 222}
    await cql.run_async(SimpleStatement(f"INSERT INTO ks.t (pk, c) VALUES ({pk1}, 111)", consistency_level=ConsistencyLevel.ONE))
    await cql.run_async(SimpleStatement(f"INSERT INTO ks.t (pk, c) VALUES ({pk2}, 222)", consistency_level=ConsistencyLevel.ONE))

    # Start the offline node.
    await manager.server_start(srvs[0].server_id, wait_others=2)

    # Run a SELECT query with ALL consistency level, forcing reading from all 3 nodes.
    res = await cql.run_async(SimpleStatement("SELECT * FROM ks.t", consistency_level=ConsistencyLevel.ALL))

    # Validate the results (should be OK).
    # The row count is checked separately because building the dict would collapse
    # duplicate keys; the dict comparison also rejects rows with an unexpected pk,
    # which an if/elif chain over the known keys would silently skip.
    assert len(res) == 2
    assert {row.pk: row.c for row in res} == expected

    res = await cql.run_async(SimpleStatement("SELECT * FROM ks.t", consistency_level=ConsistencyLevel.ALL))

    # Validate the results (will be wrong in case the diff calculation hash map uses tokens as keys).
    assert len(res) == 2
    assert {row.pk: row.c for row in res} == expected