mirror of
https://github.com/scylladb/scylladb.git
synced 2026-06-06 06:53:12 +00:00
test: Add test case for checking read repair diff calculation when having
conflicting keys.
The test updates two rows with keys that result in a Murmur3 hash collision, which
is used to generate Scylla tokens. These tokens are involved in read repair diff
calculations. Due to the identical token values, a hash map key collision occurs.
Consequently, an incorrect value from the second row (with a different primary key)
is then sent for writing as 'repaired', causing data corruption.
(cherry picked from commit e577f1d141)
This commit is contained in:
committed by
GitHub Action
parent
52a09a2f2d
commit
63d58022a6
82
test/topology_custom/test_conflicting_keys_read_repair.py
Normal file
82
test/topology_custom/test_conflicting_keys_read_repair.py
Normal file
@@ -0,0 +1,82 @@
|
||||
#
|
||||
# Copyright (C) 2024-present ScyllaDB
|
||||
#
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
#
|
||||
import logging
|
||||
import pytest
|
||||
import asyncio
|
||||
import time
|
||||
|
||||
from cassandra import ConsistencyLevel # type: ignore
|
||||
from cassandra.query import SimpleStatement # type: ignore
|
||||
from test.pylib.manager_client import ManagerClient
|
||||
from test.pylib.util import wait_for_cql_and_get_hosts
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@pytest.mark.xfail(reason="https://github.com/scylladb/scylladb/issues/19101")
|
||||
@pytest.mark.asyncio
|
||||
async def test_read_repair_with_conflicting_hash_keys(request: pytest.FixtureRequest, manager: ManagerClient) -> None:
|
||||
"""
|
||||
Test that conflicting hash keys are handled correctly during read repair.
|
||||
Issue https://github.com/scylladb/scylladb/issues/19101
|
||||
|
||||
1. Create a new cluster with 3 nodes.
|
||||
2. Create a keyspace and a table with replication factor = 3.
|
||||
3. Stop one of the nodes.
|
||||
4. Add 2 rows that have primary keys causing a hash collision.
|
||||
5. Start the offline node.
|
||||
6. Run a SELECT query with ALL consistency level, forcing reading from all 3 nodes.
|
||||
The node that's been offline will not have a value, causing a read repair.
|
||||
Since difference calculation logic is using a token for it's hashmap key and the
|
||||
token value is the same for both keys, this causes an incorrect diff calculation
|
||||
and propagation to the node that was offline.
|
||||
7. Run the same SELECT query with ALL consistency level, forcing reading from all 3 nodes.
|
||||
now there is also a conflict, since the node that was reset got an incorrect value as a
|
||||
result of and prev step read repair. This incorrect value is newer than others, thus it
|
||||
will be the result of reconciliation in case the diff calculation algorithm is using a
|
||||
token as a key.
|
||||
|
||||
"""
|
||||
logger.info("Creating a new cluster")
|
||||
srvs = await manager.servers_add(3)
|
||||
cql, _ = await manager.get_ready_cql(srvs)
|
||||
|
||||
await cql.run_async("CREATE KEYSPACE ks WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 3};")
|
||||
await cql.run_async("CREATE TABLE ks.t (pk bigint PRIMARY KEY, c int);")
|
||||
|
||||
# Stop one of the nodes.
|
||||
await manager.server_stop_gracefully(srvs[0].server_id)
|
||||
|
||||
# Add rows with partition kays that cause murmur3 hash collision, token value [6874760189787677834].
|
||||
pk1 = -4818441857111425024
|
||||
pk2 = -8686612841249112064
|
||||
await cql.run_async(SimpleStatement(f"INSERT INTO ks.t (pk, c) VALUES ({pk1}, 111)", consistency_level=ConsistencyLevel.ONE))
|
||||
await cql.run_async(SimpleStatement(f"INSERT INTO ks.t (pk, c) VALUES ({pk2}, 222)", consistency_level=ConsistencyLevel.ONE))
|
||||
|
||||
# Start the offline node.
|
||||
await manager.server_start(srvs[0].server_id, wait_others=2)
|
||||
|
||||
# Run a SELECT query with ALL consistency level, forcing reading from all 3 nodes.
|
||||
res = await cql.run_async(SimpleStatement("SELECT * FROM ks.t", consistency_level=ConsistencyLevel.ALL))
|
||||
|
||||
# Validate the results (should be OK).
|
||||
assert len(res) == 2
|
||||
for row in res:
|
||||
if (row.pk == pk1):
|
||||
assert row.c == 111
|
||||
elif (row.pk == pk2):
|
||||
assert row.c == 222
|
||||
|
||||
res = await cql.run_async(SimpleStatement("SELECT * FROM ks.t", consistency_level=ConsistencyLevel.ALL))
|
||||
|
||||
# Validate the results (will be wrong in case the diff calculation hash map uses tokens as keys).
|
||||
assert len(res) == 2
|
||||
for row in res:
|
||||
if (row.pk == pk1):
|
||||
assert row.c == 111
|
||||
elif (row.pk == pk2):
|
||||
assert row.c == 222
|
||||
Reference in New Issue
Block a user