Merge '[Backport 6.0] Fail bootstrap if ip mapping is missing during double write stage' from ScyllaDB

If a node restart just before it stores bootstrapping node's IP it will
not have ID to IP mapping for bootstrapping node which may cause failure
on a write path. Detect this and fail bootstrapping if it happens.

(cherry picked from commit 1faef47952)

(cherry picked from commit 27445f5291)

(cherry picked from commit 6853b02c00)

(cherry picked from commit f91db0c1e4)

 Refs #18927

Closes scylladb/scylladb#19118

* github.com:scylladb/scylladb:
  raft topology: fix indentation after previous commit
  raft topology: do not add bootstrapping node without IP as pending
  test: add test of bootstrap where the coordinator crashes just before storing IP mapping
  schema_tables: remove unused code
This commit is contained in:
Kamil Braun
2024-06-06 11:35:13 +02:00
3 changed files with 86 additions and 16 deletions

View File

@@ -0,0 +1,55 @@
# Copyright (C) 2024-present ScyllaDB
#
# SPDX-License-Identifier: AGPL-3.0-or-later
#
import asyncio
from test.pylib.manager_client import ManagerClient
import pytest
import logging
from test.pylib.rest_client import inject_error_one_shot
logger = logging.getLogger(__name__)
@pytest.mark.asyncio
async def test_broken_bootstrap(manager: ManagerClient):
server_a = await manager.server_add()
server_b = await manager.server_add(start=False)
await manager.cql.run_async("CREATE KEYSPACE test WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1}")
await manager.cql.run_async("CREATE TABLE test.test (a int PRIMARY KEY, b int)")
for i in range(100):
await manager.cql.run_async(f"INSERT INTO test.test (a, b) VALUES ({i}, {i})")
await inject_error_one_shot(manager.api, server_a.ip_addr, "crash-before-bootstrapping-node-added")
try:
# Timeout fast since we do not expect the operation to complete
# because the coordinator is dead by now due to the error injection
# above
await manager.server_start(server_b.server_id, timeout=5)
pytest.fail("Expected server_add to fail")
except Exception:
pass
await manager.server_stop(server_b.server_id)
await manager.server_stop(server_a.server_id)
stop_event = asyncio.Event()
async def worker():
logger.info("Worker started")
while not stop_event.is_set():
for i in range(100):
await manager.cql.run_async(f"INSERT INTO test.test (a, b) VALUES ({i}, {i})")
response = await manager.cql.run_async(f"SELECT * FROM test.test WHERE a = {i}")
assert response[0].b == i
await asyncio.sleep(0.1)
logger.info("Worker stopped")
await manager.server_start(server_a.server_id)
await manager.driver_connect()
worker_task = asyncio.create_task(worker())
await asyncio.sleep(20)
stop_event.set()
await worker_task