mirror of
https://github.com/scylladb/scylladb.git
synced 2026-04-20 08:30:35 +00:00
Calling a migration notification from within the context of a notification
callback can lead to a deadlock with unregistering a listener:
A: the parent notification is called. It calls thread_for_each, which
acquires a read lock on the vector of listeners and calls the
callback function for each listener while holding the lock.
B: a listener is unregistered. It calls `remove` and tries to acquire a
write lock on the vector of listeners. It waits because the read lock is
held.
A: the callback function calls another notification, which calls
thread_for_each and tries to acquire the read lock again, but it
waits because a writer (B) is already waiting for the lock.
We currently have one concrete scenario of this kind: when creating a table,
the callback of `before_create_column_family` in the tablet allocator
calls `before_allocate_tablet_map`, and this can deadlock with node
shutdown, where we unregister listeners.
Fix this by not acquiring the read lock again in the nested
notification. There is no need because the read lock is already held by
the parent notification while the child notification is running. We add
a function `thread_for_each_nested` that is similar to `thread_for_each`
except it assumes the read lock is already held and doesn't acquire it,
and it should be used for nested notifications instead of
`thread_for_each`.
Fixes scylladb/scylladb#27364
Closes scylladb/scylladb#27637
(cherry picked from commit 55f4a2b754)
Closes scylladb/scylladb#28557
47 lines
2.0 KiB
Python
47 lines
2.0 KiB
Python
#
|
|
# Copyright (C) 2025-present ScyllaDB
|
|
#
|
|
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
|
#
|
|
import asyncio
|
|
import pytest
|
|
from test.cluster.conftest import skip_mode
|
|
from test.pylib.manager_client import ManagerClient
|
|
from test.cluster.util import new_test_keyspace, reconnect_driver
|
|
|
|
|
|
@pytest.mark.asyncio
@skip_mode('release', "error injections aren't enabled in release mode")
async def test_create_table_notification_deadlock_with_shutdown(manager: ManagerClient):
    """
    Race a CREATE TABLE statement against a graceful node shutdown.

    The table creation is paused by an error injection while its notification
    is running, the node is then asked to stop (which unregisters migration
    listeners), and finally the paused notification is released. The shutdown
    task is awaited afterwards, so the test can only finish if the shutdown
    does not deadlock with the notification.

    Reproduces scylladb/scylladb#27364
    """
    injection_name = "pause_in_allocate_tablets_for_new_table"

    node = await manager.server_add()
    session = manager.get_cql()

    async with new_test_keyspace(manager, "") as ks:
        await manager.api.enable_injection(node.ip_addr, injection_name, one_shot=True)

        # Fire the CREATE TABLE without waiting for it: it is expected to
        # block at the injection point while the notification is running.
        create_table_stmt = f"CREATE TABLE {ks}.t (pk int primary key, v int)"
        session.run_async(create_table_stmt)

        server_log = await manager.server_open_log(node.server_id)
        log_mark = await server_log.mark()

        # Kick off the shutdown in the background. It is expected to block
        # while unregistering a listener, because the running notification
        # holds the lock of the migration listeners vector.
        shutdown_task = asyncio.create_task(
            manager.server_stop_gracefully(node.server_id)
        )
        await server_log.wait_for(
            'Shutting down native transport server',
            timeout=60,
            from_mark=log_mark,
        )
        await asyncio.sleep(1)

        # Release the paused notification so it proceeds to the nested
        # notification. With a waiter queued on the lock, reading the
        # migration listeners vector here is where the deadlock may occur.
        await manager.api.message_injection(node.ip_addr, injection_name)
        await shutdown_task

        # Bring the node back and reconnect, so the keyspace can be dropped
        # when the context manager exits.
        await manager.server_start(node.server_id)
        await reconnect_driver(manager)
|