From ebaf5364490ad5dcc7138ffca31500190bfccb6f Mon Sep 17 00:00:00 2001
From: Wojciech Mitros
Date: Tue, 28 Apr 2026 12:30:33 +0200
Subject: [PATCH] replica/database: fix cross-shard deadlock in
 lock_tables_metadata()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

lock_tables_metadata() acquires a write lock on tables_metadata._cf_lock
on every shard. It used invoke_on_all(), which dispatches lock
acquisitions to all shards in parallel via parallel_for_each +
smp::submit_to. When two fibers call lock_tables_metadata()
concurrently, this can deadlock.

parallel_for_each starts all iterations unconditionally: even when the
local shard's lock attempt blocks (because the other fiber already
holds it), SMP messages are still sent to remote shards. Both fibers'
lock-acquisition messages land in the per-shard SMP queues. The SMP
queue itself is FIFO, but process_incoming() drains it and schedules
each item as a reactor task via add_task(), which, in debug and
sanitize builds with SEASTAR_SHUFFLE_TASK_QUEUE, shuffles each newly
added task against all pending tasks in the same scheduling group's
reactor task queue. This means fiber A's lock acquisition can be
reordered past fiber B's (and past unrelated tasks) on a given shard.
If fiber A wins the lock on shard X while fiber B wins on shard Y, the
result is a classic cross-shard lock-ordering deadlock (circular wait).
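For illustration, a minimal self-contained C++ analogue of that
circular wait; this is a sketch for this message only, not Scylla code.
Plain std::mutex stands in for each shard's tables_metadata lock,
std::thread for the fibers, and all names (shard_x_lock, shard_y_lock,
fiber_a, fiber_b) are made up. The sleeps force the unlucky
interleaving that the task-queue shuffle merely makes likely:

    // Minimal analogue of the cross-shard lock-ordering deadlock.
    // WARNING: this program deadlocks by design and never exits.
    #include <chrono>
    #include <mutex>
    #include <thread>

    std::mutex shard_x_lock;  // stand-in for shard X's lock
    std::mutex shard_y_lock;  // stand-in for shard Y's lock

    int main() {
        // Fiber A: wins shard X first, then needs shard Y.
        std::thread fiber_a([] {
            std::lock_guard<std::mutex> lx(shard_x_lock);
            std::this_thread::sleep_for(std::chrono::milliseconds(100));
            std::lock_guard<std::mutex> ly(shard_y_lock);  // blocks: B holds Y
        });
        // Fiber B: wins shard Y first, then needs shard X.
        std::thread fiber_b([] {
            std::lock_guard<std::mutex> ly(shard_y_lock);
            std::this_thread::sleep_for(std::chrono::milliseconds(100));
            std::lock_guard<std::mutex> lx(shard_x_lock);  // blocks: A holds X
        });
        fiber_a.join();  // never returns: circular wait between A and B
        fiber_b.join();
    }

Running it hangs on fiber_a.join(), mirroring the stuck state observed
in topology_state_load().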
In production builds without SEASTAR_SHUFFLE_TASK_QUEUE, the reactor
task queue is FIFO. Still, even in release builds the SMP queues can
reorder messages, so the deadlock is still possible, though much less
likely. In debug and sanitize builds, the task-queue shuffle makes the
deadlock very likely whenever both fibers' lock-acquisition tasks are
pending simultaneously in the reactor task queue on any shard.

This deadlock was exposed by ce00d61917 ("db: implement large_data
virtual tables with feature flag gating", merged as 88a8324e68), which
introduced legacy_drop_table_on_all_shards as a second caller of
lock_tables_metadata(). When LARGE_DATA_VIRTUAL_TABLES is enabled
during topology_state_load (via feature_service::enable), two fibers
can race:

1. activate_large_data_virtual_tables(): calls
   legacy_drop_table_on_all_shards(), which calls
   lock_tables_metadata() synchronously via .get()

2. reload_schema_in_bg(): fires as a background fiber from
   TABLE_DIGEST_INSENSITIVE_TO_EXPIRY and eventually reaches
   schema_applier::commit(), which also calls lock_tables_metadata()

If both reach lock_tables_metadata() while the lock is free on all
shards, the parallel acquisition creates the deadlock opportunity.

The deadlock blocks topology_state_load() from completing, which
prevents the bootstrapping node from finishing its topology state
transitions. The topology coordinator then waits for the node to reach
the expected state, but the node is stuck, so eventually the
read_barrier times out after 300 seconds.

Fix by acquiring the shard 0 lock before attempting to acquire any
other lock. Whichever fiber wins shard 0 is guaranteed to acquire all
remaining shards before the other fiber can proceed past shard 0,
eliminating the circular-wait condition.
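In the same illustrative vein (again std::mutex and std::thread, not
the actual Seastar primitives; lock_tables_metadata_analogue,
shard0_lock and other_locks are invented names), the fix corresponds to
turning shard 0's lock into a gate that totally orders the two
acquisition sequences:

    // Analogue of the fix: shard 0's lock acts as a gate. Both
    // "fibers" must win it before touching any other shard, so
    // whichever wins completes the whole sequence while the other
    // waits at the gate.
    #include <mutex>
    #include <thread>
    #include <vector>

    std::mutex shard0_lock;                  // the gate (shard 0)
    std::vector<std::mutex> other_locks(3);  // shards 1..3; count is arbitrary

    void lock_tables_metadata_analogue() {
        std::lock_guard<std::mutex> gate(shard0_lock);
        // The gate holder has no competitor here, so the acquisition
        // order of the remaining locks no longer matters.
        std::vector<std::unique_lock<std::mutex>> held;
        for (auto& m : other_locks) {
            held.emplace_back(m);            // unique_lock's ctor locks m
        }
        // ... critical section; everything unlocks on scope exit ...
    }

    int main() {
        std::thread a(lock_tables_metadata_analogue);
        std::thread b(lock_tables_metadata_analogue);
        a.join();
        b.join();  // always completes: no circular wait can form
    }

Whoever wins shard0_lock acquires the rest unopposed; the loser parks
at the gate instead of grabbing some other shard's lock. This is the
same reasoning the comment added to database::lock_tables_metadata()
spells out.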
Tested manually with two approaches:

1. Causing different shard locks to be acquired by different
   lock_tables_metadata() calls, by adding different sleeps depending
   on the lock_tables_metadata() call and target shard. This
   reproduced the issue consistently.

2. Matching the time point at which both fibers reach
   lock_tables_metadata() by adding a single sleep to one of the
   fibers. This heavily depends on the machine, so we can't create a
   universal reproducer this way, but it did result in the observed
   failure on my machine after finding the right sleep time.

Also added a unit test for concurrent lock_tables_metadata() calls.

Fixes: SCYLLADB-1694
Fixes: SCYLLADB-1644
Fixes: SCYLLADB-1684

Closes scylladb/scylladb#29678
---
 configure.py                            |  1 +
 replica/database.cc                     | 20 +++++++++++++-
 test/boost/CMakeLists.txt               |  2 ++
 test/boost/lock_tables_metadata_test.cc | 36 +++++++++++++++++++++++++
 4 files changed, 58 insertions(+), 1 deletion(-)
 create mode 100644 test/boost/lock_tables_metadata_test.cc

diff --git a/configure.py b/configure.py
index 8f23fb688b..31cdce0d5e 100755
--- a/configure.py
+++ b/configure.py
@@ -593,6 +593,7 @@ scylla_tests = set([
     'test/boost/linearizing_input_stream_test',
     'test/boost/lister_test',
     'test/boost/locator_topology_test',
+    'test/boost/lock_tables_metadata_test',
     'test/boost/log_heap_test',
     'test/boost/logalloc_standard_allocator_segment_pool_backend_test',
     'test/boost/logalloc_test',
diff --git a/replica/database.cc b/replica/database.cc
index 5d1298a251..5bc953ec97 100644
--- a/replica/database.cc
+++ b/replica/database.cc
@@ -1328,9 +1328,27 @@ future get_table_on_all_shards(sharded<database>& sharded_db,
 }
 
 future<database::tables_metadata_lock_on_all_shards> database::lock_tables_metadata(sharded<database>& sharded_db) {
     tables_metadata_lock_on_all_shards locks;
-    co_await sharded_db.invoke_on_all([&] (auto& db) -> future<> {
+    // Acquire write lock on shard 0 first, and then on the remaining shards.
+    //
+    // Parallel acquisition on all shards could deadlock when two
+    // fibers call lock_tables_metadata() concurrently: parallel_for_each
+    // sends SMP messages to all shards even when the local shard's lock
+    // attempt blocks. If task reordering (SEASTAR_SHUFFLE_TASK_QUEUE in
+    // debug/sanitize builds) causes fiber A to win on shard X while
+    // fiber B wins on shard Y, neither can make progress: a classic
+    // cross-shard lock-ordering deadlock.
+    //
+    // Acquiring the write lock on shard 0 first, and then on the remaining
+    // shards, eliminates this: whichever fiber acquires shard 0 first is
+    // guaranteed to acquire locks on all other shards before the other fiber
+    // can acquire the lock on shard 0.
+    co_await sharded_db.invoke_on(0, [&locks, &sharded_db] (auto& db) -> future<> {
         locks.assign_lock(co_await db.get_tables_metadata().hold_write_lock());
+        co_await sharded_db.invoke_on_others([&locks] (auto& db) -> future<> {
+            locks.assign_lock(co_await db.get_tables_metadata().hold_write_lock());
+        });
     });
+
     co_return locks;
 }
diff --git a/test/boost/CMakeLists.txt b/test/boost/CMakeLists.txt
index 4f0fa203ec..36b8789bc6 100644
--- a/test/boost/CMakeLists.txt
+++ b/test/boost/CMakeLists.txt
@@ -150,6 +150,8 @@ add_scylla_test(lister_test
   KIND SEASTAR)
 add_scylla_test(locator_topology_test
   KIND SEASTAR)
+add_scylla_test(lock_tables_metadata_test
+  KIND SEASTAR)
 add_scylla_test(log_heap_test
   KIND BOOST)
 add_scylla_test(logalloc_standard_allocator_segment_pool_backend_test
diff --git a/test/boost/lock_tables_metadata_test.cc b/test/boost/lock_tables_metadata_test.cc
new file mode 100644
index 0000000000..158a0e732d
--- /dev/null
+++ b/test/boost/lock_tables_metadata_test.cc
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) 2026-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
+ */
+
+#include <seastar/testing/test_case.hh>
+#include <seastar/core/when_all.hh>
+#include <seastar/core/with_timeout.hh>
+#include "test/lib/cql_test_env.hh"
+
+using namespace std::chrono_literals;
+
+// Test that two lock_tables_metadata calls don't deadlock
+SEASTAR_TEST_CASE(test_lock_tables_metadata_deadlock) {
+    return do_with_cql_env_thread([](cql_test_env& e) {
+        try {
+            // Repeat the test scenario to increase the chance of hitting the deadlock.
+            // If no deadlock occurs, each repetition should complete within a fraction of a second,
+            // so even with 100 repetitions, the total test time should be reasonable.
+            for (int i = 0; i < 100; ++i) {
+                with_timeout(lowres_clock::now() + 30s,
+                    when_all_succeed(
+                        e.local_db().lock_tables_metadata(e.db()).discard_result(),
+                        e.local_db().lock_tables_metadata(e.db()).discard_result()
+                    )
+                ).get();
+            }
+        } catch (seastar::timed_out_error&) {
+            fmt::print(stderr, "FAIL: lock_tables_metadata deadlocked (timed out after 30s)\n");
+            _exit(1);
+        }
+    });
+}