From 801bf42ea2622355fee799bc1e461e635e3f4d21 Mon Sep 17 00:00:00 2001 From: Petr Gusev Date: Tue, 24 Jun 2025 17:45:42 +0200 Subject: [PATCH] sharder: add try_get_shard_for_reads method MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, we use storage_proxy/get_cas_shard -> sharder.shard_for_reads to decide which shard to use for LWT code execution on both replicas and the coordinator. If the coordinator is not a replica, shard_for_reads returns 0 — the 'default' shard. This behavior has at least two problems: * Shard 0 may become overloaded, because all LWT coordinators that are not replicas will be served on it. * The zero shard does not match shard_for_reads on replicas, which hinders the "same shard for client and server" RPC-level optimization. To fix this, we need to know whether the current node hosts a replica for the tablet corresponding to the given token. Currently, there is no API we could use for this. For historical reasons, sharder::shard_for_reads returns 0 when the node does not host the shard, which leads to ambiguity. This commit introduces try_get_shard_for_reads, which returns a disengaged std::optional when the tablet is not present on the local node. We leave the shard_for_reads method in the base sharder class; it calls try_get_shard_for_reads and returns zero by default. We need to rename the tablet_sharder private methods shard_for_reads and shard_for_writes so that they don't conflict with sharder::shard_for_reads. 
--- dht/auto_refreshing_sharder.hh | 2 +- dht/i_partitioner.cc | 8 ++++---- dht/token-sharding.hh | 15 +++++++++++++-- locator/tablet_sharder.hh | 15 ++++++--------- 4 files changed, 24 insertions(+), 16 deletions(-) diff --git a/dht/auto_refreshing_sharder.hh b/dht/auto_refreshing_sharder.hh index 7ad6e6ed22..de74449f5b 100644 --- a/dht/auto_refreshing_sharder.hh +++ b/dht/auto_refreshing_sharder.hh @@ -41,7 +41,7 @@ public: virtual ~auto_refreshing_sharder(); - virtual unsigned shard_for_reads(const token& t) const override; + virtual std::optional try_get_shard_for_reads(const token& t) const override; virtual dht::shard_replica_set shard_for_writes(const token& t, std::optional sel) const override; diff --git a/dht/i_partitioner.cc b/dht/i_partitioner.cc index 203f61a0f3..1a03af4fa7 100644 --- a/dht/i_partitioner.cc +++ b/dht/i_partitioner.cc @@ -43,8 +43,8 @@ static_sharder::shard_of(const token& t) const { return dht::shard_of(_shard_count, _sharding_ignore_msb_bits, t); } -unsigned -static_sharder::shard_for_reads(const token& t) const { +std::optional +static_sharder::try_get_shard_for_reads(const token& t) const { return shard_of(t); } @@ -513,8 +513,8 @@ auto_refreshing_sharder::refresh() { }); } -unsigned auto_refreshing_sharder::shard_for_reads(const token& t) const { - return _sharder->shard_for_reads(t); +std::optional auto_refreshing_sharder::try_get_shard_for_reads(const token& t) const { + return _sharder->try_get_shard_for_reads(t); } dht::shard_replica_set diff --git a/dht/token-sharding.hh b/dht/token-sharding.hh index 3cb4a08648..86aa1b8572 100644 --- a/dht/token-sharding.hh +++ b/dht/token-sharding.hh @@ -54,6 +54,12 @@ public: sharder(unsigned shard_count = smp::count, unsigned sharding_ignore_msb_bits = 0); virtual ~sharder() = default; + /** + * Returns the shard that handles a particular token for reads, or empty if this + * node doesn't contain data for it. 
+ */ + virtual std::optional try_get_shard_for_reads(const token& t) const = 0; + /** * Returns the shard that handles a particular token for reads. * Use shard_for_writes() to determine the set of shards that should receive writes. @@ -67,7 +73,12 @@ public: * } * */ - virtual unsigned shard_for_reads(const token& t) const = 0; + unsigned shard_for_reads(const token& t) const { + // FIXME: Consider throwing when there is no owning shard on the current host rather than returning 0. + // It's a coordination mistake to route requests to non-owners. Topology coordinator should synchronize + // with request coordinators before moving the shard away. + return try_get_shard_for_reads(t).value_or(0); + } /** * Returns the set of shards which should receive a write to token t. @@ -134,7 +145,7 @@ public: virtual std::optional next_shard(const token& t) const; virtual token token_for_next_shard(const token& t, shard_id shard, unsigned spans = 1) const; - virtual unsigned shard_for_reads(const token& t) const override; + virtual std::optional try_get_shard_for_reads(const token& t) const override; virtual shard_replica_set shard_for_writes(const token& t, std::optional sel) const override; virtual token token_for_next_shard_for_reads(const token& t, shard_id shard, unsigned spans = 1) const override; virtual std::optional next_shard_for_reads(const token& t) const override; diff --git a/locator/tablet_sharder.hh b/locator/tablet_sharder.hh index 9b52a8fb29..38a43b7859 100644 --- a/locator/tablet_sharder.hh +++ b/locator/tablet_sharder.hh @@ -40,7 +40,7 @@ private: return std::nullopt; }; - dht::shard_replica_set shard_for_writes(tablet_id tid, host_id host, std::optional sel = std::nullopt) const { + dht::shard_replica_set choose_shard_for_writes(tablet_id tid, host_id host, std::optional sel = std::nullopt) const { auto* trinfo = _tmap->get_tablet_transition_info(tid); auto& tinfo = _tmap->get_tablet_info(tid); dht::shard_replica_set shards; @@ -78,7 +78,7 @@ private: 
return shards; } - std::optional shard_for_reads(tablet_id tid, host_id host) const { + std::optional choose_shard_for_reads(tablet_id tid, host_id host) const { ensure_tablet_map(); auto* trinfo = _tmap->get_tablet_transition_info(tid); auto& tinfo = _tmap->get_tablet_info(tid); @@ -107,13 +107,10 @@ public: virtual ~tablet_sharder() = default; - virtual unsigned shard_for_reads(const token& t) const override { + virtual std::optional try_get_shard_for_reads(const token& t) const override { ensure_tablet_map(); auto tid = _tmap->get_tablet_id(t); - // FIXME: Consider throwing when there is no owning shard on the current host rather than returning 0. - // It's a coordination mistake to route requests to non-owners. Topology coordinator should synchronize - // with request coordinators before moving the shard away. - auto shard = shard_for_reads(tid, _host).value_or(0); + auto shard = choose_shard_for_reads(tid, _host); tablet_logger.trace("[{}] shard_of({}) = {}, tablet={}", _table, t, shard, tid); return shard; } @@ -121,7 +118,7 @@ public: virtual dht::shard_replica_set shard_for_writes(const token& t, std::optional sel = std::nullopt) const override { ensure_tablet_map(); auto tid = _tmap->get_tablet_id(t); - auto shards = shard_for_writes(tid, _host, sel); + auto shards = choose_shard_for_writes(tid, _host, sel); tablet_logger.trace("[{}] shard_for_writes({}) = {}, tablet={}", _table, t, shards, tid); return shards; } @@ -130,7 +127,7 @@ public: ensure_tablet_map(); std::optional tb = _tmap->get_tablet_id(t); while ((tb = _tmap->next_tablet(*tb))) { - auto r = shard_for_reads(*tb, _host); + auto r = choose_shard_for_reads(*tb, _host); auto next = _tmap->get_first_token(*tb); tablet_logger.trace("[{}] token_for_next_shard({}) = {{{}, {}}}, tablet={}", _table, t, next, r, *tb); return dht::shard_and_token{r.value_or(0), next};