mirror of
https://github.com/scylladb/scylladb.git
synced 2026-05-23 00:02:37 +00:00
This patch continues the effort to split the huge executor.cc (5000 lines before this patch) even more. In this patch we introduce a new source file, executor_util.cc, for various utility functions that are used for many different operations and therefore are useful to have in a header file. These utility functions will now be in executor_util.cc and executor_util.hh - instead of executor.cc and executor.hh. Various source files, including executor.cc, the executor_read.cc introduced in the previous patch, as well as older source files like as streams.cc, ttl.cc and serialization.cc, use the new header file. This patch removes over 700 lines of code from executor.cc, and also removes a large amount of utility functions declerations from executor.hh. Originally, executor.hh was meant to be about the interface that the Alternator server needs to *execute* the different DynamoDB API operations - and after this patch it returns closer to this original goal. Signed-off-by: Nadav Har'El <nyh@scylladb.com>
939 lines
46 KiB
C++
939 lines
46 KiB
C++
/*
|
|
* Copyright 2021-present ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
|
*/
|
|
|
|
#include <chrono>
|
|
#include <cstdint>
|
|
#include <exception>
|
|
#include <optional>
|
|
#include <seastar/core/sstring.hh>
|
|
#include <seastar/core/coroutine.hh>
|
|
#include <seastar/core/sleep.hh>
|
|
#include <seastar/core/future.hh>
|
|
#include <seastar/core/lowres_clock.hh>
|
|
#include <seastar/coroutine/maybe_yield.hh>
|
|
|
|
#include "cdc/log.hh"
|
|
#include "exceptions/exceptions.hh"
|
|
#include "gms/gossiper.hh"
|
|
#include "gms/inet_address.hh"
|
|
#include "inet_address_vectors.hh"
|
|
#include "locator/abstract_replication_strategy.hh"
|
|
#include "utils/log.hh"
|
|
#include "gc_clock.hh"
|
|
#include "replica/database.hh"
|
|
#include "service/client_state.hh"
|
|
#include "service_permit.hh"
|
|
#include "mutation/timestamp.hh"
|
|
#include "service/storage_proxy.hh"
|
|
#include "service/pager/paging_state.hh"
|
|
#include "service/pager/query_pagers.hh"
|
|
#include "gms/feature_service.hh"
|
|
#include "mutation/mutation.hh"
|
|
#include "types/types.hh"
|
|
#include "types/map.hh"
|
|
#include "utils/assert.hh"
|
|
#include "utils/rjson.hh"
|
|
#include "utils/big_decimal.hh"
|
|
#include "cql3/selection/selection.hh"
|
|
#include "cql3/values.hh"
|
|
#include "cql3/query_options.hh"
|
|
#include "cql3/column_identifier.hh"
|
|
#include "alternator/executor.hh"
|
|
#include "alternator/executor_util.hh"
|
|
#include "alternator/controller.hh"
|
|
#include "alternator/serialization.hh"
|
|
#include "alternator/ttl_tag.hh"
|
|
#include "dht/sharder.hh"
|
|
#include "db/config.hh"
|
|
#include "db/tags/utils.hh"
|
|
#include "utils/labels.hh"
|
|
|
|
#include "ttl.hh"
|
|
|
|
static logging::logger tlogger("alternator_ttl");
|
|
|
|
namespace alternator {
|
|
|
|
future<executor::request_return_type> executor::update_time_to_live(client_state& client_state, service_permit permit, rjson::value request, std::unique_ptr<audit::audit_info_alternator>& audit_info) {
|
|
_stats.api_operations.update_time_to_live++;
|
|
if (!_proxy.features().alternator_ttl) {
|
|
co_return api_error::unknown_operation("UpdateTimeToLive not yet supported. Upgrade all nodes to a version that supports it.");
|
|
}
|
|
|
|
schema_ptr schema = get_table(_proxy, request);
|
|
|
|
maybe_audit(audit_info, audit::statement_category::DDL,
|
|
schema->ks_name(), schema->cf_name(), "UpdateTimeToLive", request);
|
|
|
|
rjson::value* spec = rjson::find(request, "TimeToLiveSpecification");
|
|
if (!spec || !spec->IsObject()) {
|
|
co_return api_error::validation("UpdateTimeToLive missing mandatory TimeToLiveSpecification");
|
|
}
|
|
const rjson::value* v = rjson::find(*spec, "Enabled");
|
|
if (!v || !v->IsBool()) {
|
|
co_return api_error::validation("UpdateTimeToLive requires boolean Enabled");
|
|
}
|
|
bool enabled = v->GetBool();
|
|
v = rjson::find(*spec, "AttributeName");
|
|
if (!v || !v->IsString()) {
|
|
co_return api_error::validation("UpdateTimeToLive requires string AttributeName");
|
|
}
|
|
// Although the DynamoDB documentation specifies that attribute names
|
|
// should be between 1 and 64K bytes, in practice, it only allows
|
|
// between 1 and 255 bytes. There are no other limitations on which
|
|
// characters are allowed in the name.
|
|
if (v->GetStringLength() < 1 || v->GetStringLength() > 255) {
|
|
co_return api_error::validation("The length of AttributeName must be between 1 and 255");
|
|
}
|
|
sstring attribute_name = rjson::to_sstring(*v);
|
|
|
|
co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, schema, auth::permission::ALTER, _stats);
|
|
co_await db::modify_tags(_mm, schema->ks_name(), schema->cf_name(), [&](std::map<sstring, sstring>& tags_map) {
|
|
if (enabled) {
|
|
if (tags_map.contains(TTL_TAG_KEY)) {
|
|
throw api_error::validation("TTL is already enabled");
|
|
}
|
|
tags_map[TTL_TAG_KEY] = attribute_name;
|
|
} else {
|
|
auto i = tags_map.find(TTL_TAG_KEY);
|
|
if (i == tags_map.end()) {
|
|
throw api_error::validation("TTL is already disabled");
|
|
} else if (i->second != attribute_name) {
|
|
throw api_error::validation(format(
|
|
"Requested to disable TTL on attribute {}, but a different attribute {} is enabled.",
|
|
attribute_name, i->second));
|
|
}
|
|
tags_map.erase(TTL_TAG_KEY);
|
|
}
|
|
});
|
|
|
|
// Prepare the response, which contains a TimeToLiveSpecification
|
|
// basically identical to the request's
|
|
rjson::value response = rjson::empty_object();
|
|
rjson::add(response, "TimeToLiveSpecification", std::move(*spec));
|
|
co_return rjson::print(std::move(response));
|
|
}
|
|
|
|
future<executor::request_return_type> executor::describe_time_to_live(client_state& client_state, service_permit permit, rjson::value request, std::unique_ptr<audit::audit_info_alternator>& audit_info) {
|
|
_stats.api_operations.describe_time_to_live++;
|
|
schema_ptr schema = get_table(_proxy, request);
|
|
|
|
maybe_audit(audit_info, audit::statement_category::QUERY,
|
|
schema->ks_name(), schema->cf_name(), "DescribeTimeToLive", request);
|
|
|
|
std::map<sstring, sstring> tags_map = get_tags_of_table_or_throw(schema);
|
|
rjson::value desc = rjson::empty_object();
|
|
auto i = tags_map.find(TTL_TAG_KEY);
|
|
if (i == tags_map.end()) {
|
|
rjson::add(desc, "TimeToLiveStatus", "DISABLED");
|
|
} else {
|
|
rjson::add(desc, "TimeToLiveStatus", "ENABLED");
|
|
rjson::add(desc, "AttributeName", rjson::from_string(i->second));
|
|
}
|
|
rjson::value response = rjson::empty_object();
|
|
rjson::add(response, "TimeToLiveDescription", std::move(desc));
|
|
co_return rjson::print(std::move(response));
|
|
}
|
|
|
|
// expiration_service is a sharded service responsible for cleaning up expired
|
|
// items in all tables with per-item expiration enabled. Currently, this means
|
|
// Alternator tables with TTL configured via an UpdateTimeToLive request.
|
|
//
|
|
// Here is a brief overview of how the expiration service works:
|
|
//
|
|
// An expiration thread on each shard periodically scans the items (i.e.,
|
|
// rows) owned by this shard, looking for items whose chosen expiration-time
|
|
// attribute indicates they are expired, and deletes those items.
|
|
// The expiration-time "attribute" can be either an actual Scylla column
|
|
// (must be numeric) or an Alternator "attribute" - i.e., an element in
|
|
// the ATTRS_COLUMN_NAME map<utf8,bytes> column where the numeric expiration
|
|
// time is encoded in DynamoDB's JSON encoding inside the bytes value.
|
|
// To avoid scanning the same items RF times in RF replicas, only one node is
|
|
// responsible for scanning a token range at a time. Normally, this is the
|
|
// node owning this range as a "primary range" (the first node in the ring
|
|
// with this range), but when this node is down, the secondary owner (the
|
|
// second in the ring) may take over.
|
|
// An expiration thread is responsible for all tables which need expiration
|
|
// scans. Currently, the different tables are scanned sequentially (not in
|
|
// parallel).
|
|
// The expiration thread scans item using CL=QUORUM to ensures that it reads
|
|
// a consistent expiration-time attribute. This means that the items are read
|
|
// locally and in addition QUORUM-1 additional nodes (one additional node
|
|
// when RF=3) need to read the data and send digests.
|
|
// When the expiration thread decides that an item has expired and wants
|
|
// to delete it, it does it using a CL=QUORUM write. This allows this
|
|
// deletion to be visible for consistent (quorum) reads. The deletion,
|
|
// like user deletions, will also appear on the CDC log and therefore
|
|
// Alternator Streams if enabled - currently as ordinary deletes (the
|
|
// userIdentity flag is currently missing this is issue #11523).
|
|
expiration_service::expiration_service(data_dictionary::database db, service::storage_proxy& proxy, gms::gossiper& g)
|
|
: _db(db)
|
|
, _proxy(proxy)
|
|
, _gossiper(g)
|
|
{
|
|
}
|
|
|
|
// Convert the big_decimal used to represent expiration time to an integer.
|
|
// Any fractional part is dropped. If the number is negative or invalid,
|
|
// 0 is returned, and if it's too high, the maximum unsigned long is returned.
|
|
static unsigned long bigdecimal_to_ul(const big_decimal& bd) {
|
|
// The big_decimal format has an integer mantissa of arbitrary length
|
|
// "unscaled_value" and then a (power of 10) exponent "scale".
|
|
if (bd.unscaled_value() <= 0) {
|
|
return 0;
|
|
}
|
|
if (bd.scale() == 0) {
|
|
// The fast path, when the expiration time is an integer, scale==0.
|
|
return static_cast<unsigned long>(bd.unscaled_value());
|
|
}
|
|
// Because the mantissa can be of arbitrary length, we work on it
|
|
// as a string. TODO: find a less ugly algorithm.
|
|
auto str = bd.unscaled_value().str();
|
|
if (bd.scale() > 0) {
|
|
int len = str.length();
|
|
if (len < bd.scale()) {
|
|
return 0;
|
|
}
|
|
str = str.substr(0, len-bd.scale());
|
|
} else {
|
|
if (bd.scale() < -20) {
|
|
return std::numeric_limits<unsigned long>::max();
|
|
}
|
|
for (int i = 0; i < -bd.scale(); i++) {
|
|
str.push_back('0');
|
|
}
|
|
}
|
|
// strtoul() returns ULONG_MAX if the number is too large, or 0 if not
|
|
// a number.
|
|
return strtoul(str.c_str(), nullptr, 10);
|
|
}
|
|
|
|
// The following is_expired() functions all check if an item with the given
|
|
// expiration time has expired, according to the DynamoDB API rules.
|
|
// The rules are:
|
|
// 1. If the expiration time attribute's value is not a number type,
|
|
// the item is not expired.
|
|
// 2. The expiration time is measured in seconds since the UNIX epoch.
|
|
// 3. If the expiration time is more than 5 years in the past, it is assumed
|
|
// to be malformed and ignored - and the item does not expire.
|
|
static bool is_expired(gc_clock::time_point expiration_time, gc_clock::time_point now) {
|
|
return expiration_time <= now &&
|
|
expiration_time > now - std::chrono::years(5);
|
|
}
|
|
|
|
static bool is_expired(const big_decimal& expiration_time, gc_clock::time_point now) {
|
|
unsigned long t = bigdecimal_to_ul(expiration_time);
|
|
// We assume - and the assumption turns out to be correct - that the
|
|
// epoch of gc_clock::time_point and the one used by the DynamoDB protocol
|
|
// are the same (the UNIX epoch in UTC). The resolution (seconds) is also
|
|
// the same.
|
|
return is_expired(gc_clock::time_point(gc_clock::duration(std::chrono::seconds(t))), now);
|
|
}
|
|
static bool is_expired(const rjson::value& expiration_time, gc_clock::time_point now) {
|
|
std::optional<big_decimal> n = try_unwrap_number(expiration_time);
|
|
return n && is_expired(*n, now);
|
|
}
|
|
|
|
// expire_item() expires an item - i.e., deletes it as appropriate for
|
|
// expiration - with CL=QUORUM and (FIXME!) in a way Alternator Streams
|
|
// understands it is an expiration event - not a user-initiated deletion.
|
|
static future<> expire_item(service::storage_proxy& proxy,
|
|
const service::query_state& qs,
|
|
const std::vector<managed_bytes_opt>& row,
|
|
schema_ptr schema,
|
|
api::timestamp_type ts) {
|
|
// Prepare the row key to delete
|
|
// NOTICE: the order of columns is guaranteed by the fact that selection::wildcard
|
|
// is used, which indicates that columns appear in the order defined by
|
|
// schema::all_columns_in_select_order() - partition key columns goes first,
|
|
// immediately followed by clustering key columns
|
|
std::vector<bytes> exploded_pk;
|
|
const unsigned pk_size = schema->partition_key_size();
|
|
const unsigned ck_size = schema->clustering_key_size();
|
|
for (unsigned c = 0; c < pk_size; ++c) {
|
|
const auto& row_c = row[c];
|
|
if (!row_c) {
|
|
// This shouldn't happen - all key columns must have values.
|
|
// But if it ever happens, let's just *not* expire the item.
|
|
// FIXME: log or increment a metric if this happens.
|
|
return make_ready_future<>();
|
|
}
|
|
exploded_pk.push_back(to_bytes(*row_c));
|
|
}
|
|
auto pk = partition_key::from_exploded(exploded_pk);
|
|
mutation m(schema, pk);
|
|
// If there's no clustering key, a tombstone should be created directly
|
|
// on a partition, not on a clustering row - otherwise it will look like
|
|
// an open-ended range tombstone, which will crash on KA/LA sstable format.
|
|
// See issue #6035
|
|
if (ck_size == 0) {
|
|
m.partition().apply(tombstone(ts, gc_clock::now()));
|
|
} else {
|
|
std::vector<bytes> exploded_ck;
|
|
for (unsigned c = pk_size; c < pk_size + ck_size; ++c) {
|
|
const auto& row_c = row[c];
|
|
if (!row_c) {
|
|
// This shouldn't happen - all key columns must have values.
|
|
// But if it ever happens, let's just *not* expire the item.
|
|
// FIXME: log or increment a metric if this happens.
|
|
return make_ready_future<>();
|
|
}
|
|
exploded_ck.push_back(to_bytes(*row_c));
|
|
}
|
|
auto ck = clustering_key::from_exploded(exploded_ck);
|
|
m.partition().clustered_row(*schema, ck).apply(tombstone(ts, gc_clock::now()));
|
|
}
|
|
utils::chunked_vector<mutation> mutations;
|
|
mutations.push_back(std::move(m));
|
|
return proxy.mutate(std::move(mutations),
|
|
db::consistency_level::LOCAL_QUORUM,
|
|
executor::default_timeout(), // FIXME - which timeout?
|
|
qs.get_trace_state(), qs.get_permit(),
|
|
db::allow_per_partition_rate_limit::no,
|
|
false,
|
|
cdc::per_request_options{
|
|
.is_system_originated = true,
|
|
}
|
|
);
|
|
}
|
|
|
|
static size_t random_offset(size_t min, size_t max) {
|
|
static thread_local std::default_random_engine re{std::random_device{}()};
|
|
std::uniform_int_distribution<size_t> dist(min, max);
|
|
return dist(re);
|
|
}
|
|
|
|
// Get a list of secondary token ranges for the given node, and the primary
|
|
// node responsible for each of these token ranges.
|
|
// A "secondary range" is a range of tokens where for each token, the second
|
|
// node (in ring order) out of the RF replicas that hold this token is the
|
|
// given node.
|
|
// In the expiration scanner, we want to scan a secondary range but only if
|
|
// this range's primary node is down. For this we need to return not just
|
|
// a list of this node's secondary ranges - but also the primary owner of
|
|
// each of those ranges.
|
|
//
|
|
// The function is to be used with vnodes only
|
|
static future<std::vector<std::pair<dht::token_range, locator::host_id>>> get_secondary_ranges(
|
|
const locator::effective_replication_map* erm,
|
|
locator::host_id ep) {
|
|
const auto& tm = *erm->get_token_metadata_ptr();
|
|
const auto& sorted_tokens = tm.sorted_tokens();
|
|
std::vector<std::pair<dht::token_range, locator::host_id>> ret;
|
|
throwing_assert(!sorted_tokens.empty());
|
|
auto prev_tok = sorted_tokens.back();
|
|
for (const auto& tok : sorted_tokens) {
|
|
co_await coroutine::maybe_yield();
|
|
// FIXME: pass is_vnode=true to get_natural_replicas since the token is in tm.sorted_tokens()
|
|
host_id_vector_replica_set eps = erm->get_natural_replicas(tok);
|
|
if (eps.size() <= 1 || eps[1] != ep) {
|
|
prev_tok = tok;
|
|
continue;
|
|
}
|
|
// Add the range (prev_tok, tok] to ret. However, if the range wraps
|
|
// around, split it to two non-wrapping ranges.
|
|
if (prev_tok < tok) {
|
|
ret.emplace_back(
|
|
dht::token_range{
|
|
dht::token_range::bound(prev_tok, false),
|
|
dht::token_range::bound(tok, true)},
|
|
eps[0]);
|
|
} else {
|
|
ret.emplace_back(
|
|
dht::token_range{
|
|
dht::token_range::bound(prev_tok, false),
|
|
std::nullopt},
|
|
eps[0]);
|
|
ret.emplace_back(
|
|
dht::token_range{
|
|
std::nullopt,
|
|
dht::token_range::bound(tok, true)},
|
|
eps[0]);
|
|
}
|
|
prev_tok = tok;
|
|
}
|
|
co_return ret;
|
|
}
|
|
|
|
|
|
// A class for iterating over all the token ranges *owned* by this shard.
|
|
// To avoid code duplication, it is a template with two distinct cases -
|
|
// <primary> and <secondary>:
|
|
//
|
|
// In the <primary> case, we consider a token *owned* by this shard if:
|
|
// 1. This node is a replica for this token.
|
|
// 2. Moreover, this node is the *primary* replica of the token (i.e., the
|
|
// first replica in the ring).
|
|
// 3. In this node, this shard is responsible for this token.
|
|
// We will use this definition of which shard in the cluster owns which tokens
|
|
// to split the expiration scanner's work between all the shards of the
|
|
// system.
|
|
//
|
|
// In the <secondary> case, we consider a token *owned* by this shard if:
|
|
// 1. This node is the *secondary* replica for this token (i.e., the second
|
|
// replica in the ring).
|
|
// 2. The primary replica for this token is currently marked down.
|
|
// 3. In this node, this shard is responsible for this token.
|
|
// We use the <secondary> case to handle the possibility that some of the
|
|
// nodes in the system are down. A dead node will not be expiring
|
|
// the tokens owned by it, so we want the secondary owner to take over its
|
|
// primary ranges.
|
|
//
|
|
// FIXME: need to decide how to choose primary ranges in multi-DC setup!
|
|
// We could call get_primary_ranges_within_dc() below instead of get_primary_ranges().
|
|
// NOTICE: Iteration currently starts from a random token range in order to improve
|
|
// the chances of covering all ranges during a scan when restarts occur.
|
|
// A more deterministic way would be to regularly persist the scanning state,
|
|
// but that incurs overhead that we want to avoid if not needed.
|
|
//
|
|
// FIXME: Check if this algorithm is safe with tablet migration.
|
|
// https://github.com/scylladb/scylladb/issues/16567
|
|
|
|
// ranges_holder_primary holds just the primary ranges themselves
|
|
class ranges_holder_primary {
|
|
dht::token_range_vector _token_ranges;
|
|
public:
|
|
explicit ranges_holder_primary(dht::token_range_vector token_ranges) : _token_ranges(std::move(token_ranges)) {}
|
|
static future<ranges_holder_primary> make(const locator::vnode_effective_replication_map* erm, locator::host_id ep) {
|
|
co_return ranges_holder_primary(co_await erm->get_primary_ranges(ep));
|
|
}
|
|
std::size_t size() const { return _token_ranges.size(); }
|
|
const dht::token_range& operator[](std::size_t i) const {
|
|
return _token_ranges[i];
|
|
}
|
|
bool should_skip(std::size_t i) const {
|
|
return false;
|
|
}
|
|
};
|
|
// ranges_holder<secondary> holds the secondary token ranges plus each
|
|
// range's primary owner, needed to implement should_skip().
|
|
class ranges_holder_secondary {
|
|
std::vector<std::pair<dht::token_range, locator::host_id>> _token_ranges;
|
|
const gms::gossiper& _gossiper;
|
|
public:
|
|
explicit ranges_holder_secondary(std::vector<std::pair<dht::token_range, locator::host_id>> token_ranges, const gms::gossiper& g)
|
|
: _token_ranges(std::move(token_ranges))
|
|
, _gossiper(g) {}
|
|
static future<ranges_holder_secondary> make(const locator::vnode_effective_replication_map* erm, locator::host_id ep, const gms::gossiper& g) {
|
|
co_return ranges_holder_secondary(co_await get_secondary_ranges(erm, ep), g);
|
|
}
|
|
std::size_t size() const { return _token_ranges.size(); }
|
|
const dht::token_range& operator[](std::size_t i) const {
|
|
return _token_ranges[i].first;
|
|
}
|
|
// range i should be skipped if its primary owner is alive.
|
|
bool should_skip(std::size_t i) const {
|
|
return _gossiper.is_alive(_token_ranges[i].second);
|
|
}
|
|
};
|
|
|
|
// The token_ranges_owned_by_this_shard class is only used for vnodes, where the vnodes give a partition range for the entire node
|
|
// and such range still needs to be divided between the shards.
|
|
template<class primary_or_secondary_t>
|
|
class token_ranges_owned_by_this_shard {
|
|
schema_ptr _s;
|
|
locator::effective_replication_map_ptr _erm;
|
|
// _token_ranges will contain a list of token ranges owned by this node.
|
|
// We'll further need to split each such range to the pieces owned by
|
|
// the current shard, using _intersecter.
|
|
const primary_or_secondary_t _token_ranges;
|
|
// NOTICE: _range_idx is used modulo _token_ranges size when accessing
|
|
// the data to ensure that it doesn't go out of bounds
|
|
size_t _range_idx;
|
|
size_t _end_idx;
|
|
std::optional<dht::selective_token_range_sharder> _intersecter;
|
|
public:
|
|
token_ranges_owned_by_this_shard(schema_ptr s, primary_or_secondary_t token_ranges)
|
|
: _s(s)
|
|
, _erm(s->table().get_effective_replication_map())
|
|
, _token_ranges(std::move(token_ranges))
|
|
, _range_idx(random_offset(0, _token_ranges.size() - 1))
|
|
, _end_idx(_range_idx + _token_ranges.size())
|
|
{
|
|
tlogger.debug("Generating token ranges starting from base range {} of {}", _range_idx, _token_ranges.size());
|
|
}
|
|
|
|
// Return the next token_range owned by this shard, or nullopt when the
|
|
// iteration ends.
|
|
std::optional<dht::token_range> next() {
|
|
// We may need three or more iterations in the following loop if a
|
|
// vnode doesn't intersect with the given shard at all (such a small
|
|
// vnode is unlikely, but possible). The loop cannot be infinite
|
|
// because each iteration of the loop advances _range_idx.
|
|
for (;;) {
|
|
if (_intersecter) {
|
|
std::optional<dht::token_range> ret = _intersecter->next();
|
|
if (ret) {
|
|
return ret;
|
|
}
|
|
// done with this range, go to next one
|
|
++_range_idx;
|
|
_intersecter = std::nullopt;
|
|
}
|
|
if (_range_idx == _end_idx) {
|
|
return std::nullopt;
|
|
}
|
|
// If should_skip(), the range should be skipped. This happens for
|
|
// a secondary range whose primary owning node is still alive.
|
|
while (_token_ranges.should_skip(_range_idx % _token_ranges.size())) {
|
|
++_range_idx;
|
|
if (_range_idx == _end_idx) {
|
|
return std::nullopt;
|
|
}
|
|
}
|
|
_intersecter.emplace(_erm->get_sharder(*_s), _token_ranges[_range_idx % _token_ranges.size()], this_shard_id());
|
|
}
|
|
}
|
|
|
|
// Same as next(), just return a partition_range instead of token_range
|
|
std::optional<dht::partition_range> next_partition_range() {
|
|
std::optional<dht::token_range> ret = next();
|
|
if (ret) {
|
|
return dht::to_partition_range(*ret);
|
|
} else {
|
|
return std::nullopt;
|
|
}
|
|
}
|
|
};
|
|
|
|
// Precomputed information needed to perform a scan on partition ranges
|
|
struct scan_ranges_context {
|
|
schema_ptr s;
|
|
bytes column_name;
|
|
std::optional<std::string> member;
|
|
|
|
service::client_state internal_client_state;
|
|
::shared_ptr<cql3::selection::selection> selection;
|
|
std::unique_ptr<service::query_state> query_state_ptr;
|
|
std::unique_ptr<cql3::query_options> query_options;
|
|
::lw_shared_ptr<query::read_command> command;
|
|
|
|
scan_ranges_context(schema_ptr s, service::storage_proxy& proxy, bytes column_name, std::optional<std::string> member)
|
|
: s(s)
|
|
, column_name(column_name)
|
|
, member(member)
|
|
, internal_client_state(service::client_state::internal_tag())
|
|
{
|
|
// FIXME: don't read the entire items - read only parts of it.
|
|
// We must read the key columns (to be able to delete) and also
|
|
// the requested attribute. If the requested attribute is a map's
|
|
// member we may be forced to read the entire map - but it would
|
|
// be good if we can read only the single item of the map - it
|
|
// should be possible (and a must for issue #7751!).
|
|
lw_shared_ptr<service::pager::paging_state> paging_state = nullptr;
|
|
auto regular_columns =
|
|
s->regular_columns() | std::views::transform(&column_definition::id)
|
|
| std::ranges::to<query::column_id_vector>();
|
|
selection = cql3::selection::selection::wildcard(s);
|
|
query::partition_slice::option_set opts = selection->get_query_options();
|
|
opts.set<query::partition_slice::option::allow_short_read>();
|
|
// It is important that the scan bypass cache to avoid polluting it:
|
|
opts.set<query::partition_slice::option::bypass_cache>();
|
|
std::vector<query::clustering_range> ck_bounds{query::clustering_range::make_open_ended_both_sides()};
|
|
auto partition_slice = query::partition_slice(std::move(ck_bounds), {}, std::move(regular_columns), opts);
|
|
command = ::make_lw_shared<query::read_command>(s->id(), s->version(), partition_slice, proxy.get_max_result_size(partition_slice), query::tombstone_limit(proxy.get_tombstone_limit()));
|
|
tracing::trace_state_ptr trace_state;
|
|
// NOTICE: empty_service_permit is used because the TTL service has fixed parallelism
|
|
query_state_ptr = std::make_unique<service::query_state>(internal_client_state, trace_state, empty_service_permit());
|
|
// FIXME: What should we do on multi-DC? Will we run the expiration on the same ranges on all
|
|
// DCs or only once for each range? If the latter, we need to change the CLs in the
|
|
// scanner and deleter.
|
|
db::consistency_level cl = db::consistency_level::LOCAL_QUORUM;
|
|
query_options = std::make_unique<cql3::query_options>(cl, std::vector<cql3::raw_value>{});
|
|
query_options = std::make_unique<cql3::query_options>(std::move(query_options), std::move(paging_state));
|
|
}
|
|
};
|
|
|
|
// Scan data in a list of token ranges in one table, looking for expired
|
|
// items and deleting them.
|
|
// Because of issue #9167, partition_ranges must have a single partition
|
|
// range for this code to work correctly.
|
|
static future<> scan_table_ranges(
|
|
service::storage_proxy& proxy,
|
|
const scan_ranges_context& scan_ctx,
|
|
dht::partition_range_vector&& partition_ranges,
|
|
abort_source& abort_source,
|
|
named_semaphore& page_sem,
|
|
expiration_service::stats& expiration_stats)
|
|
{
|
|
const schema_ptr& s = scan_ctx.s;
|
|
throwing_assert(partition_ranges.size() == 1); // otherwise issue #9167 will cause incorrect results.
|
|
auto p = service::pager::query_pagers::pager(proxy, s, scan_ctx.selection, *scan_ctx.query_state_ptr,
|
|
*scan_ctx.query_options, scan_ctx.command, std::move(partition_ranges), nullptr);
|
|
while (!p->is_exhausted()) {
|
|
if (abort_source.abort_requested()) {
|
|
co_return;
|
|
}
|
|
auto units = co_await get_units(page_sem, 1);
|
|
// We don't need to limit page size in number of rows because there is
|
|
// a builtin limit of the page's size in bytes. Setting this limit to
|
|
// 1 is useful for debugging the paging code with moderate-size data.
|
|
uint32_t limit = std::numeric_limits<uint32_t>::max();
|
|
// Read a page, and if that times out, try again after a small sleep.
|
|
// If we didn't catch the timeout exception, it would cause the scan
|
|
// be aborted and only be restarted at the next scanning period.
|
|
// If we retry too many times, give up and restart the scan later.
|
|
std::unique_ptr<cql3::result_set> rs;
|
|
for (int retries=0; ; retries++) {
|
|
try {
|
|
// FIXME: which timeout?
|
|
rs = co_await p->fetch_page(limit, gc_clock::now(), executor::default_timeout());
|
|
break;
|
|
} catch(exceptions::read_timeout_exception&) {
|
|
tlogger.warn("expiration scanner read timed out, will retry: {}",
|
|
std::current_exception());
|
|
}
|
|
// If we didn't break out of this loop, add a minimal sleep
|
|
if (retries >= 10) {
|
|
// Don't get stuck forever asking the same page, maybe there's
|
|
// a bug or a real problem in several replicas. Give up on
|
|
// this scan and retry the scan from a random position later,
|
|
// in the next scan period.
|
|
throw runtime_exception("scanner thread failed after too many timeouts for the same page");
|
|
}
|
|
co_await sleep_abortable(std::chrono::seconds(1), abort_source);
|
|
}
|
|
auto rows = rs->rows();
|
|
auto meta = rs->get_metadata().get_names();
|
|
std::optional<unsigned> expiration_column;
|
|
for (unsigned i = 0; i < meta.size(); i++) {
|
|
const cql3::column_specification& col = *meta[i];
|
|
if (col.name->name() == scan_ctx.column_name) {
|
|
expiration_column = i;
|
|
break;
|
|
}
|
|
}
|
|
if (!expiration_column) {
|
|
continue;
|
|
}
|
|
for (const auto& row : rows) {
|
|
const managed_bytes_opt& cell = row[*expiration_column];
|
|
if (!cell) {
|
|
continue;
|
|
}
|
|
auto v = meta[*expiration_column]->type->deserialize(*cell);
|
|
bool expired = false;
|
|
// FIXME: don't recalculate "now" all the time
|
|
auto now = gc_clock::now();
|
|
if (scan_ctx.member) {
|
|
// In this case, the expiration-time attribute we're
|
|
// looking for is a member in a map, saved serialized
|
|
// into bytes using Alternator's serialization (basically
|
|
// a JSON serialized into bytes)
|
|
// FIXME: is it possible to find a specific member of a map
|
|
// without iterating through it like we do here and compare
|
|
// the key?
|
|
for (const auto& entry : value_cast<map_type_impl::native_type>(v)) {
|
|
std::string attr_name = value_cast<sstring>(entry.first);
|
|
if (value_cast<sstring>(entry.first) == *scan_ctx.member) {
|
|
bytes value = value_cast<bytes>(entry.second);
|
|
rjson::value json = deserialize_item(value);
|
|
expired = is_expired(json, now);
|
|
break;
|
|
}
|
|
}
|
|
} else {
|
|
// For a real column to contain an expiration time, it
|
|
// must be a numeric type. We currently support decimal
|
|
// (used by Alternator TTL) as well as bigint, int and
|
|
// timestamp (used by CQL per-row TTL).
|
|
switch (meta[*expiration_column]->type->get_kind()) {
|
|
case abstract_type::kind::decimal:
|
|
// Used by Alternator TTL for key columns not stored
|
|
// in the map. The value is in seconds, fractional
|
|
// part is ignored.
|
|
expired = is_expired(value_cast<big_decimal>(v), now);
|
|
break;
|
|
case abstract_type::kind::long_kind:
|
|
// Used by CQL per-row TTL. The value is in seconds.
|
|
expired = is_expired(gc_clock::time_point(std::chrono::seconds(value_cast<int64_t>(v))), now);
|
|
break;
|
|
case abstract_type::kind::int32:
|
|
// Used by CQL per-row TTL. The value is in seconds.
|
|
// Using int type is not recommended because it will
|
|
// overflow in 2038, but we support it to allow users
|
|
// to use existing int columns for expiration.
|
|
expired = is_expired(gc_clock::time_point(std::chrono::seconds(value_cast<int32_t>(v))), now);
|
|
break;
|
|
case abstract_type::kind::timestamp:
|
|
// Used by CQL per-row TTL. The value is in milliseconds
|
|
// but we truncate it to gc_clock's precision (whole seconds).
|
|
expired = is_expired(gc_clock::time_point(std::chrono::duration_cast<gc_clock::duration>(value_cast<db_clock::time_point>(v).time_since_epoch())), now);
|
|
break;
|
|
default:
|
|
// Should never happen - we verified the column's type
|
|
// before starting the scan.
|
|
[[unlikely]]
|
|
on_internal_error(tlogger, format("expiration scanner value of unsupported type {} in column {}", meta[*expiration_column]->type->cql3_type_name(), scan_ctx.column_name) );
|
|
}
|
|
}
|
|
if (expired) {
|
|
expiration_stats.items_deleted++;
|
|
// FIXME: maybe don't recalculate new_timestamp() all the time
|
|
// FIXME: if expire_item() throws on timeout, we need to retry it.
|
|
auto ts = api::new_timestamp();
|
|
co_await expire_item(proxy, *scan_ctx.query_state_ptr, row, s, ts);
|
|
}
|
|
}
|
|
// FIXME: once in a while, persist p->state(), so on reboot
|
|
// we don't start from scratch.
|
|
}
|
|
}
|
|
|
|
static future<> scan_tablet(locator::tablet_id tablet, service::storage_proxy& proxy, abort_source& abort_source, named_semaphore& page_sem,
|
|
expiration_service::stats& expiration_stats, const scan_ranges_context& scan_ctx, const locator::tablet_map& tablet_map) {
|
|
auto tablet_token_range = tablet_map.get_token_range(tablet);
|
|
dht::ring_position tablet_start(tablet_token_range.start()->value(), dht::ring_position::token_bound::start),
|
|
tablet_end(tablet_token_range.end()->value(), dht::ring_position::token_bound::end);
|
|
auto partition_range = dht::partition_range::make(std::move(tablet_start), std::move(tablet_end));
|
|
// Note that because of issue #9167 we need to run a separate query on each partition range, and can't pass
|
|
// several of them into one partition_range_vector that is passed to scan_table_ranges().
|
|
return scan_table_ranges(proxy, scan_ctx, {partition_range}, abort_source, page_sem, expiration_stats);
|
|
}
|
|
|
|
// scan_table() scans, in one table, data "owned" by this shard, looking for
|
|
// expired items and deleting them.
|
|
// We consider each node to "own" its primary token ranges, i.e., the tokens
|
|
// that this node is their first replica in the ring. Inside the node, each
|
|
// shard "owns" subranges of the node's token ranges - according to the node's
|
|
// sharding algorithm.
|
|
// When a node goes down, the token ranges owned by it will not be scanned
|
|
// and items in those token ranges will not expire, so in the future (FIXME)
|
|
// this function should additionally work on token ranges whose primary owner
|
|
// is down and this node is the range's secondary owner.
|
|
// If the TTL (expiration-time scanning) feature is not enabled for this
|
|
// table, scan_table() returns false without doing anything. Remember that the
|
|
// TTL feature may be enabled later so this function will need to be called
|
|
// again when the feature is enabled.
|
|
// Currently this function scans the entire table (or, rather the parts owned
|
|
// by this shard) at full rate, once. In the future (FIXME) we should consider
|
|
// how to pace this scan, how and when to repeat it, how to interleave or
|
|
// parallelize scanning of multiple tables, and how to continue scans after a
|
|
// reboot.
|
|
static future<bool> scan_table(
|
|
service::storage_proxy& proxy,
|
|
data_dictionary::database db,
|
|
gms::gossiper& gossiper,
|
|
schema_ptr s,
|
|
abort_source& abort_source,
|
|
named_semaphore& page_sem,
|
|
expiration_service::stats& expiration_stats)
|
|
{
|
|
// Check if an expiration-time attribute is enabled for this table.
|
|
// If not, just return false immediately.
|
|
// FIXME: the setting of the TTL may change in the middle of a long scan!
|
|
std::optional<std::string> attribute_name = db::find_tag(*s, TTL_TAG_KEY);
|
|
if (!attribute_name) {
|
|
co_return false;
|
|
}
|
|
// attribute_name may be one of the schema's columns (in Alternator, this
|
|
// means a key column, in CQL it's a regular column), or an element in
|
|
// Alternator's attrs map encoded in Alternator's JSON encoding (which we
|
|
// decode). If attribute_name is a real column, in Alternator it will have
|
|
// the type decimal, counting seconds since the UNIX epoch, while in CQL
|
|
// it will one of the types bigint or int (counting seconds) or timestamp
|
|
// (counting milliseconds).
|
|
bytes column_name = to_bytes(*attribute_name);
|
|
const column_definition *cd = s->get_column_definition(column_name);
|
|
std::optional<std::string> member;
|
|
if (!cd) {
|
|
member = std::move(attribute_name);
|
|
column_name = bytes(executor::ATTRS_COLUMN_NAME);
|
|
cd = s->get_column_definition(column_name);
|
|
tlogger.info("table {} TTL enabled with attribute {} in {}", s->cf_name(), *member, executor::ATTRS_COLUMN_NAME);
|
|
} else {
|
|
tlogger.info("table {} TTL enabled with attribute {}", s->cf_name(), *attribute_name);
|
|
}
|
|
if (!cd) {
|
|
tlogger.info("table {} TTL column is missing, not scanning", s->cf_name());
|
|
co_return false;
|
|
}
|
|
data_type column_type = cd->type;
|
|
// Verify that the column has the right type: If "member" exists
|
|
// the column must be a map, and if it doesn't, the column must
|
|
// be decimal_type (Alternator), bigint, int or timestamp (CQL).
|
|
// If the column has the wrong type nothing can get expired in
|
|
// this table, and it's pointless to scan it.
|
|
if ((member && column_type->get_kind() != abstract_type::kind::map) ||
|
|
(!member && column_type->get_kind() != abstract_type::kind::decimal &&
|
|
column_type->get_kind() != abstract_type::kind::long_kind &&
|
|
column_type->get_kind() != abstract_type::kind::int32 &&
|
|
column_type->get_kind() != abstract_type::kind::timestamp)) {
|
|
tlogger.info("table {} TTL column has unsupported type, not scanning", s->cf_name());
|
|
co_return false;
|
|
}
|
|
expiration_stats.scan_table++;
|
|
// FIXME: need to pace the scan, not do it all at once.
|
|
scan_ranges_context scan_ctx{s, proxy, std::move(column_name), std::move(member)};
|
|
|
|
if (s->table().uses_tablets()) {
|
|
locator::effective_replication_map_ptr erm = s->table().get_effective_replication_map();
|
|
auto my_host_id = erm->get_topology().my_host_id();
|
|
const auto &tablet_map = erm->get_token_metadata().tablets().get_tablet_map(s->id());
|
|
for (std::optional tablet = tablet_map.first_tablet(); tablet; tablet = tablet_map.next_tablet(*tablet)) {
|
|
auto tablet_primary_replica = tablet_map.get_primary_replica(*tablet, erm->get_topology());
|
|
// check if this is the primary replica for the current tablet
|
|
if (tablet_primary_replica.host == my_host_id && tablet_primary_replica.shard == this_shard_id()) {
|
|
co_await scan_tablet(*tablet, proxy, abort_source, page_sem, expiration_stats, scan_ctx, tablet_map);
|
|
} else if(erm->get_replication_factor() > 1) {
|
|
// Check if this is the secondary replica for the current tablet
|
|
// and if the primary replica is down which means we will take over this work.
|
|
// If each node only scans its own primary ranges, then when any node is
|
|
// down part of the token range will not get scanned. This can be viewed
|
|
// as acceptable (when the comes back online, it will resume its scan),
|
|
// but as noted in issue #9787, we can allow more prompt expiration
|
|
// by tasking another node to take over scanning of the dead node's primary
|
|
// ranges. What we do here is that this node will also check expiration
|
|
// on its *secondary* ranges - but only those whose primary owner is down.
|
|
auto tablet_secondary_replica = tablet_map.get_secondary_replica(*tablet, erm->get_topology()); // throws if no secondary replica
|
|
if (tablet_secondary_replica.host == my_host_id && tablet_secondary_replica.shard == this_shard_id()) {
|
|
if (!gossiper.is_alive(tablet_primary_replica.host)) {
|
|
co_await scan_tablet(*tablet, proxy, abort_source, page_sem, expiration_stats, scan_ctx, tablet_map);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
} else { // VNodes
|
|
locator::static_effective_replication_map_ptr ermp =
|
|
db.real_database().find_keyspace(s->ks_name()).get_static_effective_replication_map();
|
|
auto* erm = ermp->maybe_as_vnode_effective_replication_map();
|
|
if (!erm) {
|
|
on_internal_error(tlogger, format("Keyspace {} is local", s->ks_name()));
|
|
}
|
|
auto my_host_id = erm->get_topology().my_host_id();
|
|
token_ranges_owned_by_this_shard my_ranges(s, co_await ranges_holder_primary::make(erm, my_host_id));
|
|
while (std::optional<dht::partition_range> range = my_ranges.next_partition_range()) {
|
|
// Note that because of issue #9167 we need to run a separate
|
|
// query on each partition range, and can't pass several of
|
|
// them into one partition_range_vector.
|
|
dht::partition_range_vector partition_ranges;
|
|
partition_ranges.push_back(std::move(*range));
|
|
// FIXME: if scanning a single range fails, including network errors,
|
|
// we fail the entire scan (and rescan from the beginning). Need to
|
|
// reconsider this. Saving the scan position might be a good enough
|
|
// solution for this problem.
|
|
co_await scan_table_ranges(proxy, scan_ctx, std::move(partition_ranges), abort_source, page_sem, expiration_stats);
|
|
}
|
|
// If each node only scans its own primary ranges, then when any node is
|
|
// down part of the token range will not get scanned. This can be viewed
|
|
// as acceptable (when the comes back online, it will resume its scan),
|
|
// but as noted in issue #9787, we can allow more prompt expiration
|
|
// by tasking another node to take over scanning of the dead node's primary
|
|
// ranges. What we do here is that this node will also check expiration
|
|
// on its *secondary* ranges - but only those whose primary owner is down.
|
|
token_ranges_owned_by_this_shard my_secondary_ranges(s, co_await ranges_holder_secondary::make(erm, my_host_id, gossiper));
|
|
while (std::optional<dht::partition_range> range = my_secondary_ranges.next_partition_range()) {
|
|
expiration_stats.secondary_ranges_scanned++;
|
|
dht::partition_range_vector partition_ranges;
|
|
partition_ranges.push_back(std::move(*range));
|
|
co_await scan_table_ranges(proxy, scan_ctx, std::move(partition_ranges), abort_source, page_sem, expiration_stats);
|
|
}
|
|
}
|
|
co_return true;
|
|
}
|
|
|
|
|
|
future<> expiration_service::run() {
|
|
// FIXME: don't just tight-loop, think about timing, pace, and
|
|
// store position in durable storage, etc.
|
|
// FIXME: think about working on different tables in parallel.
|
|
// also need to notice when a new table is added, a table is
|
|
// deleted or when ttl is enabled or disabled for a table!
|
|
for (;;) {
|
|
auto start = lowres_clock::now();
|
|
// _db.tables() may change under our feet during a
|
|
// long-living loop, so we must keep our own copy of the list of
|
|
// schemas.
|
|
std::vector<schema_ptr> schemas;
|
|
for (auto cf : _db.get_tables()) {
|
|
schemas.push_back(cf.schema());
|
|
}
|
|
for (schema_ptr s : schemas) {
|
|
co_await coroutine::maybe_yield();
|
|
if (shutting_down()) {
|
|
co_return;
|
|
}
|
|
try {
|
|
co_await scan_table(_proxy, _db, _gossiper, s, _abort_source, _page_sem, _expiration_stats);
|
|
} catch (...) {
|
|
// The scan of a table may fail in the middle for many
|
|
// reasons, including network failure and even the table
|
|
// being removed. We'll continue scanning this table later
|
|
// (if it still exists). In any case it's important to catch
|
|
// the exception and not let the scanning service die for
|
|
// good.
|
|
// If the table has been deleted, it is expected that the scan
|
|
// will fail at some point, and even a warning is excessive.
|
|
if (_db.has_schema(s->ks_name(), s->cf_name())) {
|
|
tlogger.warn("table {}.{} expiration scan failed: {}",
|
|
s->ks_name(), s->cf_name(), std::current_exception());
|
|
} else {
|
|
tlogger.info("expiration scan failed when table {}.{} was deleted",
|
|
s->ks_name(), s->cf_name());
|
|
}
|
|
}
|
|
}
|
|
_expiration_stats.scan_passes++;
|
|
// The TTL scanner runs above once over all tables, at full steam.
|
|
// After completing such a scan, we sleep until it's time start
|
|
// another scan. TODO: If the scan went too fast, we can slow it down
|
|
// in the next iteration by reducing the scanner's scheduling-group
|
|
// share (if using a separate scheduling group), or introduce
|
|
// finer-grain sleeps into the scanning code.
|
|
std::chrono::milliseconds scan_duration(std::chrono::duration_cast<std::chrono::milliseconds>(lowres_clock::now() - start));
|
|
std::chrono::milliseconds period(long(_db.get_config().alternator_ttl_period_in_seconds() * 1000));
|
|
if (scan_duration < period) {
|
|
try {
|
|
tlogger.info("sleeping {} seconds until next period", (period - scan_duration).count()/1000.0);
|
|
co_await seastar::sleep_abortable(period - scan_duration, _abort_source);
|
|
} catch(seastar::sleep_aborted&) {}
|
|
} else {
|
|
tlogger.warn("scan took {} seconds, longer than period - not sleeping", scan_duration.count()/1000.0);
|
|
}
|
|
}
|
|
}
|
|
|
|
future<> expiration_service::start() {
|
|
// Called by main() on each shard to start the expiration-service
|
|
// thread. Just runs run() in the background and allows stop().
|
|
if (!shutting_down()) {
|
|
_end = run().handle_exception([] (std::exception_ptr ep) {
|
|
tlogger.error("expiration_service failed: {}", ep);
|
|
});
|
|
}
|
|
return make_ready_future<>();
|
|
}
|
|
|
|
future<> expiration_service::stop() {
|
|
if (_abort_source.abort_requested()) {
|
|
throw std::logic_error("expiration_service::stop() called a second time");
|
|
}
|
|
_abort_source.request_abort();
|
|
if (!_end) {
|
|
// if _end is was not set, start() was never called
|
|
return make_ready_future<>();
|
|
}
|
|
return std::move(*_end);
|
|
}
|
|
|
|
expiration_service::stats::stats() {
|
|
_metrics.add_group("expiration", {
|
|
seastar::metrics::make_total_operations("scan_passes", scan_passes,
|
|
seastar::metrics::description("number of passes over the database"))(alternator_label).set_skip_when_empty(),
|
|
seastar::metrics::make_total_operations("scan_table", scan_table,
|
|
seastar::metrics::description("number of table scans (counting each scan of each table that enabled expiration)"))(alternator_label).set_skip_when_empty(),
|
|
seastar::metrics::make_total_operations("items_deleted", items_deleted,
|
|
seastar::metrics::description("number of items deleted after expiration"))(basic_level)(alternator_label).set_skip_when_empty(),
|
|
seastar::metrics::make_total_operations("secondary_ranges_scanned", secondary_ranges_scanned,
|
|
seastar::metrics::description("number of token ranges scanned by this node while their primary owner was down"))(alternator_label).set_skip_when_empty(),
|
|
});
|
|
}
|
|
|
|
|
|
} // namespace alternator
|