There's proxy at hand which can provide local gossiper reference Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
822 lines
39 KiB
C++
822 lines
39 KiB
C++
/*
|
|
* Copyright 2021-present ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* SPDX-License-Identifier: AGPL-3.0-or-later
|
|
*/
|
|
|
|
#include <chrono>
|
|
#include <cstdint>
|
|
#include <optional>
|
|
#include <seastar/core/sstring.hh>
|
|
#include <seastar/core/coroutine.hh>
|
|
#include <seastar/core/sleep.hh>
|
|
#include <seastar/core/future.hh>
|
|
#include <seastar/core/lowres_clock.hh>
|
|
#include <seastar/coroutine/maybe_yield.hh>
|
|
#include <boost/multiprecision/cpp_int.hpp>
|
|
|
|
#include "gms/gossiper.hh"
|
|
#include "gms/inet_address.hh"
|
|
#include "inet_address_vectors.hh"
|
|
#include "locator/abstract_replication_strategy.hh"
|
|
#include "log.hh"
|
|
#include "gc_clock.hh"
|
|
#include "replica/database.hh"
|
|
#include "service_permit.hh"
|
|
#include "timestamp.hh"
|
|
#include "service/storage_proxy.hh"
|
|
#include "service/pager/paging_state.hh"
|
|
#include "service/pager/query_pagers.hh"
|
|
#include "gms/feature_service.hh"
|
|
#include "sstables/types.hh"
|
|
#include "mutation.hh"
|
|
#include "types.hh"
|
|
#include "types/map.hh"
|
|
#include "utils/rjson.hh"
|
|
#include "utils/big_decimal.hh"
|
|
#include "utils/fb_utilities.hh"
|
|
#include "cql3/selection/selection.hh"
|
|
#include "cql3/values.hh"
|
|
#include "cql3/query_options.hh"
|
|
#include "cql3/column_identifier.hh"
|
|
#include "alternator/executor.hh"
|
|
#include "alternator/controller.hh"
|
|
#include "alternator/serialization.hh"
|
|
#include "dht/sharder.hh"
|
|
#include "db/config.hh"
|
|
|
|
#include "ttl.hh"
|
|
|
|
// Logger used by the Alternator TTL (item expiration) code in this file.
static logging::logger tlogger("alternator_ttl");
|
|
|
|
namespace alternator {
|
|
|
|
// We write the expiration-time attribute enabled on a table using a
|
|
// tag TTL_TAG_KEY.
|
|
// Currently, the *value* of this tag is simply the name of the attribute,
|
|
// and the expiration scanner interprets it as an Alternator attribute name -
|
|
// It can refer to a real column or if that doesn't exist, to a member of
|
|
// the ":attrs" map column. Although this is designed for Alternator, it may
|
|
// be good enough for CQL as well (there, the ":attrs" column won't exist).
|
|
static const sstring TTL_TAG_KEY("system:ttl_attribute"); // tag value: name of the expiration-time attribute
|
|
|
|
// Handle the UpdateTimeToLive request: enable or disable per-item expiration
// on a table. The setting is stored as the table's TTL_TAG_KEY tag, whose
// value is the name of the expiration-time attribute. On success the reply
// echoes the request's TimeToLiveSpecification; malformed requests, or
// requests that do not match the table's current TTL state, yield an
// api_error.
future<executor::request_return_type> executor::update_time_to_live(client_state& client_state, service_permit permit, rjson::value request) {
    _stats.api_operations.update_time_to_live++;
    if (!_proxy.data_dictionary().features().cluster_supports_alternator_ttl()) {
        co_return api_error::unknown_operation("UpdateTimeToLive not yet supported. Experimental support is available if the 'alternator-ttl' experimental feature is enabled on all nodes.");
    }

    schema_ptr schema = get_table(_proxy, request);

    // Validate the request: an object holding a boolean "Enabled" and a
    // string "AttributeName".
    rjson::value* const ttl_spec = rjson::find(request, "TimeToLiveSpecification");
    if (ttl_spec == nullptr || !ttl_spec->IsObject()) {
        co_return api_error::validation("UpdateTimeToLive missing mandatory TimeToLiveSpecification");
    }
    const rjson::value* const enabled_json = rjson::find(*ttl_spec, "Enabled");
    if (enabled_json == nullptr || !enabled_json->IsBool()) {
        co_return api_error::validation("UpdateTimeToLive requires boolean Enabled");
    }
    const bool enable = enabled_json->GetBool();
    const rjson::value* const name_json = rjson::find(*ttl_spec, "AttributeName");
    if (name_json == nullptr || !name_json->IsString()) {
        co_return api_error::validation("UpdateTimeToLive requires string AttributeName");
    }
    // The DynamoDB documentation says attribute names may be 1 to 64K bytes
    // long, but in practice only 1 to 255 bytes are accepted here. The
    // characters themselves are not restricted.
    const auto name_length = name_json->GetStringLength();
    if (name_length < 1 || name_length > 255) {
        co_return api_error::validation("The length of AttributeName must be between 1 and 255");
    }
    const sstring attribute_name(name_json->GetString(), name_length);

    // Apply the change to a copy of the table's tags.
    std::map<sstring, sstring> tags = get_tags_of_table(schema);
    const auto current = tags.find(TTL_TAG_KEY);
    if (enable) {
        if (current != tags.end()) {
            co_return api_error::validation("TTL is already enabled");
        }
        tags[TTL_TAG_KEY] = attribute_name;
    } else {
        if (current == tags.end()) {
            co_return api_error::validation("TTL is already disabled");
        }
        if (current->second != attribute_name) {
            co_return api_error::validation(format(
                    "Requested to disable TTL on attribute {}, but a different attribute {} is enabled.",
                    attribute_name, current->second));
        }
        tags.erase(current);
    }
    co_await update_tags(_mm, schema, std::move(tags));

    // The response carries a TimeToLiveSpecification that is basically
    // identical to the one in the request, so the request's object is reused.
    rjson::value response = rjson::empty_object();
    rjson::add(response, "TimeToLiveSpecification", std::move(*ttl_spec));
    co_return make_jsonable(std::move(response));
}
|
|
|
|
// Handle the DescribeTimeToLive request: report whether per-item expiration
// is enabled on the requested table and, when it is, the name of the
// expiration-time attribute. The setting is read from the table's
// TTL_TAG_KEY tag. The reply is a JSON object with a single
// "TimeToLiveDescription" member.
future<executor::request_return_type> executor::describe_time_to_live(client_state& client_state, service_permit permit, rjson::value request) {
    _stats.api_operations.describe_time_to_live++;
    if (!_proxy.data_dictionary().features().cluster_supports_alternator_ttl()) {
        // The experimental feature's name is spelled with a hyphen
        // ('alternator-ttl'), as in UpdateTimeToLive's error message.
        co_return api_error::unknown_operation("DescribeTimeToLive not yet supported. Experimental support is available if the 'alternator-ttl' experimental feature is enabled on all nodes.");
    }
    schema_ptr schema = get_table(_proxy, request);
    std::map<sstring, sstring> tags_map = get_tags_of_table(schema);
    rjson::value desc = rjson::empty_object();
    auto i = tags_map.find(TTL_TAG_KEY);
    if (i == tags_map.end()) {
        // No TTL tag on the table: expiration is off.
        rjson::add(desc, "TimeToLiveStatus", "DISABLED");
    } else {
        // The tag's value is the name of the expiration-time attribute.
        rjson::add(desc, "TimeToLiveStatus", "ENABLED");
        rjson::add(desc, "AttributeName", rjson::from_string(i->second));
    }
    rjson::value response = rjson::empty_object();
    rjson::add(response, "TimeToLiveDescription", std::move(desc));
    co_return make_jsonable(std::move(response));
}
|
|
|
|
// expiration_service is a sharded service responsible for cleaning up expired
|
|
// items in all tables with per-item expiration enabled. Currently, this means
|
|
// Alternator tables with TTL configured via an UpdateTimeToLive request.
|
|
//
|
|
// Here is a brief overview of how the expiration service works:
|
|
//
|
|
// An expiration thread on each shard periodically scans the items (i.e.,
|
|
// rows) owned by this shard, looking for items whose chosen expiration-time
|
|
// attribute indicates they are expired, and deletes those items.
|
|
// The expiration-time "attribute" can be either an actual Scylla column
|
|
// (must be numeric) or an Alternator "attribute" - i.e., an element in
|
|
// the ATTRS_COLUMN_NAME map<utf8,bytes> column where the numeric expiration
|
|
// time is encoded in DynamoDB's JSON encoding inside the bytes value.
|
|
// To avoid scanning the same items RF times in RF replicas, only one node is
|
|
// responsible for scanning a token range at a time. Normally, this is the
|
|
// node owning this range as a "primary range" (the first node in the ring
|
|
// with this range), but when this node is down, other nodes may take over
|
|
// (FIXME: this is not implemented yet).
|
|
// An expiration thread is responsible for all tables which need expiration
|
|
// scans. FIXME: explain how this is done with multiple tables - parallel,
|
|
// staggered, or what?
|
|
// The expiration thread scans items using CL=QUORUM to ensure that it reads
|
|
// a consistent expiration-time attribute. This means that the items are read
|
|
// locally and in addition QUORUM-1 additional nodes (one additional node
|
|
// when RF=3) need to read the data and send digests.
|
|
// FIXME: explain if we can read the exact attribute or the entire map.
|
|
// When the expiration thread decides that an item has expired and wants
|
|
// to delete it, it does it using a CL=QUORUM write. This allows this
|
|
// deletion to be visible for consistent (quorum) reads. The deletion,
|
|
// like user deletions, will also appear on the CDC log and therefore
|
|
// Alternator Streams if enabled (FIXME: explain how we mark the
|
|
// deletion different from user deletes. We don't do it yet.).
|
|
// Construct the expiration service. Only records the database handle and the
// storage proxy reference; the constructor body does no other work.
expiration_service::expiration_service(data_dictionary::database db, service::storage_proxy& proxy)
        : _db(db), _proxy(proxy) {}
|
|
|
|
// Convert the big_decimal used to represent expiration time to an integer.
|
|
// Any fractional part is dropped. If the number is negative or invalid,
|
|
// 0 is returned, and if it's too high, the maximum unsigned long is returned.
|
|
static unsigned long bigdecimal_to_ul(const big_decimal& bd) {
|
|
// The big_decimal format has an integer mantissa of arbitrary length
|
|
// "unscaled_value" and then a (power of 10) exponent "scale".
|
|
if (bd.unscaled_value() <= 0) {
|
|
return 0;
|
|
}
|
|
if (bd.scale() == 0) {
|
|
// The fast path, when the expiration time is an integer, scale==0.
|
|
return static_cast<unsigned long>(bd.unscaled_value());
|
|
}
|
|
// Because the mantissa can be of arbitrary length, we work on it
|
|
// as a string. TODO: find a less ugly algorithm.
|
|
auto str = bd.unscaled_value().str();
|
|
if (bd.scale() > 0) {
|
|
int len = str.length();
|
|
if (len < bd.scale()) {
|
|
return 0;
|
|
}
|
|
str = str.substr(0, len-bd.scale());
|
|
} else {
|
|
if (bd.scale() < -20) {
|
|
return std::numeric_limits<unsigned long>::max();
|
|
}
|
|
for (int i = 0; i < -bd.scale(); i++) {
|
|
str.push_back('0');
|
|
}
|
|
}
|
|
// strtoul() returns ULONG_MAX if the number is too large, or 0 if not
|
|
// a number.
|
|
return strtoul(str.c_str(), nullptr, 10);
|
|
}
|
|
|
|
// The following is_expired() functions all check if an item with the given
|
|
// expiration time has expired, according to the DynamoDB API rules.
|
|
// The rules are:
|
|
// 1. If the expiration time attribute's value is not a number type,
|
|
// the item is not expired.
|
|
// 2. The expiration time is measured in seconds since the UNIX epoch.
|
|
// 3. If the expiration time is more than 5 years in the past, it is assumed
|
|
// to be malformed and ignored - and the item does not expire.
|
|
static bool is_expired(gc_clock::time_point expiration_time, gc_clock::time_point now) {
|
|
return expiration_time <= now &&
|
|
expiration_time > now - std::chrono::years(5);
|
|
}
|
|
|
|
static bool is_expired(const big_decimal& expiration_time, gc_clock::time_point now) {
|
|
unsigned long t = bigdecimal_to_ul(expiration_time);
|
|
// We assume - and the assumption turns out to be correct - that the
|
|
// epoch of gc_clock::time_point and the one used by the DynamoDB protocol
|
|
// are the same (the UNIX epoch in UTC). The resolution (seconds) is also
|
|
// the same.
|
|
return is_expired(gc_clock::time_point(gc_clock::duration(std::chrono::seconds(t))), now);
|
|
}
|
|
// Overload for an expiration time given as a JSON value. If
// try_unwrap_number() cannot extract a number from it, the item is not
// expired.
static bool is_expired(const rjson::value& expiration_time, gc_clock::time_point now) {
    const std::optional<big_decimal> number = try_unwrap_number(expiration_time);
    if (!number) {
        return false;
    }
    return is_expired(*number, now);
}
|
|
|
|
// expire_item() expires an item - i.e., deletes it as appropriate for
|
|
// expiration - with CL=QUORUM and (FIXME!) in a way Alternator Streams
|
|
// understands it is an expiration event - not a user-initiated deletion.
|
|
static future<> expire_item(service::storage_proxy& proxy,
|
|
const service::query_state& qs,
|
|
const std::vector<bytes_opt>& row,
|
|
schema_ptr schema,
|
|
api::timestamp_type ts) {
|
|
// Prepare the row key to delete
|
|
// NOTICE: the order of columns is guaranteed by the fact that selection::wildcard
|
|
// is used, which indicates that columns appear in the order defined by
|
|
// schema::all_columns_in_select_order() - partition key columns goes first,
|
|
// immediately followed by clustering key columns
|
|
std::vector<bytes> exploded_pk;
|
|
const unsigned pk_size = schema->partition_key_size();
|
|
const unsigned ck_size = schema->clustering_key_size();
|
|
for (unsigned c = 0; c < pk_size; ++c) {
|
|
const auto& row_c = row[c];
|
|
if (!row_c) {
|
|
// This shouldn't happen - all key columns must have values.
|
|
// But if it ever happens, let's just *not* expire the item.
|
|
// FIXME: log or increment a metric if this happens.
|
|
return make_ready_future<>();
|
|
}
|
|
exploded_pk.push_back(*row_c);
|
|
}
|
|
auto pk = partition_key::from_exploded(exploded_pk);
|
|
mutation m(schema, pk);
|
|
// If there's no clustering key, a tombstone should be created directly
|
|
// on a partition, not on a clustering row - otherwise it will look like
|
|
// an open-ended range tombstone, which will crash on KA/LA sstable format.
|
|
// See issue #6035
|
|
if (ck_size == 0) {
|
|
m.partition().apply(tombstone(ts, gc_clock::now()));
|
|
} else {
|
|
std::vector<bytes> exploded_ck;
|
|
for (unsigned c = pk_size; c < pk_size + ck_size; ++c) {
|
|
const auto& row_c = row[c];
|
|
if (!row_c) {
|
|
// This shouldn't happen - all key columns must have values.
|
|
// But if it ever happens, let's just *not* expire the item.
|
|
// FIXME: log or increment a metric if this happens.
|
|
return make_ready_future<>();
|
|
}
|
|
exploded_ck.push_back(*row_c);
|
|
}
|
|
auto ck = clustering_key::from_exploded(exploded_ck);
|
|
m.partition().clustered_row(*schema, ck).apply(tombstone(ts, gc_clock::now()));
|
|
}
|
|
return proxy.mutate(std::vector<mutation>{std::move(m)},
|
|
db::consistency_level::LOCAL_QUORUM,
|
|
executor::default_timeout(), // FIXME - which timeout?
|
|
qs.get_trace_state(), qs.get_permit());
|
|
}
|
|
|
|
// Return a uniformly distributed random number in the closed range
// [min, max]. The engine is thread-local and is seeded once per thread from
// std::random_device.
static size_t random_offset(size_t min, size_t max) {
    static thread_local std::default_random_engine engine{std::random_device{}()};
    return std::uniform_int_distribution<size_t>{min, max}(engine);
}
|
|
|
|
// Get a list of secondary token ranges for the given node, and the primary
|
|
// node responsible for each of these token ranges.
|
|
// A "secondary range" is a range of tokens where for each token, the second
|
|
// node (in ring order) out of the RF replicas that hold this token is the
|
|
// given node.
|
|
// In the expiration scanner, we want to scan a secondary range but only if
|
|
// this range's primary node is down. For this we need to return not just
|
|
// a list of this node's secondary ranges - but also the primary owner of
|
|
// each of those ranges.
|
|
static std::vector<std::pair<dht::token_range, gms::inet_address>> get_secondary_ranges(
|
|
const locator::effective_replication_map_ptr& erm,
|
|
gms::inet_address ep) {
|
|
const auto& tm = *erm->get_token_metadata_ptr();
|
|
const auto& sorted_tokens = tm.sorted_tokens();
|
|
std::vector<std::pair<dht::token_range, gms::inet_address>> ret;
|
|
if (sorted_tokens.empty()) {
|
|
on_internal_error(tlogger, "Token metadata is empty");
|
|
}
|
|
auto prev_tok = sorted_tokens.back();
|
|
for (const auto& tok : sorted_tokens) {
|
|
inet_address_vector_replica_set eps = erm->get_natural_endpoints(tok);
|
|
if (eps.size() <= 1 || eps[1] != ep) {
|
|
prev_tok = tok;
|
|
continue;
|
|
}
|
|
// Add the range (prev_tok, tok] to ret. However, if the range wraps
|
|
// around, split it to two non-wrapping ranges.
|
|
if (prev_tok < tok) {
|
|
ret.emplace_back(
|
|
dht::token_range{
|
|
dht::token_range::bound(prev_tok, false),
|
|
dht::token_range::bound(tok, true)},
|
|
eps[0]);
|
|
} else {
|
|
ret.emplace_back(
|
|
dht::token_range{
|
|
dht::token_range::bound(prev_tok, false),
|
|
std::nullopt},
|
|
eps[0]);
|
|
ret.emplace_back(
|
|
dht::token_range{
|
|
std::nullopt,
|
|
dht::token_range::bound(tok, true)},
|
|
eps[0]);
|
|
}
|
|
prev_tok = tok;
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
|
|
// A class for iterating over all the token ranges *owned* by this shard.
|
|
// To avoid code duplication, it is a template with two distinct cases -
|
|
// <primary> and <secondary>:
|
|
//
|
|
// In the <primary> case, we consider a token *owned* by this shard if:
|
|
// 1. This node is a replica for this token.
|
|
// 2. Moreover, this node is the *primary* replica of the token (i.e., the
|
|
// first replica in the ring).
|
|
// 3. In this node, this shard is responsible for this token.
|
|
// We will use this definition of which shard in the cluster owns which tokens
|
|
// to split the expiration scanner's work between all the shards of the
|
|
// system.
|
|
//
|
|
// In the <secondary> case, we consider a token *owned* by this shard if:
|
|
// 1. This node is the *secondary* replica for this token (i.e., the second
|
|
// replica in the ring).
|
|
// 2. The primary replica for this token is currently marked down.
|
|
// 3. In this node, this shard is responsible for this token.
|
|
// We use the <secondary> case to handle the possibility that some of the
|
|
// nodes in the system are down. A dead node will not be expiring
|
|
// the tokens owned by it, so we want the secondary owner to take over its
|
|
// primary ranges.
|
|
//
|
|
// FIXME: need to decide how to choose primary ranges in multi-DC setup!
|
|
// We could call get_primary_ranges_within_dc() below instead of get_primary_ranges().
|
|
// NOTICE: Iteration currently starts from a random token range in order to improve
|
|
// the chances of covering all ranges during a scan when restarts occur.
|
|
// A more deterministic way would be to regularly persist the scanning state,
|
|
// but that incurs overhead that we want to avoid if not needed.
|
|
// Selects which flavor of token_ranges_owned_by_this_shard is instantiated:
// iteration over this node's primary ranges or over its secondary ranges.
enum primary_or_secondary_t {primary, secondary};
|
|
template<primary_or_secondary_t primary_or_secondary>
|
|
class token_ranges_owned_by_this_shard {
|
|
// ranges_holder_primary holds just the primary ranges themselves
|
|
class ranges_holder_primary {
|
|
const dht::token_range_vector _token_ranges;
|
|
public:
|
|
ranges_holder_primary(const locator::effective_replication_map_ptr& erm, gms::gossiper& g, gms::inet_address ep)
|
|
: _token_ranges(erm->get_primary_ranges(ep)) {}
|
|
std::size_t size() const { return _token_ranges.size(); }
|
|
const dht::token_range& operator[](std::size_t i) const {
|
|
return _token_ranges[i];
|
|
}
|
|
bool should_skip(std::size_t i) const {
|
|
return false;
|
|
}
|
|
};
|
|
// ranges_holder<secondary> holds the secondary token ranges plus each
|
|
// range's primary owner, needed to implement should_skip().
|
|
class ranges_holder_secondary {
|
|
std::vector<std::pair<dht::token_range, gms::inet_address>> _token_ranges;
|
|
gms::gossiper& _gossiper;
|
|
public:
|
|
ranges_holder_secondary(const locator::effective_replication_map_ptr& erm, gms::gossiper& g, gms::inet_address ep)
|
|
: _token_ranges(get_secondary_ranges(erm, ep))
|
|
, _gossiper(g) {}
|
|
std::size_t size() const { return _token_ranges.size(); }
|
|
const dht::token_range& operator[](std::size_t i) const {
|
|
return _token_ranges[i].first;
|
|
}
|
|
// range i should be skipped if its primary owner is alive.
|
|
bool should_skip(std::size_t i) const {
|
|
return _gossiper.is_alive(_token_ranges[i].second);
|
|
}
|
|
};
|
|
|
|
schema_ptr _s;
|
|
// _token_ranges will contain a list of token ranges owned by this node.
|
|
// We'll further need to split each such range to the pieces owned by
|
|
// the current shard, using _intersecter.
|
|
using ranges_holder = std::conditional_t<
|
|
primary_or_secondary == primary_or_secondary_t::primary,
|
|
ranges_holder_primary,
|
|
ranges_holder_secondary>;
|
|
const ranges_holder _token_ranges;
|
|
// NOTICE: _range_idx is used modulo _token_ranges size when accessing
|
|
// the data to ensure that it doesn't go out of bounds
|
|
size_t _range_idx;
|
|
size_t _end_idx;
|
|
std::optional<dht::selective_token_range_sharder> _intersecter;
|
|
public:
|
|
token_ranges_owned_by_this_shard(replica::database& db, gms::gossiper& g, schema_ptr s)
|
|
: _s(s)
|
|
, _token_ranges(db.find_keyspace(s->ks_name()).get_effective_replication_map(),
|
|
g, utils::fb_utilities::get_broadcast_address())
|
|
, _range_idx(random_offset(0, _token_ranges.size() - 1))
|
|
, _end_idx(_range_idx + _token_ranges.size())
|
|
{
|
|
tlogger.debug("Generating token ranges starting from base range {} of {}", _range_idx, _token_ranges.size());
|
|
}
|
|
|
|
// Return the next token_range owned by this shard, or nullopt when the
|
|
// iteration ends.
|
|
std::optional<dht::token_range> next() {
|
|
// We may need three or more iterations in the following loop if a
|
|
// vnode doesn't intersect with the given shard at all (such a small
|
|
// vnode is unlikely, but possible). The loop cannot be infinite
|
|
// because each iteration of the loop advances _range_idx.
|
|
for (;;) {
|
|
if (_intersecter) {
|
|
std::optional<dht::token_range> ret = _intersecter->next();
|
|
if (ret) {
|
|
return ret;
|
|
}
|
|
// done with this range, go to next one
|
|
++_range_idx;
|
|
_intersecter = std::nullopt;
|
|
}
|
|
if (_range_idx == _end_idx) {
|
|
return std::nullopt;
|
|
}
|
|
// If should_skip(), the range should be skipped. This happens for
|
|
// a secondary range whose primary owning node is still alive.
|
|
while (_token_ranges.should_skip(_range_idx % _token_ranges.size())) {
|
|
++_range_idx;
|
|
if (_range_idx == _end_idx) {
|
|
return std::nullopt;
|
|
}
|
|
}
|
|
_intersecter.emplace(_s->get_sharder(), _token_ranges[_range_idx % _token_ranges.size()], this_shard_id());
|
|
}
|
|
}
|
|
|
|
// Same as next(), just return a partition_range instead of token_range
|
|
std::optional<dht::partition_range> next_partition_range() {
|
|
std::optional<dht::token_range> ret = next();
|
|
if (ret) {
|
|
return dht::to_partition_range(*ret);
|
|
} else {
|
|
return std::nullopt;
|
|
}
|
|
}
|
|
};
|
|
|
|
// Precomputed information needed to perform a scan on partition ranges
|
|
struct scan_ranges_context {
|
|
schema_ptr s;
|
|
bytes column_name;
|
|
std::optional<std::string> member;
|
|
|
|
::shared_ptr<cql3::selection::selection> selection;
|
|
std::unique_ptr<service::query_state> query_state_ptr;
|
|
std::unique_ptr<cql3::query_options> query_options;
|
|
::lw_shared_ptr<query::read_command> command;
|
|
|
|
scan_ranges_context(schema_ptr s, service::storage_proxy& proxy, bytes column_name, std::optional<std::string> member)
|
|
: s(s)
|
|
, column_name(column_name)
|
|
, member(member)
|
|
{
|
|
// FIXME: don't read the entire items - read only parts of it.
|
|
// We must read the key columns (to be able to delete) and also
|
|
// the requested attribute. If the requested attribute is a map's
|
|
// member we may be forced to read the entire map - but it would
|
|
// be good if we can read only the single item of the map - it
|
|
// should be possible (and a must for issue #7751!).
|
|
lw_shared_ptr<service::pager::paging_state> paging_state = nullptr;
|
|
auto regular_columns = boost::copy_range<query::column_id_vector>(
|
|
s->regular_columns() | boost::adaptors::transformed([] (const column_definition& cdef) { return cdef.id; }));
|
|
selection = cql3::selection::selection::wildcard(s);
|
|
query::partition_slice::option_set opts = selection->get_query_options();
|
|
opts.set<query::partition_slice::option::allow_short_read>();
|
|
// It is important that the scan bypass cache to avoid polluting it:
|
|
opts.set<query::partition_slice::option::bypass_cache>();
|
|
std::vector<query::clustering_range> ck_bounds{query::clustering_range::make_open_ended_both_sides()};
|
|
auto partition_slice = query::partition_slice(std::move(ck_bounds), {}, std::move(regular_columns), opts);
|
|
command = ::make_lw_shared<query::read_command>(s->id(), s->version(), partition_slice, proxy.get_max_result_size(partition_slice));
|
|
executor::client_state client_state{executor::client_state::internal_tag()};
|
|
tracing::trace_state_ptr trace_state;
|
|
// NOTICE: empty_service_permit is used because the TTL service has fixed parallelism
|
|
query_state_ptr = std::make_unique<service::query_state>(client_state, trace_state, empty_service_permit());
|
|
// FIXME: What should we do on multi-DC? Will we run the expiration on the same ranges on all
|
|
// DCs or only once for each range? If the latter, we need to change the CLs in the
|
|
// scanner and deleter.
|
|
db::consistency_level cl = db::consistency_level::LOCAL_QUORUM;
|
|
query_options = std::make_unique<cql3::query_options>(cl, std::vector<cql3::raw_value>{});
|
|
query_options = std::make_unique<cql3::query_options>(std::move(query_options), std::move(paging_state));
|
|
}
|
|
};
|
|
|
|
// Scan data in a list of token ranges in one table, looking for expired
|
|
// items and deleting them.
|
|
// Because of issue #9167, partition_ranges must have a single partition
|
|
// range for this code to work correctly.
|
|
static future<> scan_table_ranges(
|
|
service::storage_proxy& proxy,
|
|
const scan_ranges_context& scan_ctx,
|
|
dht::partition_range_vector&& partition_ranges,
|
|
abort_source& abort_source,
|
|
named_semaphore& page_sem,
|
|
expiration_service::stats& expiration_stats)
|
|
{
|
|
const schema_ptr& s = scan_ctx.s;
|
|
assert (partition_ranges.size() == 1); // otherwise issue #9167 will cause incorrect results.
|
|
auto p = service::pager::query_pagers::pager(proxy, s, scan_ctx.selection, *scan_ctx.query_state_ptr,
|
|
*scan_ctx.query_options, scan_ctx.command, std::move(partition_ranges), nullptr);
|
|
while (!p->is_exhausted()) {
|
|
if (abort_source.abort_requested()) {
|
|
co_return;
|
|
}
|
|
auto units = co_await get_units(page_sem, 1);
|
|
// We don't to limit page size in number of rows because there is a
|
|
// builtin limit of the page's size in bytes. Setting this limit to 1
|
|
// is useful for debugging the paging code with moderate-size data.
|
|
uint32_t limit = std::numeric_limits<uint32_t>::max();
|
|
// FIXME: which timeout?
|
|
// FIXME: if read times out, need to retry it.
|
|
std::unique_ptr<cql3::result_set> rs = co_await p->fetch_page(limit, gc_clock::now(), executor::default_timeout());
|
|
auto rows = rs->rows();
|
|
auto meta = rs->get_metadata().get_names();
|
|
std::optional<unsigned> expiration_column;
|
|
for (unsigned i = 0; i < meta.size(); i++) {
|
|
const cql3::column_specification& col = *meta[i];
|
|
if (col.name->name() == scan_ctx.column_name) {
|
|
expiration_column = i;
|
|
break;
|
|
}
|
|
}
|
|
if (!expiration_column) {
|
|
continue;
|
|
}
|
|
for (const auto& row : rows) {
|
|
const bytes_opt& cell = row[*expiration_column];
|
|
if (!cell) {
|
|
continue;
|
|
}
|
|
auto v = meta[*expiration_column]->type->deserialize(*cell);
|
|
bool expired = false;
|
|
// FIXME: don't recalculate "now" all the time
|
|
auto now = gc_clock::now();
|
|
if (scan_ctx.member) {
|
|
// In this case, the expiration-time attribute we're
|
|
// looking for is a member in a map, saved serialized
|
|
// into bytes using Alternator's serialization (basically
|
|
// a JSON serialized into bytes)
|
|
// FIXME: is it possible to find a specific member of a map
|
|
// without iterating through it like we do here and compare
|
|
// the key?
|
|
for (const auto& entry : value_cast<map_type_impl::native_type>(v)) {
|
|
std::string attr_name = value_cast<sstring>(entry.first);
|
|
if (value_cast<sstring>(entry.first) == *scan_ctx.member) {
|
|
bytes value = value_cast<bytes>(entry.second);
|
|
rjson::value json = deserialize_item(value);
|
|
expired = is_expired(json, now);
|
|
break;
|
|
}
|
|
}
|
|
} else {
|
|
// For a real column to contain an expiration time, it
|
|
// must be a numeric type.
|
|
// FIXME: Currently we only support decimal_type (which is
|
|
// what Alternator uses), but other numeric types can be
|
|
// supported as well to make this feature more useful in CQL.
|
|
// Note that kind::decimal is also checked above.
|
|
big_decimal n = value_cast<big_decimal>(v);
|
|
expired = is_expired(n, now);
|
|
}
|
|
if (expired) {
|
|
expiration_stats.items_deleted++;
|
|
// FIXME: maybe don't recalculate new_timestamp() all the time
|
|
// FIXME: if expire_item() throws on timeout, we need to retry it.
|
|
auto ts = api::new_timestamp();
|
|
co_await expire_item(proxy, *scan_ctx.query_state_ptr, row, s, ts);
|
|
}
|
|
}
|
|
// FIXME: once in a while, persist p->state(), so on reboot
|
|
// we don't start from scratch.
|
|
}
|
|
}
|
|
|
|
// scan_table() scans, in one table, data "owned" by this shard, looking for
|
|
// expired items and deleting them.
|
|
// We consider each node to "own" its primary token ranges, i.e., the tokens
|
|
// that this node is their first replica in the ring. Inside the node, each
|
|
// shard "owns" subranges of the node's token ranges - according to the node's
|
|
// sharding algorithm.
|
|
// When a node goes down, the token ranges owned by it will not be scanned
|
|
// and items in those token ranges will not expire, so in the future (FIXME)
|
|
// this function should additionally work on token ranges whose primary owner
|
|
// is down and this node is the range's secondary owner.
|
|
// If the TTL (expiration-time scanning) feature is not enabled for this
|
|
// table, scan_table() returns false without doing anything. Remember that the
|
|
// TTL feature may be enabled later so this function will need to be called
|
|
// again when the feature is enabled.
|
|
// Currently this function scans the entire table (or, rather the parts owned
|
|
// by this shard) at full rate, once. In the future (FIXME) we should consider
|
|
// how to pace this scan, how and when to repeat it, how to interleave or
|
|
// parallelize scanning of multiple tables, and how to continue scans after a
|
|
// reboot.
|
|
static future<bool> scan_table(
|
|
service::storage_proxy& proxy,
|
|
data_dictionary::database db,
|
|
schema_ptr s,
|
|
abort_source& abort_source,
|
|
named_semaphore& page_sem,
|
|
expiration_service::stats& expiration_stats)
|
|
{
|
|
// Check if an expiration-time attribute is enabled for this table.
|
|
// If not, just return false immediately.
|
|
// FIXME: the setting of the TTL may change in the middle of a long scan!
|
|
std::optional<std::string> attribute_name = find_tag(*s, TTL_TAG_KEY);
|
|
if (!attribute_name) {
|
|
co_return false;
|
|
}
|
|
// attribute_name may be one of the schema's columns (in Alternator, this
|
|
// means it's a key column), or an element in Alternator's attrs map
|
|
// encoded in Alternator's JSON encoding.
|
|
// FIXME: To make this less Alternators-specific, we should encode in the
|
|
// single key's value three things:
|
|
// 1. The name of a column
|
|
// 2. Optionally if column is a map, a member in the map
|
|
// 3. The deserializer for the value: CQL or Alternator (JSON).
|
|
// The deserializer can be guessed: If the given column or map item is
|
|
// numeric, it can be used directly. If it is a "bytes" type, it needs to
|
|
// be deserialized using Alternator's deserializer.
|
|
bytes column_name = to_bytes(*attribute_name);
|
|
const column_definition *cd = s->get_column_definition(column_name);
|
|
std::optional<std::string> member;
|
|
if (!cd) {
|
|
member = std::move(attribute_name);
|
|
column_name = bytes(executor::ATTRS_COLUMN_NAME);
|
|
cd = s->get_column_definition(column_name);
|
|
tlogger.info("table {} TTL enabled with attribute {} in {}", s->cf_name(), *member, executor::ATTRS_COLUMN_NAME);
|
|
} else {
|
|
tlogger.info("table {} TTL enabled with attribute {}", s->cf_name(), *attribute_name);
|
|
}
|
|
if (!cd) {
|
|
tlogger.info("table {} TTL column is missing, not scanning", s->cf_name());
|
|
co_return false;
|
|
}
|
|
data_type column_type = cd->type;
|
|
// Verify that the column has the right type: If "member" exists
|
|
// the column must be a map, and if it doesn't, the column must
|
|
// (currently) be a decimal_type. If the column has the wrong type
|
|
// nothing can get expired in this table, and it's pointless to
|
|
// scan it.
|
|
if ((member && column_type->get_kind() != abstract_type::kind::map) ||
|
|
(!member && column_type->get_kind() != abstract_type::kind::decimal)) {
|
|
tlogger.info("table {} TTL column has unsupported type, not scanning", s->cf_name());
|
|
co_return false;
|
|
}
|
|
expiration_stats.scan_table++;
|
|
// FIXME: need to pace the scan, not do it all at once.
|
|
scan_ranges_context scan_ctx{s, proxy, std::move(column_name), std::move(member)};
|
|
token_ranges_owned_by_this_shard<primary> my_ranges(db.real_database(), proxy.gossiper(), s);
|
|
while (std::optional<dht::partition_range> range = my_ranges.next_partition_range()) {
|
|
// Note that because of issue #9167 we need to run a separate
|
|
// query on each partition range, and can't pass several of
|
|
// them into one partition_range_vector.
|
|
dht::partition_range_vector partition_ranges;
|
|
partition_ranges.push_back(std::move(*range));
|
|
// FIXME: if scanning a single range fails, including network errors,
|
|
// we fail the entire scan (and rescan from the beginning). Need to
|
|
// reconsider this. Saving the scan position might be a good enough
|
|
// solution for this problem.
|
|
co_await scan_table_ranges(proxy, scan_ctx, std::move(partition_ranges), abort_source, page_sem, expiration_stats);
|
|
}
|
|
// If each node only scans its own primary ranges, then when any node is
|
|
// down part of the token range will not get scanned. This can be viewed
|
|
// as acceptable (when the node comes back online, it will resume its scan),
|
|
// but as noted in issue #9787, we can allow more prompt expiration
|
|
// by tasking another node to take over scanning of the dead node's primary
|
|
// ranges. What we do here is that this node will also check expiration
|
|
// on its *secondary* ranges - but only those whose primary owner is down.
|
|
token_ranges_owned_by_this_shard<secondary> my_secondary_ranges(db.real_database(), proxy.gossiper(), s);
|
|
while (std::optional<dht::partition_range> range = my_secondary_ranges.next_partition_range()) {
|
|
expiration_stats.secondary_ranges_scanned++;
|
|
dht::partition_range_vector partition_ranges;
|
|
partition_ranges.push_back(std::move(*range));
|
|
co_await scan_table_ranges(proxy, scan_ctx, std::move(partition_ranges), abort_source, page_sem, expiration_stats);
|
|
}
|
|
co_return true;
|
|
}
|
|
|
|
|
|
// The expiration service's main loop. Each iteration ("pass") snapshots the
// schemas of all tables currently known to _db, calls scan_table() on each
// one, bumps the scan_passes counter, and then sleeps for whatever is left
// of the configured alternator_ttl_period_in_seconds before starting the
// next pass. The loop ends (and the returned future resolves) once
// shutting_down() reports true.
future<> expiration_service::run() {
    // FIXME: don't just tight-loop, think about timing, pace, and
    // store position in durable storage, etc.
    // FIXME: think about working on different tables in parallel.
    // also need to notice when a new table is added, a table is
    // deleted or when ttl is enabled or disabled for a table!
    //
    // Shutdown is re-checked at the top of every pass, and not only inside
    // the per-table loop below: the sleep at the end of a pass swallows
    // sleep_aborted, so if the list of tables happened to be empty, nothing
    // else would ever notice the abort and this loop would spin forever,
    // leaving stop() waiting on a future that never resolves.
    while (!shutting_down()) {
        auto start = lowres_clock::now();
        // _db.tables() may change under our feet during a
        // long-living loop, so we must keep our own copy of the list of
        // schemas.
        std::vector<schema_ptr> schemas;
        for (auto cf : _db.get_tables()) {
            schemas.push_back(cf.schema());
        }
        for (schema_ptr s : schemas) {
            co_await coroutine::maybe_yield();
            if (shutting_down()) {
                co_return;
            }
            try {
                co_await scan_table(_proxy, _db, s, _abort_source, _page_sem, _expiration_stats);
            } catch (...) {
                // The scan of a table may fail in the middle for many
                // reasons, including network failure and even the table
                // being removed. We'll continue scanning this table later
                // (if it still exists). In any case it's important to catch
                // the exception and not let the scanning service die for
                // good.
                // If the table has been deleted, it is expected that the scan
                // will fail at some point, and even a warning is excessive.
                if (_db.has_schema(s->ks_name(), s->cf_name())) {
                    tlogger.warn("table {}.{} expiration scan failed: {}",
                        s->ks_name(), s->cf_name(), std::current_exception());
                } else {
                    tlogger.info("expiration scan failed when table {}.{} was deleted",
                        s->ks_name(), s->cf_name());
                }
            }
        }
        _expiration_stats.scan_passes++;
        // The TTL scanner runs above once over all tables, at full steam.
        // After completing such a scan, we sleep until it's time to start
        // another scan. TODO: If the scan went too fast, we can slow it down
        // in the next iteration by reducing the scanner's scheduling-group
        // share (if using a separate scheduling group), or introduce
        // finer-grain sleeps into the scanning code.
        std::chrono::seconds scan_duration(std::chrono::duration_cast<std::chrono::seconds>(lowres_clock::now() - start));
        std::chrono::seconds period(_db.get_config().alternator_ttl_period_in_seconds());
        if (scan_duration < period) {
            try {
                tlogger.info("sleeping {} seconds until next period", (period - scan_duration).count());
                co_await seastar::sleep_abortable(period - scan_duration, _abort_source);
            } catch(seastar::sleep_aborted&) {
                // The sleep was interrupted through _abort_source. Nothing to
                // do here; the loop condition decides whether to exit.
            }
        }
    }
}
|
|
|
|
// Called by main() on each shard to start the expiration-service thread.
// When the cluster supports the Alternator TTL feature and shutdown has not
// been requested, run() is launched in the background and its future is
// kept in _end so that stop() can wait for it. The returned future is
// always ready immediately.
future<> expiration_service::start() {
    const bool can_run = _db.features().cluster_supports_alternator_ttl() && !shutting_down();
    if (!can_run) {
        return make_ready_future<>();
    }
    // An exception escaping run() is only logged, so the future stored in
    // _end never carries a failure.
    auto log_failure = [] (std::exception_ptr ep) {
        tlogger.error("expiration_service failed: {}", ep);
    };
    _end = run().handle_exception(std::move(log_failure));
    return make_ready_future<>();
}
|
|
|
|
// Requests the background scanner to stop and returns a future for its
// completion. Throws std::logic_error if an abort was already requested on
// _abort_source, i.e. stop() must not be called twice.
future<> expiration_service::stop() {
    const bool already_stopping = _abort_source.abort_requested();
    if (already_stopping) {
        throw std::logic_error("expiration_service::stop() called a second time");
    }
    _abort_source.request_abort();
    // _end is only set when start() actually launched run(); if it is
    // empty there is no background work to wait for.
    return _end ? std::move(*_end) : make_ready_future<>();
}
|
|
|
|
// Registers the expiration scanner's counters (scan_passes, scan_table,
// items_deleted, secondary_ranges_scanned) under the "expiration" metrics
// group.
expiration_service::stats::stats() {
    namespace sm = seastar::metrics;
    _metrics.add_group("expiration", {
        sm::make_total_operations(
            "scan_passes",
            scan_passes,
            sm::description("number of passes over the database")),
        sm::make_total_operations(
            "scan_table",
            scan_table,
            sm::description("number of table scans (counting each scan of each table that enabled expiration)")),
        sm::make_total_operations(
            "items_deleted",
            items_deleted,
            sm::description("number of items deleted after expiration")),
        sm::make_total_operations(
            "secondary_ranges_scanned",
            secondary_ranges_scanned,
            sm::description("number of token ranges scanned by this node while their primary owner was down")),
    });
}
|
|
|
|
|
|
} // namespace alternator
|