Compare commits

copilot/in...copilot/ad (145 commits)
@@ -43,7 +43,7 @@ For further information, please see:

[developer documentation]: HACKING.md
[build documentation]: docs/dev/building.md
[docker image build documentation]: dist/docker/debian/README.md
[docker image build documentation]: dist/docker/redhat/README.md

## Running Scylla

@@ -7,6 +7,7 @@
 */

#include <fmt/ranges.h>
#include <cstdlib>
#include <seastar/core/on_internal_error.hh>
#include "alternator/executor.hh"
#include "alternator/consumed_capacity.hh"
@@ -108,6 +109,16 @@ const sstring TABLE_CREATION_TIME_TAG_KEY("system:table_creation_time");
// configured by UpdateTimeToLive to be the expiration-time attribute for
// this table.
extern const sstring TTL_TAG_KEY("system:ttl_attribute");
// If this tag is present, it stores the name of an attribute whose numeric
// value (in microseconds since the Unix epoch) is used as the write timestamp
// for PutItem and UpdateItem operations. When the named attribute is present
// in a PutItem or UpdateItem request, its value is used as the timestamp of
// the write, and the attribute itself is NOT stored in the item. This allows
// users to control write ordering for last-write-wins semantics. Because LWT
// does not allow setting a custom write timestamp, operations using this
// feature are incompatible with conditions (which require LWT), and with
// the LWT_ALWAYS write isolation mode; such operations are rejected.
static const sstring TIMESTAMP_TAG_KEY("system:timestamp_attribute");
// This will be set to 1 in the case where the user DID NOT specify a range key.
// The way GSI / LSI is implemented by Alternator assumes user-specified keys come first
// in the materialized view's key list. Then, if needed, missing keys are added (current implementation
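A small, hypothetical illustration of the value the tagged attribute carries (the attribute name and the table's `system:timestamp_attribute` tag are whatever the table was configured with; the computation below is plain standard C++, not Alternator code):

```cpp
#include <chrono>
#include <cstdint>
#include <iostream>

int main() {
    using namespace std::chrono;
    // The custom write timestamp: plain microseconds since the Unix epoch.
    int64_t write_ts_us = duration_cast<microseconds>(
        system_clock::now().time_since_epoch()).count();
    // A client would send this as a DynamoDB number, e.g. {"N": "1712345678901234"},
    // under the attribute name stored in the system:timestamp_attribute tag.
    std::cout << write_ts_us << "\n";
    return 0;
}
```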
@@ -1337,13 +1348,14 @@ void rmw_operation::set_default_write_isolation(std::string_view value) {
// Alternator uses tags whose keys start with the "system:" prefix for
// internal purposes. Those should not be readable by ListTagsOfResource,
// nor writable with TagResource or UntagResource (see #24098).
// Only a few specific system tags, currently only "system:write_isolation"
// and "system:initial_tablets", are deliberately intended to be set and read
// by the user, so are not considered "internal".
// Only a few specific system tags, currently only "system:write_isolation",
// "system:initial_tablets", and "system:timestamp_attribute", are deliberately
// intended to be set and read by the user, so are not considered "internal".
static bool tag_key_is_internal(std::string_view tag_key) {
    return tag_key.starts_with("system:")
        && tag_key != rmw_operation::WRITE_ISOLATION_TAG_KEY
        && tag_key != INITIAL_TABLETS_TAG_KEY;
        && tag_key != INITIAL_TABLETS_TAG_KEY
        && tag_key != TIMESTAMP_TAG_KEY;
}

enum class update_tags_action { add_tags, delete_tags };
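A minimal, self-contained sketch of the predicate's intended behavior (tag-key constants inlined as string literals for illustration):

```cpp
#include <cassert>
#include <string_view>

static bool tag_key_is_internal_sketch(std::string_view tag_key) {
    return tag_key.starts_with("system:")
        && tag_key != "system:write_isolation"
        && tag_key != "system:initial_tablets"
        && tag_key != "system:timestamp_attribute";
}

int main() {
    assert(!tag_key_is_internal_sketch("owner"));                       // plain user tag
    assert(!tag_key_is_internal_sketch("system:timestamp_attribute"));  // user-settable system tag
    assert(tag_key_is_internal_sketch("system:ttl_attribute"));         // internal-only system tag
    return 0;
}
```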
@@ -2298,8 +2310,11 @@ public:
// After calling pk_from_json() and ck_from_json() to extract the pk and ck
// components of a key, and if that succeeded, call check_key() to further
// check that the key doesn't have any spurious components.
static void check_key(const rjson::value& key, const schema_ptr& schema) {
    if (key.MemberCount() != (schema->clustering_key_size() == 0 ? 1 : 2)) {
// allow_extra_attribute: set to true when the key may contain one extra
// non-key attribute (e.g., the timestamp pseudo-attribute for DeleteItem).
static void check_key(const rjson::value& key, const schema_ptr& schema, bool allow_extra_attribute = false) {
    const unsigned expected = (schema->clustering_key_size() == 0 ? 1 : 2) + (allow_extra_attribute ? 1 : 0);
    if (key.MemberCount() != expected) {
        throw api_error::validation("Given key attribute not in schema");
    }
}
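The member-count arithmetic in isolation, as a standalone illustration (not the Alternator types):

```cpp
#include <cassert>

// How many members the "Key" object may contain: the partition key, an
// optional clustering (range) key, and optionally one extra pseudo-attribute.
static unsigned expected_key_members(bool has_range_key, bool allow_extra_attribute) {
    return (has_range_key ? 2 : 1) + (allow_extra_attribute ? 1 : 0);
}

int main() {
    assert(expected_key_members(false, false) == 1); // hash-only table
    assert(expected_key_members(true,  false) == 2); // hash + range key
    assert(expected_key_members(true,  true)  == 3); // key plus timestamp pseudo-attribute
    return 0;
}
```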
@@ -2346,6 +2361,57 @@ void validate_value(const rjson::value& v, const char* caller) {
// any writing happens (if one of the commands has an error, none of the
// writes should be done). LWT makes it impossible for the parse step to
// generate "mutation" objects, because the timestamp still isn't known.

// Convert a DynamoDB number (big_decimal) to an api::timestamp_type
// (microseconds since the Unix epoch). Fractional microseconds are truncated.
// Returns nullopt if the value is negative or zero.
static std::optional<api::timestamp_type> bigdecimal_to_timestamp(const big_decimal& bd) {
    if (bd.unscaled_value() <= 0) {
        return std::nullopt;
    }
    if (bd.scale() == 0) {
        // Fast path: integer value, no decimal adjustment needed
        return static_cast<api::timestamp_type>(bd.unscaled_value());
    }
    // General case: adjust for decimal scale.
    // big_decimal stores value as unscaled_value * 10^(-scale).
    // scale > 0 means divide by 10^scale (truncate fractional part).
    // scale < 0 means multiply by 10^|scale| (add trailing zeros).
    auto str = bd.unscaled_value().str();
    if (bd.scale() > 0) {
        int len = str.length();
        if (len <= bd.scale()) {
            return std::nullopt; // Number < 1
        }
        str = str.substr(0, len - bd.scale());
    } else {
        if (bd.scale() < -18) {
            // Too large to represent as int64_t
            return std::nullopt;
        }
        for (int i = 0; i < -bd.scale(); i++) {
            str.push_back('0');
        }
    }
    long long result = strtoll(str.c_str(), nullptr, 10);
    if (result <= 0) {
        return std::nullopt;
    }
    return static_cast<api::timestamp_type>(result);
}

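The same scale adjustment, as a standalone sketch on plain strings (a hypothetical helper, not the Alternator code; the big_decimal representation is assumed to be as described in the comment above):

```cpp
#include <cassert>
#include <cstdlib>
#include <string>

// value = unscaled * 10^(-scale); return the integer (truncated) part, or 0 if < 1.
static long long scaled_to_integer(std::string unscaled, int scale) {
    if (scale > 0) {
        if (unscaled.length() <= static_cast<size_t>(scale)) {
            return 0; // the whole value is below 1
        }
        unscaled = unscaled.substr(0, unscaled.length() - scale); // drop fractional digits
    } else {
        unscaled.append(-scale, '0'); // multiply by 10^|scale|
    }
    return std::strtoll(unscaled.c_str(), nullptr, 10);
}

int main() {
    assert(scaled_to_integer("1712345678901234", 0) == 1712345678901234LL);     // integer value
    assert(scaled_to_integer("17123456789012345", 1) == 1712345678901234LL);    // 1712345678901234.5, truncated
    assert(scaled_to_integer("1712345678901234", -2) == 171234567890123400LL);  // trailing zeros added
    assert(scaled_to_integer("5", 3) == 0);                                     // 0.005 is below 1
    return 0;
}
```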
// Try to extract a write timestamp from a DynamoDB-typed value.
|
||||
// The value should be a number ({"N": "..."}), representing microseconds
|
||||
// since the Unix epoch. Returns nullopt if the value is not a valid number
|
||||
// or doesn't represent a valid timestamp.
|
||||
static std::optional<api::timestamp_type> try_get_timestamp(const rjson::value& attr_value) {
|
||||
std::optional<big_decimal> n = try_unwrap_number(attr_value);
|
||||
if (!n) {
|
||||
return std::nullopt;
|
||||
}
|
||||
return bigdecimal_to_timestamp(*n);
|
||||
}
|
||||
|
||||
class put_or_delete_item {
|
||||
private:
|
||||
partition_key _pk;
|
||||
@@ -2361,11 +2427,17 @@ private:
    // that length can have a different meaning depending on the operation, but
    // the calculation of length in bytes to WCU is the same.
    uint64_t _length_in_bytes = 0;
    // If the table has a system:timestamp_attribute tag, and the named
    // attribute was found in the item with a valid numeric value, this holds
    // the extracted timestamp. The attribute is not added to _cells.
    std::optional<api::timestamp_type> _custom_timestamp;
public:
|
||||
struct delete_item {};
|
||||
struct put_item {};
|
||||
put_or_delete_item(const rjson::value& key, schema_ptr schema, delete_item);
|
||||
put_or_delete_item(const rjson::value& item, schema_ptr schema, put_item, std::unordered_map<bytes, std::string> key_attributes);
|
||||
put_or_delete_item(const rjson::value& key, schema_ptr schema, delete_item,
|
||||
const std::optional<bytes>& timestamp_attribute = std::nullopt);
|
||||
put_or_delete_item(const rjson::value& item, schema_ptr schema, put_item, std::unordered_map<bytes, std::string> key_attributes,
|
||||
const std::optional<bytes>& timestamp_attribute = std::nullopt);
|
||||
// put_or_delete_item doesn't keep a reference to schema (so it can be
|
||||
// moved between shards for LWT) so it needs to be given again to build():
|
||||
mutation build(schema_ptr schema, api::timestamp_type ts) const;
|
||||
@@ -2380,11 +2452,32 @@ public:
|
||||
bool is_put_item() noexcept {
|
||||
return _cells.has_value();
|
||||
}
|
||||
// Returns the custom write timestamp extracted from the timestamp attribute,
|
||||
// if any. If not set, the caller should use api::new_timestamp() instead.
|
||||
std::optional<api::timestamp_type> custom_timestamp() const noexcept {
|
||||
return _custom_timestamp;
|
||||
}
|
||||
};
|
||||
|
||||
put_or_delete_item::put_or_delete_item(const rjson::value& key, schema_ptr schema, delete_item)
put_or_delete_item::put_or_delete_item(const rjson::value& key, schema_ptr schema, delete_item, const std::optional<bytes>& timestamp_attribute)
    : _pk(pk_from_json(key, schema)), _ck(ck_from_json(key, schema)) {
    check_key(key, schema);
    if (timestamp_attribute) {
        // The timestamp attribute may be provided as a "pseudo-key": it is
        // not a real key column, but can be included in the "Key" object to
        // carry the custom write timestamp. If found, extract the timestamp
        // and don't store it in the item.
        const rjson::value* ts_val = rjson::find(key, to_string_view(*timestamp_attribute));
        if (ts_val) {
            if (auto t = try_get_timestamp(*ts_val)) {
                _custom_timestamp = t;
            } else {
                throw api_error::validation(fmt::format(
                    "The '{}' attribute used as a write timestamp must be a positive number (microseconds since epoch)",
                    to_string_view(*timestamp_attribute)));
            }
        }
    }
    check_key(key, schema, _custom_timestamp.has_value());
}

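To illustrate the pseudo-key shape, a hypothetical DeleteItem Key payload (the attribute name "write_ts" and the key name "p" are only examples of what the tag and schema could define):

```cpp
// A DeleteItem "Key" object for a hash-only table whose partition key is "p".
// The extra "write_ts" member is consumed as the write timestamp (microseconds
// since the Unix epoch) and is not treated as a key column or stored.
const char* example_delete_key = R"({
    "p":        {"S": "some-partition-key"},
    "write_ts": {"N": "1712345678901234"}
})";
```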
// find_attribute() checks whether the named attribute is stored in the
|
||||
@@ -2471,7 +2564,8 @@ static inline void validate_value_if_index_key(
|
||||
}
|
||||
}
|
||||
|
||||
put_or_delete_item::put_or_delete_item(const rjson::value& item, schema_ptr schema, put_item, std::unordered_map<bytes, std::string> key_attributes)
|
||||
put_or_delete_item::put_or_delete_item(const rjson::value& item, schema_ptr schema, put_item, std::unordered_map<bytes, std::string> key_attributes,
|
||||
const std::optional<bytes>& timestamp_attribute)
|
||||
: _pk(pk_from_json(item, schema)), _ck(ck_from_json(item, schema)) {
|
||||
_cells = std::vector<cell>();
|
||||
_cells->reserve(item.MemberCount());
|
||||
@@ -2480,6 +2574,17 @@ put_or_delete_item::put_or_delete_item(const rjson::value& item, schema_ptr sche
|
||||
validate_value(it->value, "PutItem");
|
||||
const column_definition* cdef = find_attribute(*schema, column_name);
|
||||
validate_attr_name_length("", column_name.size(), cdef && cdef->is_primary_key());
|
||||
// If this is the timestamp attribute, it must be a valid numeric value
|
||||
// (microseconds since epoch). Use it as the write timestamp and do not
|
||||
// store it in the item data. Reject the write if the value is non-numeric.
|
||||
if (timestamp_attribute && column_name == *timestamp_attribute) {
|
||||
if (auto t = try_get_timestamp(it->value)) {
|
||||
_custom_timestamp = t;
|
||||
// The attribute is consumed as timestamp, not stored in _cells.
|
||||
continue;
|
||||
}
|
||||
throw api_error::validation(fmt::format("The '{}' attribute used as a write timestamp must be a positive number (microseconds since epoch)", to_string_view(*timestamp_attribute)));
|
||||
}
|
||||
_length_in_bytes += column_name.size();
|
||||
if (!cdef) {
|
||||
// This attribute may be a key column of one of the GSI or LSI,
|
||||
@@ -2671,6 +2776,13 @@ rmw_operation::rmw_operation(service::storage_proxy& proxy, rjson::value&& reque
|
||||
// _pk and _ck will be assigned later, by the subclass's constructor
|
||||
// (each operation puts the key in a slightly different location in
|
||||
// the request).
|
||||
const auto tags_ptr = db::get_tags_of_table(_schema);
|
||||
if (tags_ptr) {
|
||||
auto it = tags_ptr->find(TIMESTAMP_TAG_KEY);
|
||||
if (it != tags_ptr->end() && !it->second.empty()) {
|
||||
_timestamp_attribute = to_bytes(it->second);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::optional<mutation> rmw_operation::apply(foreign_ptr<lw_shared_ptr<query::result>> qr, const query::partition_slice& slice, api::timestamp_type ts, cdc::per_request_options& cdc_opts) {
|
||||
@@ -2815,6 +2927,21 @@ future<executor::request_return_type> rmw_operation::execute(service::storage_pr
        .alternator = true,
        .alternator_streams_increased_compatibility = schema()->cdc_options().enabled() && proxy.data_dictionary().get_config().alternator_streams_increased_compatibility(),
    };
    // If the operation uses a custom write timestamp (from the
    // system:timestamp_attribute tag), LWT is incompatible because LWT
    // requires the timestamp to be set by the Paxos protocol. Reject the
    // operation if it would need to use LWT.
    if (has_custom_timestamp()) {
        bool would_use_lwt = _write_isolation == write_isolation::LWT_ALWAYS ||
            (needs_read_before_write &&
             _write_isolation != write_isolation::FORBID_RMW &&
             _write_isolation != write_isolation::UNSAFE_RMW);
        if (would_use_lwt) {
            throw api_error::validation(
                "Using the system:timestamp_attribute is not compatible with "
                "conditional writes or the 'always' write isolation policy.");
        }
    }
    if (needs_read_before_write) {
        if (_write_isolation == write_isolation::FORBID_RMW) {
            throw api_error::validation("Read-modify-write operations are disabled by 'forbid_rmw' write isolation policy. Refer to https://github.com/scylladb/scylla/blob/master/docs/alternator/alternator.md#write-isolation-policies for more information.");
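A compact restatement of that decision with stand-in names (the real write_isolation enum and its semantics are defined earlier in this file; these enumerators are illustrative):

```cpp
#include <cassert>

enum class isolation { FORBID_RMW, UNSAFE_RMW, LWT_RMW_ONLY, LWT_ALWAYS };

// The operation would go through LWT either because the policy always uses it,
// or because it needs a read-before-write and the policy routes RMW through LWT.
static bool would_use_lwt(isolation wi, bool needs_read_before_write) {
    return wi == isolation::LWT_ALWAYS ||
           (needs_read_before_write &&
            wi != isolation::FORBID_RMW &&
            wi != isolation::UNSAFE_RMW);
}

int main() {
    assert(would_use_lwt(isolation::LWT_ALWAYS, false));    // always LWT: rejected with a custom timestamp
    assert(would_use_lwt(isolation::LWT_RMW_ONLY, true));   // a condition forces RMW through LWT: rejected
    assert(!would_use_lwt(isolation::LWT_RMW_ONLY, false)); // plain write: allowed
    assert(!would_use_lwt(isolation::UNSAFE_RMW, true));    // RMW without LWT: allowed
    return 0;
}
```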
@@ -2913,7 +3040,8 @@ public:
|
||||
put_item_operation(parsed::expression_cache& parsed_expression_cache, service::storage_proxy& proxy, rjson::value&& request)
|
||||
: rmw_operation(proxy, std::move(request))
|
||||
, _mutation_builder(rjson::get(_request, "Item"), schema(), put_or_delete_item::put_item{},
|
||||
si_key_attributes(proxy.data_dictionary().find_table(schema()->ks_name(), schema()->cf_name()))) {
|
||||
si_key_attributes(proxy.data_dictionary().find_table(schema()->ks_name(), schema()->cf_name())),
|
||||
_timestamp_attribute) {
|
||||
_pk = _mutation_builder.pk();
|
||||
_ck = _mutation_builder.ck();
|
||||
if (_returnvalues != returnvalues::NONE && _returnvalues != returnvalues::ALL_OLD) {
|
||||
@@ -2945,6 +3073,9 @@ public:
|
||||
check_needs_read_before_write(_condition_expression) ||
|
||||
_returnvalues == returnvalues::ALL_OLD;
|
||||
}
|
||||
bool has_custom_timestamp() const noexcept {
|
||||
return _mutation_builder.custom_timestamp().has_value();
|
||||
}
|
||||
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts, cdc::per_request_options& cdc_opts) const override {
|
||||
if (!verify_expected(_request, previous_item.get()) ||
|
||||
!verify_condition_expression(_condition_expression, previous_item.get())) {
|
||||
@@ -2962,7 +3093,10 @@ public:
|
||||
} else {
|
||||
_return_attributes = {};
|
||||
}
|
||||
return _mutation_builder.build(_schema, ts);
|
||||
// Use the custom timestamp from the timestamp attribute if available,
|
||||
// otherwise use the provided timestamp.
|
||||
api::timestamp_type effective_ts = _mutation_builder.custom_timestamp().value_or(ts);
|
||||
return _mutation_builder.build(_schema, effective_ts);
|
||||
}
|
||||
virtual ~put_item_operation() = default;
|
||||
};
|
||||
@@ -3014,7 +3148,7 @@ public:
|
||||
parsed::condition_expression _condition_expression;
|
||||
delete_item_operation(parsed::expression_cache& parsed_expression_cache, service::storage_proxy& proxy, rjson::value&& request)
|
||||
: rmw_operation(proxy, std::move(request))
|
||||
, _mutation_builder(rjson::get(_request, "Key"), schema(), put_or_delete_item::delete_item{}) {
|
||||
, _mutation_builder(rjson::get(_request, "Key"), schema(), put_or_delete_item::delete_item{}, _timestamp_attribute) {
|
||||
_pk = _mutation_builder.pk();
|
||||
_ck = _mutation_builder.ck();
|
||||
if (_returnvalues != returnvalues::NONE && _returnvalues != returnvalues::ALL_OLD) {
|
||||
@@ -3045,6 +3179,9 @@ public:
|
||||
check_needs_read_before_write(_condition_expression) ||
|
||||
_returnvalues == returnvalues::ALL_OLD;
|
||||
}
|
||||
bool has_custom_timestamp() const noexcept override {
|
||||
return _mutation_builder.custom_timestamp().has_value();
|
||||
}
|
||||
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts, cdc::per_request_options& cdc_opts) const override {
|
||||
if (!verify_expected(_request, previous_item.get()) ||
|
||||
!verify_condition_expression(_condition_expression, previous_item.get())) {
|
||||
@@ -3065,7 +3202,10 @@ public:
|
||||
if (_consumed_capacity._total_bytes == 0) {
|
||||
_consumed_capacity._total_bytes = 1;
|
||||
}
|
||||
return _mutation_builder.build(_schema, ts);
|
||||
// Use the custom timestamp from the timestamp attribute if available,
|
||||
// otherwise use the provided timestamp.
|
||||
api::timestamp_type effective_ts = _mutation_builder.custom_timestamp().value_or(ts);
|
||||
return _mutation_builder.build(_schema, effective_ts);
|
||||
}
|
||||
virtual ~delete_item_operation() = default;
|
||||
};
|
||||
@@ -3252,10 +3392,13 @@ future<> executor::do_batch_write(
|
||||
// Do a normal write, without LWT:
|
||||
utils::chunked_vector<mutation> mutations;
|
||||
mutations.reserve(mutation_builders.size());
|
||||
api::timestamp_type now = api::new_timestamp();
|
||||
api::timestamp_type default_ts = api::new_timestamp();
|
||||
bool any_cdc_enabled = false;
|
||||
for (auto& b : mutation_builders) {
|
||||
mutations.push_back(b.second.build(b.first, now));
|
||||
// Use custom timestamp from the timestamp attribute if available,
|
||||
// otherwise use the default timestamp for all items in this batch.
|
||||
api::timestamp_type ts = b.second.custom_timestamp().value_or(default_ts);
|
||||
mutations.push_back(b.second.build(b.first, ts));
|
||||
any_cdc_enabled |= b.first->cdc_options().enabled();
|
||||
}
|
||||
return _proxy.mutate(std::move(mutations),
|
||||
@@ -3355,6 +3498,16 @@ future<executor::request_return_type> executor::batch_write_item(client_state& c
|
||||
|
||||
std::unordered_set<primary_key, primary_key_hash, primary_key_equal> used_keys(
|
||||
1, primary_key_hash{schema}, primary_key_equal{schema});
|
||||
// Look up the timestamp attribute tag once per table (shared by all
|
||||
// PutRequests and DeleteRequests for this table).
|
||||
std::optional<bytes> ts_attr;
|
||||
const auto tags_ptr = db::get_tags_of_table(schema);
|
||||
if (tags_ptr) {
|
||||
auto tag_it = tags_ptr->find(TIMESTAMP_TAG_KEY);
|
||||
if (tag_it != tags_ptr->end() && !tag_it->second.empty()) {
|
||||
ts_attr = to_bytes(tag_it->second);
|
||||
}
|
||||
}
|
||||
for (auto& request : it->value.GetArray()) {
|
||||
auto& r = get_single_member(request, "RequestItems element");
|
||||
const auto r_name = rjson::to_string_view(r.name);
|
||||
@@ -3363,7 +3516,8 @@ future<executor::request_return_type> executor::batch_write_item(client_state& c
|
||||
validate_is_object(item, "Item in PutRequest");
|
||||
auto&& put_item = put_or_delete_item(
|
||||
item, schema, put_or_delete_item::put_item{},
|
||||
si_key_attributes(_proxy.data_dictionary().find_table(schema->ks_name(), schema->cf_name())));
|
||||
si_key_attributes(_proxy.data_dictionary().find_table(schema->ks_name(), schema->cf_name())),
|
||||
ts_attr);
|
||||
mutation_builders.emplace_back(schema, std::move(put_item));
|
||||
auto mut_key = std::make_pair(mutation_builders.back().second.pk(), mutation_builders.back().second.ck());
|
||||
if (used_keys.contains(mut_key)) {
|
||||
@@ -3374,7 +3528,7 @@ future<executor::request_return_type> executor::batch_write_item(client_state& c
|
||||
const rjson::value& key = get_member(r.value, "Key", "DeleteRequest");
|
||||
validate_is_object(key, "Key in DeleteRequest");
|
||||
mutation_builders.emplace_back(schema, put_or_delete_item(
|
||||
key, schema, put_or_delete_item::delete_item{}));
|
||||
key, schema, put_or_delete_item::delete_item{}, ts_attr));
|
||||
auto mut_key = std::make_pair(mutation_builders.back().second.pk(),
|
||||
mutation_builders.back().second.ck());
|
||||
if (used_keys.contains(mut_key)) {
|
||||
@@ -3983,6 +4137,10 @@ public:
|
||||
virtual ~update_item_operation() = default;
|
||||
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts, cdc::per_request_options& cdc_opts) const override;
|
||||
bool needs_read_before_write() const;
|
||||
// Returns true if the timestamp attribute is being set in this update
|
||||
// (via AttributeUpdates PUT or UpdateExpression SET). Used to detect
|
||||
// whether a custom write timestamp will be used.
|
||||
bool has_custom_timestamp() const noexcept;
|
||||
|
||||
private:
|
||||
void delete_attribute(bytes&& column_name, const std::unique_ptr<rjson::value>& previous_item, const api::timestamp_type ts, deletable_row& row,
|
||||
@@ -4117,6 +4275,44 @@ update_item_operation::needs_read_before_write() const {
|
||||
(_returnvalues != returnvalues::NONE && _returnvalues != returnvalues::UPDATED_NEW);
|
||||
}
|
||||
|
||||
bool
|
||||
update_item_operation::has_custom_timestamp() const noexcept {
|
||||
if (!_timestamp_attribute) {
|
||||
return false;
|
||||
}
|
||||
// Check if the timestamp attribute is being set via AttributeUpdates PUT
|
||||
// with a valid numeric value.
|
||||
if (_attribute_updates) {
|
||||
std::string_view ts_attr = to_string_view(*_timestamp_attribute);
|
||||
for (auto it = _attribute_updates->MemberBegin(); it != _attribute_updates->MemberEnd(); ++it) {
|
||||
if (rjson::to_string_view(it->name) == ts_attr) {
|
||||
const rjson::value* action = rjson::find(it->value, "Action");
|
||||
if (action && rjson::to_string_view(*action) == "PUT" && it->value.HasMember("Value")) {
|
||||
// Only consider it a custom timestamp if the value is numeric
|
||||
if (try_get_timestamp((it->value)["Value"])) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Check if the timestamp attribute is being set via UpdateExpression SET.
|
||||
// We can't check the actual value type without resolving the expression
|
||||
// (which requires previous_item), so we conservatively return true if the
|
||||
// attribute appears in a SET action, and handle the non-numeric case in apply().
|
||||
// A non-numeric value will cause apply() to throw a ValidationException.
|
||||
if (!_update_expression.empty()) {
|
||||
std::string ts_attr(to_string_view(*_timestamp_attribute));
|
||||
auto it = _update_expression.find(ts_attr);
|
||||
if (it != _update_expression.end() && it->second.has_value()) {
|
||||
const auto& action = it->second.get_value();
|
||||
return std::holds_alternative<parsed::update_expression::action::set>(action._action);
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// action_result() returns the result of applying an UpdateItem action -
|
||||
// this result is either a JSON object or an unset optional which indicates
|
||||
// the action was a deletion. The caller (update_item_operation::apply()
|
||||
@@ -4392,6 +4588,17 @@ inline void update_item_operation::apply_attribute_updates(const std::unique_ptr
|
||||
throw api_error::validation(format("UpdateItem cannot update key column {}", rjson::to_string_view(it->name)));
|
||||
}
|
||||
std::string action = rjson::to_string((it->value)["Action"]);
|
||||
// If this is the timestamp attribute being PUT, it must be a valid
|
||||
// numeric value (microseconds since epoch). Use it as the write
|
||||
// timestamp and skip storing it. Reject if the value is non-numeric.
|
||||
if (_timestamp_attribute && column_name == *_timestamp_attribute && action == "PUT") {
|
||||
if (it->value.HasMember("Value")) {
|
||||
if (try_get_timestamp((it->value)["Value"])) {
|
||||
continue;
|
||||
}
|
||||
throw api_error::validation(fmt::format("The '{}' attribute used as a write timestamp must be a positive number (microseconds since epoch)", to_string_view(*_timestamp_attribute)));
|
||||
}
|
||||
}
|
||||
if (action == "DELETE") {
|
||||
// The DELETE operation can do two unrelated tasks. Without a
|
||||
// "Value" option, it is used to delete an attribute. With a
|
||||
@@ -4495,6 +4702,20 @@ inline void update_item_operation::apply_update_expression(const std::unique_ptr
|
||||
if (cdef && cdef->is_primary_key()) {
|
||||
throw api_error::validation(fmt::format("UpdateItem cannot update key column {}", column_name));
|
||||
}
|
||||
// If this is the timestamp attribute being set via UpdateExpression SET,
|
||||
// it must be a valid numeric value (microseconds since epoch). Use it as
|
||||
// the write timestamp and skip storing it. Reject if non-numeric.
|
||||
if (_timestamp_attribute && to_bytes(column_name) == *_timestamp_attribute &&
|
||||
actions.second.has_value() &&
|
||||
std::holds_alternative<parsed::update_expression::action::set>(actions.second.get_value()._action)) {
|
||||
std::optional<rjson::value> result = action_result(actions.second.get_value(), previous_item.get());
|
||||
if (result) {
|
||||
if (try_get_timestamp(*result)) {
|
||||
continue; // Skip - already used as timestamp
|
||||
}
|
||||
throw api_error::validation(fmt::format("The '{}' attribute used as a write timestamp must be a positive number (microseconds since epoch)", to_string_view(*_timestamp_attribute)));
|
||||
}
|
||||
}
|
||||
if (actions.second.has_value()) {
|
||||
// An action on a top-level attribute column_name. The single
|
||||
// action is actions.second.get_value(). We can simply invoke
|
||||
@@ -4543,6 +4764,44 @@ std::optional<mutation> update_item_operation::apply(std::unique_ptr<rjson::valu
|
||||
return {};
|
||||
}
|
||||
|
||||
// If the table has a timestamp attribute, look for it in the update
|
||||
// (AttributeUpdates PUT or UpdateExpression SET). If found with a valid
|
||||
// numeric value, use it as the write timestamp instead of the provided ts.
|
||||
api::timestamp_type effective_ts = ts;
|
||||
if (_timestamp_attribute) {
|
||||
bool found_ts = false;
|
||||
if (_attribute_updates) {
|
||||
std::string_view ts_attr = to_string_view(*_timestamp_attribute);
|
||||
for (auto it = _attribute_updates->MemberBegin(); it != _attribute_updates->MemberEnd(); ++it) {
|
||||
if (rjson::to_string_view(it->name) == ts_attr) {
|
||||
const rjson::value* action = rjson::find(it->value, "Action");
|
||||
if (action && rjson::to_string_view(*action) == "PUT" && it->value.HasMember("Value")) {
|
||||
if (auto t = try_get_timestamp((it->value)["Value"])) {
|
||||
effective_ts = *t;
|
||||
found_ts = true;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!found_ts && !_update_expression.empty()) {
|
||||
std::string ts_attr(to_string_view(*_timestamp_attribute));
|
||||
auto it = _update_expression.find(ts_attr);
|
||||
if (it != _update_expression.end() && it->second.has_value()) {
|
||||
const auto& action = it->second.get_value();
|
||||
if (std::holds_alternative<parsed::update_expression::action::set>(action._action)) {
|
||||
std::optional<rjson::value> result = action_result(action, previous_item.get());
|
||||
if (result) {
|
||||
if (auto t = try_get_timestamp(*result)) {
|
||||
effective_ts = *t;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// In the ReturnValues=ALL_NEW case, we make a copy of previous_item into
|
||||
// _return_attributes and parts of it will be overwritten by the new
|
||||
// updates (in do_update() and do_delete()). We need to make a copy and
|
||||
@@ -4571,10 +4830,10 @@ std::optional<mutation> update_item_operation::apply(std::unique_ptr<rjson::valu
|
||||
auto& row = m.partition().clustered_row(*_schema, _ck);
|
||||
auto modified_attrs = attribute_collector();
|
||||
if (!_update_expression.empty()) {
|
||||
apply_update_expression(previous_item, ts, row, modified_attrs, any_updates, any_deletes);
|
||||
apply_update_expression(previous_item, effective_ts, row, modified_attrs, any_updates, any_deletes);
|
||||
}
|
||||
if (_attribute_updates) {
|
||||
apply_attribute_updates(previous_item, ts, row, modified_attrs, any_updates, any_deletes);
|
||||
apply_attribute_updates(previous_item, effective_ts, row, modified_attrs, any_updates, any_deletes);
|
||||
}
|
||||
if (!modified_attrs.empty()) {
|
||||
auto serialized_map = modified_attrs.to_mut().serialize(*attrs_type());
|
||||
@@ -4585,7 +4844,7 @@ std::optional<mutation> update_item_operation::apply(std::unique_ptr<rjson::valu
|
||||
// marker. An update with only DELETE operations must not add a row marker
|
||||
// (this was issue #5862) but any other update, even an empty one, should.
|
||||
if (any_updates || !any_deletes) {
|
||||
row.apply(row_marker(ts));
|
||||
row.apply(row_marker(effective_ts));
|
||||
} else if (_returnvalues == returnvalues::ALL_NEW && !previous_item) {
|
||||
// There was no pre-existing item, and we're not creating one, so
|
||||
// don't report the new item in the returned Attributes.
|
||||
|
||||
@@ -18,6 +18,7 @@
|
||||
#include "executor.hh"
|
||||
#include "tracing/trace_state.hh"
|
||||
#include "keys/keys.hh"
|
||||
#include "bytes.hh"
|
||||
|
||||
namespace alternator {
|
||||
|
||||
@@ -72,6 +73,11 @@ protected:
|
||||
clustering_key _ck = clustering_key::make_empty();
|
||||
write_isolation _write_isolation;
|
||||
mutable wcu_consumed_capacity_counter _consumed_capacity;
|
||||
// If the table has a "system:timestamp_attribute" tag, this holds the
|
||||
// name of the attribute (converted to bytes) whose numeric value should
|
||||
// be used as the write timestamp instead of the current time. The
|
||||
// attribute itself is NOT stored in the item data.
|
||||
std::optional<bytes> _timestamp_attribute;
|
||||
// All RMW operations can have a ReturnValues parameter from the following
|
||||
// choices. But note that only UpdateItem actually supports all of them:
|
||||
enum class returnvalues {
|
||||
@@ -113,6 +119,9 @@ public:
|
||||
// Convert the above apply() into the signature needed by cas_request:
|
||||
virtual std::optional<mutation> apply(foreign_ptr<lw_shared_ptr<query::result>> qr, const query::partition_slice& slice, api::timestamp_type ts, cdc::per_request_options& cdc_opts) override;
|
||||
virtual ~rmw_operation() = default;
|
||||
// Returns true if the operation will use a custom write timestamp (from the
|
||||
// system:timestamp_attribute tag). Subclasses override this as needed.
|
||||
virtual bool has_custom_timestamp() const noexcept { return false; }
|
||||
const wcu_consumed_capacity_counter& consumed_capacity() const noexcept { return _consumed_capacity; }
|
||||
schema_ptr schema() const { return _schema; }
|
||||
const rjson::value& request() const { return _request; }
|
||||
|
||||
@@ -767,7 +767,7 @@ static future<bool> scan_table(
|
||||
// by tasking another node to take over scanning of the dead node's primary
|
||||
// ranges. What we do here is that this node will also check expiration
|
||||
// on its *secondary* ranges - but only those whose primary owner is down.
|
||||
auto tablet_secondary_replica = tablet_map.get_secondary_replica(*tablet); // throws if no secondary replica
|
||||
auto tablet_secondary_replica = tablet_map.get_secondary_replica(*tablet, erm->get_topology()); // throws if no secondary replica
|
||||
if (tablet_secondary_replica.host == my_host_id && tablet_secondary_replica.shard == this_shard_id()) {
|
||||
if (!gossiper.is_alive(tablet_primary_replica.host)) {
|
||||
co_await scan_tablet(*tablet, proxy, abort_source, page_sem, expiration_stats, scan_ctx, tablet_map);
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
"operations":[
|
||||
{
|
||||
"method":"POST",
|
||||
"summary":"Reset cache",
|
||||
"summary":"Resets authorized prepared statements cache",
|
||||
"type":"void",
|
||||
"nickname":"authorization_cache_reset",
|
||||
"produces":[
|
||||
|
||||
api/api.hh
@@ -23,31 +23,6 @@
|
||||
|
||||
namespace api {
|
||||
|
||||
template<class T>
|
||||
std::vector<T> map_to_key_value(const std::map<sstring, sstring>& map) {
|
||||
std::vector<T> res;
|
||||
res.reserve(map.size());
|
||||
|
||||
for (const auto& [key, value] : map) {
|
||||
res.push_back(T());
|
||||
res.back().key = key;
|
||||
res.back().value = value;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
template<class T, class MAP>
|
||||
std::vector<T>& map_to_key_value(const MAP& map, std::vector<T>& res) {
|
||||
res.reserve(res.size() + std::size(map));
|
||||
|
||||
for (const auto& [key, value] : map) {
|
||||
T val;
|
||||
val.key = fmt::to_string(key);
|
||||
val.value = fmt::to_string(value);
|
||||
res.push_back(val);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
template <typename T, typename S = T>
|
||||
T map_sum(T&& dest, const S& src) {
|
||||
for (const auto& i : src) {
|
||||
|
||||
@@ -536,13 +536,15 @@ void unset_sstables_loader(http_context& ctx, routes& r) {
|
||||
}
|
||||
|
||||
void set_view_builder(http_context& ctx, routes& r, sharded<db::view::view_builder>& vb, sharded<gms::gossiper>& g) {
|
||||
ss::view_build_statuses.set(r, [&ctx, &vb, &g] (std::unique_ptr<http::request> req) {
|
||||
ss::view_build_statuses.set(r, [&ctx, &vb, &g] (std::unique_ptr<http::request> req) -> future<json::json_return_type> {
|
||||
auto keyspace = validate_keyspace(ctx, req);
|
||||
auto view = req->get_path_param("view");
|
||||
return vb.local().view_build_statuses(std::move(keyspace), std::move(view), g.local()).then([] (std::unordered_map<sstring, sstring> status) {
|
||||
std::vector<storage_service_json::mapper> res;
|
||||
return make_ready_future<json::json_return_type>(map_to_key_value(std::move(status), res));
|
||||
});
|
||||
co_return json::json_return_type(stream_range_as_array(co_await vb.local().view_build_statuses(std::move(keyspace), std::move(view), g.local()), [] (const auto& i) {
|
||||
storage_service_json::mapper res;
|
||||
res.key = i.first;
|
||||
res.value = i.second;
|
||||
return res;
|
||||
}));
|
||||
});
|
||||
|
||||
cf::get_built_indexes.set(r, [&vb](std::unique_ptr<http::request> req) -> future<json::json_return_type> {
|
||||
@@ -580,6 +582,16 @@ static future<json::json_return_type> describe_ring_as_json_for_table(const shar
|
||||
co_return json::json_return_type(stream_range_as_array(co_await ss.local().describe_ring_for_table(keyspace, table), token_range_endpoints_to_json));
|
||||
}
|
||||
|
||||
namespace {
|
||||
template <typename Key, typename Value>
|
||||
storage_service_json::mapper map_to_json(const std::pair<Key, Value>& i) {
|
||||
storage_service_json::mapper val;
|
||||
val.key = fmt::to_string(i.first);
|
||||
val.value = fmt::to_string(i.second);
|
||||
return val;
|
||||
}
|
||||
}
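For context, a self-contained sketch of what this helper buys (mapper here is a stand-in for the generated storage_service_json::mapper, and the function is renamed to make clear it is an illustration, not the same symbol):

```cpp
#include <fmt/format.h>
#include <string>
#include <utility>

// Stand-in for the generated storage_service_json::mapper type.
struct mapper { std::string key; std::string value; };

template <typename Key, typename Value>
mapper pair_to_mapper(const std::pair<Key, Value>& i) {
    return mapper{fmt::to_string(i.first), fmt::to_string(i.second)};
}

int main() {
    auto m = pair_to_mapper(std::pair<int, double>{42, 0.5});
    // Any formattable key/value pair becomes one JSON key/value entry, so the
    // call sites below can pass the helper directly to stream_range_as_array()
    // instead of repeating the same lambda at every endpoint.
    return (m.key == "42" && m.value == "0.5") ? 0 : 1;
}
```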
|
||||
|
||||
static
|
||||
future<json::json_return_type>
|
||||
rest_get_token_endpoint(http_context& ctx, sharded<service::storage_service>& ss, std::unique_ptr<http::request> req) {
|
||||
@@ -597,12 +609,7 @@ rest_get_token_endpoint(http_context& ctx, sharded<service::storage_service>& ss
|
||||
throw bad_param_exception("Either provide both keyspace and table (for tablet table) or neither (for vnodes)");
|
||||
}
|
||||
|
||||
co_return json::json_return_type(stream_range_as_array(token_endpoints, [](const auto& i) {
|
||||
storage_service_json::mapper val;
|
||||
val.key = fmt::to_string(i.first);
|
||||
val.value = fmt::to_string(i.second);
|
||||
return val;
|
||||
}));
|
||||
co_return json::json_return_type(stream_range_as_array(token_endpoints, &map_to_json<dht::token, gms::inet_address>));
|
||||
}
|
||||
|
||||
static
|
||||
@@ -686,7 +693,6 @@ rest_get_range_to_endpoint_map(http_context& ctx, sharded<service::storage_servi
|
||||
table_id = validate_table(ctx.db.local(), keyspace, table);
|
||||
}
|
||||
|
||||
std::vector<ss::maplist_mapper> res;
|
||||
co_return stream_range_as_array(co_await ss.local().get_range_to_address_map(keyspace, table_id),
|
||||
[](const std::pair<dht::token_range, inet_address_vector_replica_set>& entry){
|
||||
ss::maplist_mapper m;
|
||||
@@ -1317,10 +1323,7 @@ rest_get_ownership(http_context& ctx, sharded<service::storage_service>& ss, std
|
||||
throw httpd::bad_param_exception("storage_service/ownership cannot be used when a keyspace uses tablets");
|
||||
}
|
||||
|
||||
return ss.local().get_ownership().then([] (auto&& ownership) {
|
||||
std::vector<storage_service_json::mapper> res;
|
||||
return make_ready_future<json::json_return_type>(map_to_key_value(ownership, res));
|
||||
});
|
||||
co_return json::json_return_type(stream_range_as_array(co_await ss.local().get_ownership(), &map_to_json<gms::inet_address, float>));
|
||||
}
|
||||
|
||||
static
|
||||
@@ -1337,10 +1340,7 @@ rest_get_effective_ownership(http_context& ctx, sharded<service::storage_service
|
||||
}
|
||||
}
|
||||
|
||||
return ss.local().effective_ownership(keyspace_name, table_name).then([] (auto&& ownership) {
|
||||
std::vector<storage_service_json::mapper> res;
|
||||
return make_ready_future<json::json_return_type>(map_to_key_value(ownership, res));
|
||||
});
|
||||
co_return json::json_return_type(stream_range_as_array(co_await ss.local().effective_ownership(keyspace_name, table_name), &map_to_json<gms::inet_address, float>));
|
||||
}
|
||||
|
||||
static
|
||||
@@ -1350,7 +1350,7 @@ rest_estimate_compression_ratios(http_context& ctx, sharded<service::storage_ser
|
||||
apilog.warn("estimate_compression_ratios: called before the cluster feature was enabled");
|
||||
throw std::runtime_error("estimate_compression_ratios requires all nodes to support the SSTABLE_COMPRESSION_DICTS cluster feature");
|
||||
}
|
||||
auto ticket = get_units(ss.local().get_do_sample_sstables_concurrency_limiter(), 1);
|
||||
auto ticket = co_await get_units(ss.local().get_do_sample_sstables_concurrency_limiter(), 1);
|
||||
auto ks = api::req_param<sstring>(*req, "keyspace", {}).value;
|
||||
auto cf = api::req_param<sstring>(*req, "cf", {}).value;
|
||||
apilog.debug("estimate_compression_ratios: called with ks={} cf={}", ks, cf);
|
||||
@@ -1416,7 +1416,7 @@ rest_retrain_dict(http_context& ctx, sharded<service::storage_service>& ss, serv
|
||||
apilog.warn("retrain_dict: called before the cluster feature was enabled");
|
||||
throw std::runtime_error("retrain_dict requires all nodes to support the SSTABLE_COMPRESSION_DICTS cluster feature");
|
||||
}
|
||||
auto ticket = get_units(ss.local().get_do_sample_sstables_concurrency_limiter(), 1);
|
||||
auto ticket = co_await get_units(ss.local().get_do_sample_sstables_concurrency_limiter(), 1);
|
||||
auto ks = api::req_param<sstring>(*req, "keyspace", {}).value;
|
||||
auto cf = api::req_param<sstring>(*req, "cf", {}).value;
|
||||
apilog.debug("retrain_dict: called with ks={} cf={}", ks, cf);
|
||||
|
||||
@@ -17,7 +17,6 @@ target_sources(scylla_auth
|
||||
password_authenticator.cc
|
||||
passwords.cc
|
||||
permission.cc
|
||||
permissions_cache.cc
|
||||
resource.cc
|
||||
role_or_anonymous.cc
|
||||
roles-metadata.cc
|
||||
|
||||
auth/cache.cc
@@ -8,6 +8,7 @@
|
||||
|
||||
#include "auth/cache.hh"
|
||||
#include "auth/common.hh"
|
||||
#include "auth/role_or_anonymous.hh"
|
||||
#include "auth/roles-metadata.hh"
|
||||
#include "cql3/query_processor.hh"
|
||||
#include "cql3/untyped_result_set.hh"
|
||||
@@ -18,6 +19,8 @@
|
||||
#include <seastar/core/abort_source.hh>
|
||||
#include <seastar/coroutine/maybe_yield.hh>
|
||||
#include <seastar/core/format.hh>
|
||||
#include <seastar/core/metrics.hh>
|
||||
#include <seastar/core/do_with.hh>
|
||||
|
||||
namespace auth {
|
||||
|
||||
@@ -27,7 +30,21 @@ cache::cache(cql3::query_processor& qp, abort_source& as) noexcept
|
||||
: _current_version(0)
|
||||
, _qp(qp)
|
||||
, _loading_sem(1)
|
||||
, _as(as) {
|
||||
, _as(as)
|
||||
, _permission_loader(nullptr)
|
||||
, _permission_loader_sem(8) {
|
||||
namespace sm = seastar::metrics;
|
||||
_metrics.add_group("auth_cache", {
|
||||
sm::make_gauge("roles", [this] { return _roles.size(); },
|
||||
sm::description("Number of roles currently cached")),
|
||||
sm::make_gauge("permissions", [this] {
|
||||
return _cached_permissions_count;
|
||||
}, sm::description("Total number of permission sets currently cached across all roles"))
|
||||
});
|
||||
}
|
||||
|
||||
void cache::set_permission_loader(permission_loader_func loader) {
|
||||
_permission_loader = std::move(loader);
|
||||
}
|
||||
|
||||
lw_shared_ptr<const cache::role_record> cache::get(const role_name_t& role) const noexcept {
|
||||
@@ -38,6 +55,83 @@ lw_shared_ptr<const cache::role_record> cache::get(const role_name_t& role) cons
|
||||
return it->second;
|
||||
}
|
||||
|
||||
future<permission_set> cache::get_permissions(const role_or_anonymous& role, const resource& r) {
|
||||
std::unordered_map<resource, permission_set>* perms_cache;
|
||||
lw_shared_ptr<role_record> role_ptr;
|
||||
|
||||
if (is_anonymous(role)) {
|
||||
perms_cache = &_anonymous_permissions;
|
||||
} else {
|
||||
const auto& role_name = *role.name;
|
||||
auto role_it = _roles.find(role_name);
|
||||
if (role_it == _roles.end()) {
|
||||
// Role might have been deleted but there are some connections
|
||||
// left which reference it. They should no longer have access to anything.
|
||||
return make_ready_future<permission_set>(permissions::NONE);
|
||||
}
|
||||
role_ptr = role_it->second;
|
||||
perms_cache = &role_ptr->cached_permissions;
|
||||
}
|
||||
|
||||
if (auto it = perms_cache->find(r); it != perms_cache->end()) {
|
||||
return make_ready_future<permission_set>(it->second);
|
||||
}
|
||||
// Keep role_ptr alive, as it holds the map that perms_cache points into (except for the anonymous case).
|
||||
return do_with(std::move(role_ptr), [this, &role, &r, perms_cache] (auto& role_ptr) {
|
||||
return load_permissions(role, r, perms_cache);
|
||||
});
|
||||
}
|
||||
|
||||
future<permission_set> cache::load_permissions(const role_or_anonymous& role, const resource& r, std::unordered_map<resource, permission_set>* perms_cache) {
|
||||
SCYLLA_ASSERT(_permission_loader);
|
||||
auto units = co_await get_units(_permission_loader_sem, 1, _as);
|
||||
|
||||
// Check again: we may have been blocked on the semaphore while another call
// loaded the permissions already. This protects against a storm of cache misses.
|
||||
if (auto it = perms_cache->find(r); it != perms_cache->end()) {
|
||||
co_return it->second;
|
||||
}
|
||||
auto perms = co_await _permission_loader(role, r);
|
||||
add_permissions(*perms_cache, r, perms);
|
||||
co_return perms;
|
||||
}
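The check/acquire/re-check shape used by get_permissions() and load_permissions(), as a minimal standalone sketch (the loader parameter is a hypothetical stand-in for _permission_loader):

```cpp
#include <seastar/core/future.hh>
#include <seastar/core/semaphore.hh>
#include <seastar/core/coroutine.hh>
#include <functional>
#include <string>
#include <unordered_map>

// Fibers that miss on the same key serialize on the semaphore; all but the
// first find the value already cached once they get their turn.
seastar::future<int> get_or_load(std::unordered_map<std::string, int>& cache,
                                 seastar::semaphore& loader_sem,
                                 std::function<seastar::future<int>(const std::string&)> loader,
                                 std::string key) {
    if (auto it = cache.find(key); it != cache.end()) {
        co_return it->second;                 // fast path: already cached
    }
    auto units = co_await seastar::get_units(loader_sem, 1);
    if (auto it = cache.find(key); it != cache.end()) {
        co_return it->second;                 // loaded while we waited on the semaphore
    }
    int value = co_await loader(key);
    cache.emplace(key, value);
    co_return value;
}
```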
|
||||
|
||||
future<> cache::prune(const resource& r) {
|
||||
auto units = co_await get_units(_loading_sem, 1, _as);
|
||||
_anonymous_permissions.erase(r);
|
||||
for (auto& it : _roles) {
|
||||
// Pruning can run concurrently with other functions; at worst it causes
// an extra reload of cached_permissions via get_permissions().
|
||||
remove_permissions(it.second->cached_permissions, r);
|
||||
co_await coroutine::maybe_yield();
|
||||
}
|
||||
}
|
||||
|
||||
future<> cache::reload_all_permissions() noexcept {
|
||||
SCYLLA_ASSERT(_permission_loader);
|
||||
auto units = co_await get_units(_loading_sem, 1, _as);
|
||||
auto copy_keys = [] (const std::unordered_map<resource, permission_set>& m) {
|
||||
std::vector<resource> keys;
|
||||
keys.reserve(m.size());
|
||||
for (const auto& [res, _] : m) {
|
||||
keys.push_back(res);
|
||||
}
|
||||
return keys;
|
||||
};
|
||||
const role_or_anonymous anon;
|
||||
for (const auto& res : copy_keys(_anonymous_permissions)) {
|
||||
_anonymous_permissions[res] = co_await _permission_loader(anon, res);
|
||||
}
|
||||
for (auto& [role, entry] : _roles) {
|
||||
auto& perms_cache = entry->cached_permissions;
|
||||
auto r = role_or_anonymous(role);
|
||||
for (const auto& res : copy_keys(perms_cache)) {
|
||||
perms_cache[res] = co_await _permission_loader(r, res);
|
||||
}
|
||||
}
|
||||
logger.debug("Reloaded auth cache with {} entries", _roles.size());
|
||||
}
|
||||
|
||||
future<lw_shared_ptr<cache::role_record>> cache::fetch_role(const role_name_t& role) const {
|
||||
auto rec = make_lw_shared<role_record>();
|
||||
rec->version = _current_version;
|
||||
@@ -105,7 +199,7 @@ future<lw_shared_ptr<cache::role_record>> cache::fetch_role(const role_name_t& r
|
||||
future<> cache::prune_all() noexcept {
|
||||
for (auto it = _roles.begin(); it != _roles.end(); ) {
|
||||
if (it->second->version != _current_version) {
|
||||
_roles.erase(it++);
|
||||
remove_role(it++);
|
||||
co_await coroutine::maybe_yield();
|
||||
} else {
|
||||
++it;
|
||||
@@ -129,7 +223,7 @@ future<> cache::load_all() {
|
||||
const auto name = r.get_as<sstring>("role");
|
||||
auto role = co_await fetch_role(name);
|
||||
if (role) {
|
||||
_roles[name] = role;
|
||||
add_role(name, role);
|
||||
}
|
||||
co_return stop_iteration::no;
|
||||
};
|
||||
@@ -142,11 +236,32 @@ future<> cache::load_all() {
|
||||
co_await distribute_role(name, role);
|
||||
}
|
||||
co_await container().invoke_on_others([this](cache& c) -> future<> {
|
||||
auto units = co_await get_units(c._loading_sem, 1, c._as);
|
||||
c._current_version = _current_version;
|
||||
co_await c.prune_all();
|
||||
});
|
||||
}
|
||||
|
||||
future<> cache::gather_inheriting_roles(std::unordered_set<role_name_t>& roles, lw_shared_ptr<cache::role_record> role, const role_name_t& name) {
|
||||
if (!role) {
|
||||
// The role might have been removed or not yet added; either way,
// its members will be handled by another top-level call to this function.
|
||||
co_return;
|
||||
}
|
||||
for (const auto& member_name : role->members) {
|
||||
bool is_new = roles.insert(member_name).second;
|
||||
if (!is_new) {
|
||||
continue;
|
||||
}
|
||||
lw_shared_ptr<cache::role_record> member_role;
|
||||
auto r = _roles.find(member_name);
|
||||
if (r != _roles.end()) {
|
||||
member_role = r->second;
|
||||
}
|
||||
co_await gather_inheriting_roles(roles, member_role, member_name);
|
||||
}
|
||||
}
|
||||
|
||||
future<> cache::load_roles(std::unordered_set<role_name_t> roles) {
|
||||
if (legacy_mode(_qp)) {
|
||||
co_return;
|
||||
@@ -154,27 +269,41 @@ future<> cache::load_roles(std::unordered_set<role_name_t> roles) {
|
||||
SCYLLA_ASSERT(this_shard_id() == 0);
|
||||
auto units = co_await get_units(_loading_sem, 1, _as);
|
||||
|
||||
std::unordered_set<role_name_t> roles_to_clear_perms;
|
||||
for (const auto& name : roles) {
|
||||
logger.info("Loading role {}", name);
|
||||
auto role = co_await fetch_role(name);
|
||||
if (role) {
|
||||
_roles[name] = role;
|
||||
add_role(name, role);
|
||||
co_await gather_inheriting_roles(roles_to_clear_perms, role, name);
|
||||
} else {
|
||||
_roles.erase(name);
|
||||
if (auto it = _roles.find(name); it != _roles.end()) {
|
||||
auto old_role = it->second;
|
||||
remove_role(it);
|
||||
co_await gather_inheriting_roles(roles_to_clear_perms, old_role, name);
|
||||
}
|
||||
}
|
||||
co_await distribute_role(name, role);
|
||||
}
|
||||
|
||||
co_await container().invoke_on_all([&roles_to_clear_perms] (cache& c) -> future<> {
|
||||
for (const auto& name : roles_to_clear_perms) {
|
||||
c.clear_role_permissions(name);
|
||||
co_await coroutine::maybe_yield();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
future<> cache::distribute_role(const role_name_t& name, lw_shared_ptr<role_record> role) {
|
||||
auto role_ptr = role.get();
|
||||
co_await container().invoke_on_others([&name, role_ptr](cache& c) {
|
||||
co_await container().invoke_on_others([&name, role_ptr](cache& c) -> future<> {
|
||||
auto units = co_await get_units(c._loading_sem, 1, c._as);
|
||||
if (!role_ptr) {
|
||||
c._roles.erase(name);
|
||||
return;
|
||||
c.remove_role(name);
|
||||
co_return;
|
||||
}
|
||||
auto role_copy = make_lw_shared<role_record>(*role_ptr);
|
||||
c._roles[name] = std::move(role_copy);
|
||||
c.add_role(name, std::move(role_copy));
|
||||
});
|
||||
}
|
||||
|
||||
@@ -185,4 +314,40 @@ bool cache::includes_table(const table_id& id) noexcept {
|
||||
|| id == db::system_keyspace::role_permissions()->id();
|
||||
}
|
||||
|
||||
void cache::add_role(const role_name_t& name, lw_shared_ptr<role_record> role) {
|
||||
if (auto it = _roles.find(name); it != _roles.end()) {
|
||||
_cached_permissions_count -= it->second->cached_permissions.size();
|
||||
}
|
||||
_cached_permissions_count += role->cached_permissions.size();
|
||||
_roles[name] = std::move(role);
|
||||
}
|
||||
|
||||
void cache::remove_role(const role_name_t& name) {
|
||||
if (auto it = _roles.find(name); it != _roles.end()) {
|
||||
remove_role(it);
|
||||
}
|
||||
}
|
||||
|
||||
void cache::remove_role(roles_map::iterator it) {
|
||||
_cached_permissions_count -= it->second->cached_permissions.size();
|
||||
_roles.erase(it);
|
||||
}
|
||||
|
||||
void cache::clear_role_permissions(const role_name_t& name) {
|
||||
if (auto it = _roles.find(name); it != _roles.end()) {
|
||||
_cached_permissions_count -= it->second->cached_permissions.size();
|
||||
it->second->cached_permissions.clear();
|
||||
}
|
||||
}
|
||||
|
||||
void cache::add_permissions(std::unordered_map<resource, permission_set>& cache, const resource& r, permission_set perms) {
|
||||
if (cache.emplace(r, perms).second) {
|
||||
++_cached_permissions_count;
|
||||
}
|
||||
}
|
||||
|
||||
void cache::remove_permissions(std::unordered_map<resource, permission_set>& cache, const resource& r) {
|
||||
_cached_permissions_count -= cache.erase(r);
|
||||
}
|
||||
|
||||
} // namespace auth
|
||||
|
||||
@@ -17,11 +17,14 @@
|
||||
#include <seastar/core/sharded.hh>
|
||||
#include <seastar/core/shared_ptr.hh>
|
||||
#include <seastar/core/semaphore.hh>
|
||||
#include <seastar/core/metrics_registration.hh>
|
||||
|
||||
#include <absl/container/flat_hash_map.h>
|
||||
|
||||
#include "auth/permission.hh"
|
||||
#include "auth/common.hh"
|
||||
#include "auth/resource.hh"
|
||||
#include "auth/role_or_anonymous.hh"
|
||||
|
||||
namespace cql3 { class query_processor; }
|
||||
|
||||
@@ -31,6 +34,7 @@ class cache : public peering_sharded_service<cache> {
|
||||
public:
|
||||
using role_name_t = sstring;
|
||||
using version_tag_t = char;
|
||||
using permission_loader_func = std::function<future<permission_set>(const role_or_anonymous&, const resource&)>;
|
||||
|
||||
struct role_record {
|
||||
bool can_login = false;
|
||||
@@ -40,11 +44,19 @@ public:
|
||||
sstring salted_hash;
|
||||
std::unordered_map<sstring, sstring> attributes;
|
||||
std::unordered_map<sstring, permission_set> permissions;
|
||||
private:
|
||||
friend cache;
|
||||
// cached permissions include effects of role's inheritance
|
||||
std::unordered_map<resource, permission_set> cached_permissions;
|
||||
version_tag_t version; // used for seamless cache reloads
|
||||
};
|
||||
|
||||
explicit cache(cql3::query_processor& qp, abort_source& as) noexcept;
|
||||
lw_shared_ptr<const role_record> get(const role_name_t& role) const noexcept;
|
||||
void set_permission_loader(permission_loader_func loader);
|
||||
future<permission_set> get_permissions(const role_or_anonymous& role, const resource& r);
|
||||
future<> prune(const resource& r);
|
||||
future<> reload_all_permissions() noexcept;
|
||||
future<> load_all();
|
||||
future<> load_roles(std::unordered_set<role_name_t> roles);
|
||||
static bool includes_table(const table_id&) noexcept;
|
||||
@@ -52,14 +64,31 @@ public:
|
||||
private:
|
||||
using roles_map = absl::flat_hash_map<role_name_t, lw_shared_ptr<role_record>>;
|
||||
roles_map _roles;
|
||||
// anonymous permissions map exists mainly due to compatibility with
|
||||
// higher layers which use role_or_anonymous to get permissions.
|
||||
std::unordered_map<resource, permission_set> _anonymous_permissions;
|
||||
version_tag_t _current_version;
|
||||
cql3::query_processor& _qp;
|
||||
semaphore _loading_sem;
|
||||
semaphore _loading_sem; // protects iteration of _roles map
|
||||
abort_source& _as;
|
||||
permission_loader_func _permission_loader;
|
||||
semaphore _permission_loader_sem; // protects against reload storms on a single role change
|
||||
metrics::metric_groups _metrics;
|
||||
size_t _cached_permissions_count = 0;
|
||||
|
||||
future<lw_shared_ptr<role_record>> fetch_role(const role_name_t& role) const;
|
||||
future<> prune_all() noexcept;
|
||||
future<> distribute_role(const role_name_t& name, const lw_shared_ptr<role_record> role);
|
||||
future<> gather_inheriting_roles(std::unordered_set<role_name_t>& roles, lw_shared_ptr<cache::role_record> role, const role_name_t& name);
|
||||
|
||||
void add_role(const role_name_t& name, lw_shared_ptr<role_record> role);
|
||||
void remove_role(const role_name_t& name);
|
||||
void remove_role(roles_map::iterator it);
|
||||
void clear_role_permissions(const role_name_t& name);
|
||||
void add_permissions(std::unordered_map<resource, permission_set>& cache, const resource& r, permission_set perms);
|
||||
void remove_permissions(std::unordered_map<resource, permission_set>& cache, const resource& r);
|
||||
|
||||
future<permission_set> load_permissions(const role_or_anonymous& role, const resource& r, std::unordered_map<resource, permission_set>* perms_cache);
|
||||
};
|
||||
|
||||
} // namespace auth
|
||||
|
||||
@@ -88,10 +88,16 @@ static const class_registrator<
|
||||
|
||||
ldap_role_manager::ldap_role_manager(
|
||||
std::string_view query_template, std::string_view target_attr, std::string_view bind_name, std::string_view bind_password,
|
||||
uint32_t permissions_update_interval_in_ms,
|
||||
utils::observer<uint32_t> permissions_update_interval_in_ms_observer,
|
||||
cql3::query_processor& qp, ::service::raft_group0_client& rg0c, ::service::migration_manager& mm, cache& cache)
|
||||
: _std_mgr(qp, rg0c, mm, cache), _group0_client(rg0c), _query_template(query_template), _target_attr(target_attr), _bind_name(bind_name)
|
||||
, _bind_password(bind_password)
|
||||
, _connection_factory(bind(std::mem_fn(&ldap_role_manager::reconnect), std::ref(*this))) {
|
||||
, _permissions_update_interval_in_ms(permissions_update_interval_in_ms)
|
||||
, _permissions_update_interval_in_ms_observer(std::move(permissions_update_interval_in_ms_observer))
|
||||
, _connection_factory(bind(std::mem_fn(&ldap_role_manager::reconnect), std::ref(*this)))
|
||||
, _cache(cache)
|
||||
, _cache_pruner(make_ready_future<>()) {
|
||||
}
|
||||
|
||||
ldap_role_manager::ldap_role_manager(cql3::query_processor& qp, ::service::raft_group0_client& rg0c, ::service::migration_manager& mm, cache& cache)
|
||||
@@ -100,6 +106,8 @@ ldap_role_manager::ldap_role_manager(cql3::query_processor& qp, ::service::raft_
|
||||
qp.db().get_config().ldap_attr_role(),
|
||||
qp.db().get_config().ldap_bind_dn(),
|
||||
qp.db().get_config().ldap_bind_passwd(),
|
||||
qp.db().get_config().permissions_update_interval_in_ms(),
|
||||
qp.db().get_config().permissions_update_interval_in_ms.observe([this] (const uint32_t& v) { _permissions_update_interval_in_ms = v; }),
|
||||
qp,
|
||||
rg0c,
|
||||
mm,
|
||||
@@ -119,6 +127,22 @@ future<> ldap_role_manager::start() {
|
||||
return make_exception_future(
|
||||
std::runtime_error(fmt::format("error getting LDAP server address from template {}", _query_template)));
|
||||
}
|
||||
_cache_pruner = futurize_invoke([this] () -> future<> {
|
||||
while (true) {
|
||||
try {
|
||||
co_await seastar::sleep_abortable(std::chrono::milliseconds(_permissions_update_interval_in_ms), _as);
|
||||
} catch (const seastar::sleep_aborted&) {
|
||||
co_return; // ignore
|
||||
}
|
||||
co_await _cache.container().invoke_on_all([] (cache& c) -> future<> {
|
||||
try {
|
||||
co_await c.reload_all_permissions();
|
||||
} catch (...) {
|
||||
mylog.warn("Cache reload all permissions failed: {}", std::current_exception());
|
||||
}
|
||||
});
|
||||
}
|
||||
});
|
||||
return _std_mgr.start();
|
||||
}
|
||||
|
||||
@@ -175,7 +199,11 @@ future<conn_ptr> ldap_role_manager::reconnect() {
|
||||
|
||||
future<> ldap_role_manager::stop() {
|
||||
_as.request_abort();
|
||||
return _std_mgr.stop().then([this] { return _connection_factory.stop(); });
|
||||
return std::move(_cache_pruner).then([this] {
|
||||
return _std_mgr.stop();
|
||||
}).then([this] {
|
||||
return _connection_factory.stop();
|
||||
});
|
||||
}
|
||||
|
||||
future<> ldap_role_manager::create(std::string_view name, const role_config& config, ::service::group0_batch& mc) {
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
#pragma once
|
||||
|
||||
#include <seastar/core/abort_source.hh>
|
||||
#include <seastar/core/future.hh>
|
||||
#include <stdexcept>
|
||||
|
||||
#include "ent/ldap/ldap_connection.hh"
|
||||
@@ -34,14 +35,22 @@ class ldap_role_manager : public role_manager {
|
||||
seastar::sstring _target_attr; ///< LDAP entry attribute containing the Scylla role name.
|
||||
seastar::sstring _bind_name; ///< Username for LDAP simple bind.
|
||||
seastar::sstring _bind_password; ///< Password for LDAP simple bind.
|
||||
|
||||
uint32_t _permissions_update_interval_in_ms;
|
||||
utils::observer<uint32_t> _permissions_update_interval_in_ms_observer;
|
||||
|
||||
mutable ldap_reuser _connection_factory; // Potentially modified by query_granted().
|
||||
seastar::abort_source _as;
|
||||
cache& _cache;
|
||||
seastar::future<> _cache_pruner;
|
||||
public:
|
||||
ldap_role_manager(
|
||||
std::string_view query_template, ///< LDAP query template as described in Scylla documentation.
|
||||
std::string_view target_attr, ///< LDAP entry attribute containing the Scylla role name.
|
||||
std::string_view bind_name, ///< LDAP bind credentials.
|
||||
std::string_view bind_password, ///< LDAP bind credentials.
|
||||
uint32_t permissions_update_interval_in_ms,
|
||||
utils::observer<uint32_t> permissions_update_interval_in_ms_observer,
|
||||
cql3::query_processor& qp, ///< Passed to standard_role_manager.
|
||||
::service::raft_group0_client& rg0c, ///< Passed to standard_role_manager.
|
||||
::service::migration_manager& mm, ///< Passed to standard_role_manager.
|
||||
|
||||
@@ -1,38 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2017-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
|
||||
#include "auth/permissions_cache.hh"
|
||||
|
||||
#include <fmt/ranges.h>
|
||||
#include "auth/authorizer.hh"
|
||||
#include "auth/service.hh"
|
||||
|
||||
namespace auth {
|
||||
|
||||
permissions_cache::permissions_cache(const utils::loading_cache_config& c, service& ser, logging::logger& log)
|
||||
: _cache(c, log, [&ser, &log](const key_type& k) {
|
||||
log.debug("Refreshing permissions for {}", k.first);
|
||||
return ser.get_uncached_permissions(k.first, k.second);
|
||||
}) {
|
||||
}
|
||||
|
||||
bool permissions_cache::update_config(utils::loading_cache_config c) {
|
||||
return _cache.update_config(std::move(c));
|
||||
}
|
||||
|
||||
void permissions_cache::reset() {
|
||||
_cache.reset();
|
||||
}
|
||||
|
||||
future<permission_set> permissions_cache::get(const role_or_anonymous& maybe_role, const resource& r) {
|
||||
return do_with(key_type(maybe_role, r), [this](const auto& k) {
|
||||
return _cache.get(k);
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
@@ -1,66 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2017-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <iostream>
|
||||
#include <utility>
|
||||
|
||||
#include <fmt/core.h>
|
||||
#include <seastar/core/future.hh>
|
||||
|
||||
#include "auth/permission.hh"
|
||||
#include "auth/resource.hh"
|
||||
#include "auth/role_or_anonymous.hh"
|
||||
#include "utils/log.hh"
|
||||
#include "utils/hash.hh"
|
||||
#include "utils/loading_cache.hh"
|
||||
|
||||
namespace std {
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& os, const pair<auth::role_or_anonymous, auth::resource>& p) {
|
||||
fmt::print(os, "{{role: {}, resource: {}}}", p.first, p.second);
|
||||
return os;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
namespace db {
|
||||
class config;
|
||||
}
|
||||
|
||||
namespace auth {
|
||||
|
||||
class service;
|
||||
|
||||
class permissions_cache final {
|
||||
using cache_type = utils::loading_cache<
|
||||
std::pair<role_or_anonymous, resource>,
|
||||
permission_set,
|
||||
1,
|
||||
utils::loading_cache_reload_enabled::yes,
|
||||
utils::simple_entry_size<permission_set>,
|
||||
utils::tuple_hash>;
|
||||
|
||||
using key_type = typename cache_type::key_type;
|
||||
|
||||
cache_type _cache;
|
||||
|
||||
public:
|
||||
explicit permissions_cache(const utils::loading_cache_config&, service&, logging::logger&);
|
||||
|
||||
future <> stop() {
|
||||
return _cache.stop();
|
||||
}
|
||||
|
||||
bool update_config(utils::loading_cache_config);
|
||||
void reset();
|
||||
future<permission_set> get(const role_or_anonymous&, const resource&);
|
||||
};
|
||||
|
||||
}
|
||||
@@ -64,11 +64,11 @@ static const sstring superuser_col_name("super");
|
||||
static logging::logger log("auth_service");
|
||||
|
||||
class auth_migration_listener final : public ::service::migration_listener {
|
||||
authorizer& _authorizer;
|
||||
service& _service;
|
||||
cql3::query_processor& _qp;
|
||||
|
||||
public:
|
||||
explicit auth_migration_listener(authorizer& a, cql3::query_processor& qp) : _authorizer(a), _qp(qp) {
|
||||
explicit auth_migration_listener(service& s, cql3::query_processor& qp) : _service(s), _qp(qp) {
|
||||
}
|
||||
|
||||
private:
|
||||
@@ -92,14 +92,14 @@ private:
|
||||
return;
|
||||
}
|
||||
// Do it in the background.
|
||||
(void)do_with(::service::group0_batch::unused(), [this, &ks_name] (auto& mc) mutable {
|
||||
return _authorizer.revoke_all(auth::make_data_resource(ks_name), mc);
|
||||
(void)do_with(auth::make_data_resource(ks_name), ::service::group0_batch::unused(), [this] (auto& r, auto& mc) mutable {
|
||||
return _service.revoke_all(r, mc);
|
||||
}).handle_exception([] (std::exception_ptr e) {
|
||||
log.error("Unexpected exception while revoking all permissions on dropped keyspace: {}", e);
|
||||
});
|
||||
|
||||
(void)do_with(::service::group0_batch::unused(), [this, &ks_name] (auto& mc) mutable {
|
||||
return _authorizer.revoke_all(auth::make_functions_resource(ks_name), mc);
|
||||
(void)do_with(auth::make_functions_resource(ks_name), ::service::group0_batch::unused(), [this] (auto& r, auto& mc) mutable {
|
||||
return _service.revoke_all(r, mc);
|
||||
}).handle_exception([] (std::exception_ptr e) {
|
||||
log.error("Unexpected exception while revoking all permissions on functions in dropped keyspace: {}", e);
|
||||
});
|
||||
@@ -111,9 +111,8 @@ private:
|
||||
return;
|
||||
}
|
||||
// Do it in the background.
|
||||
(void)do_with(::service::group0_batch::unused(), [this, &ks_name, &cf_name] (auto& mc) mutable {
|
||||
return _authorizer.revoke_all(
|
||||
auth::make_data_resource(ks_name, cf_name), mc);
|
||||
(void)do_with(auth::make_data_resource(ks_name, cf_name), ::service::group0_batch::unused(), [this] (auto& r, auto& mc) mutable {
|
||||
return _service.revoke_all(r, mc);
|
||||
}).handle_exception([] (std::exception_ptr e) {
|
||||
log.error("Unexpected exception while revoking all permissions on dropped table: {}", e);
|
||||
});
|
||||
@@ -126,9 +125,8 @@ private:
|
||||
return;
|
||||
}
|
||||
// Do it in the background.
|
||||
(void)do_with(::service::group0_batch::unused(), [this, &ks_name, &function_name] (auto& mc) mutable {
|
||||
return _authorizer.revoke_all(
|
||||
auth::make_functions_resource(ks_name, function_name), mc);
|
||||
(void)do_with(auth::make_functions_resource(ks_name, function_name), ::service::group0_batch::unused(), [this] (auto& r, auto& mc) mutable {
|
||||
return _service.revoke_all(r, mc);
|
||||
}).handle_exception([] (std::exception_ptr e) {
|
||||
log.error("Unexpected exception while revoking all permissions on dropped function: {}", e);
|
||||
});
|
||||
@@ -138,9 +136,8 @@ private:
|
||||
// in non legacy path revoke is part of schema change statement execution
|
||||
return;
|
||||
}
|
||||
(void)do_with(::service::group0_batch::unused(), [this, &ks_name, &aggregate_name] (auto& mc) mutable {
|
||||
return _authorizer.revoke_all(
|
||||
auth::make_functions_resource(ks_name, aggregate_name), mc);
|
||||
(void)do_with(auth::make_functions_resource(ks_name, aggregate_name), ::service::group0_batch::unused(), [this] (auto& r, auto& mc) mutable {
|
||||
return _service.revoke_all(r, mc);
|
||||
}).handle_exception([] (std::exception_ptr e) {
|
||||
log.error("Unexpected exception while revoking all permissions on dropped aggregate: {}", e);
|
||||
});
|
||||
@@ -157,7 +154,6 @@ static future<> validate_role_exists(const service& ser, std::string_view role_n
|
||||
}
|
||||
|
||||
service::service(
|
||||
utils::loading_cache_config c,
|
||||
cache& cache,
|
||||
cql3::query_processor& qp,
|
||||
::service::raft_group0_client& g0,
|
||||
@@ -166,25 +162,17 @@ service::service(
|
||||
std::unique_ptr<authenticator> a,
|
||||
std::unique_ptr<role_manager> r,
|
||||
maintenance_socket_enabled used_by_maintenance_socket)
|
||||
: _loading_cache_config(std::move(c))
|
||||
, _permissions_cache(nullptr)
|
||||
, _cache(cache)
|
||||
: _cache(cache)
|
||||
, _qp(qp)
|
||||
, _group0_client(g0)
|
||||
, _mnotifier(mn)
|
||||
, _authorizer(std::move(z))
|
||||
, _authenticator(std::move(a))
|
||||
, _role_manager(std::move(r))
|
||||
, _migration_listener(std::make_unique<auth_migration_listener>(*_authorizer, qp))
|
||||
, _permissions_cache_cfg_cb([this] (uint32_t) { (void) _permissions_cache_config_action.trigger_later(); })
|
||||
, _permissions_cache_config_action([this] { update_cache_config(); return make_ready_future<>(); })
|
||||
, _permissions_cache_max_entries_observer(_qp.db().get_config().permissions_cache_max_entries.observe(_permissions_cache_cfg_cb))
|
||||
, _permissions_cache_update_interval_in_ms_observer(_qp.db().get_config().permissions_update_interval_in_ms.observe(_permissions_cache_cfg_cb))
|
||||
, _permissions_cache_validity_in_ms_observer(_qp.db().get_config().permissions_validity_in_ms.observe(_permissions_cache_cfg_cb))
|
||||
, _migration_listener(std::make_unique<auth_migration_listener>(*this, qp))
|
||||
, _used_by_maintenance_socket(used_by_maintenance_socket) {}
|
||||
|
||||
service::service(
|
||||
utils::loading_cache_config c,
|
||||
cql3::query_processor& qp,
|
||||
::service::raft_group0_client& g0,
|
||||
::service::migration_notifier& mn,
|
||||
@@ -193,7 +181,6 @@ service::service(
|
||||
maintenance_socket_enabled used_by_maintenance_socket,
|
||||
cache& cache)
|
||||
: service(
|
||||
std::move(c),
|
||||
cache,
|
||||
qp,
|
||||
g0,
|
||||
@@ -257,7 +244,14 @@ future<> service::start(::service::migration_manager& mm, db::system_keyspace& s
|
||||
co_await _role_manager->ensure_superuser_is_created();
|
||||
}
|
||||
co_await when_all_succeed(_authorizer->start(), _authenticator->start()).discard_result();
|
||||
_permissions_cache = std::make_unique<permissions_cache>(_loading_cache_config, *this, log);
|
||||
if (!_used_by_maintenance_socket) {
|
||||
// Maintenance socket mode can't cache permissions because it uses a
// different authorizer. We can't mix cached permissions; they could
// differ from those used in normal mode.
|
||||
_cache.set_permission_loader(std::bind(
|
||||
&service::get_uncached_permissions,
|
||||
this, std::placeholders::_1, std::placeholders::_2));
|
||||
}
|
||||
co_await once_among_shards([this] {
|
||||
_mnotifier.register_listener(_migration_listener.get());
|
||||
return make_ready_future<>();
|
||||
@@ -269,9 +263,7 @@ future<> service::stop() {
|
||||
// Only one of the shards has the listener registered, but let's try to
|
||||
// unregister on each one just to make sure.
|
||||
return _mnotifier.unregister_listener(_migration_listener.get()).then([this] {
|
||||
if (_permissions_cache) {
|
||||
return _permissions_cache->stop();
|
||||
}
|
||||
_cache.set_permission_loader(nullptr);
|
||||
return make_ready_future<>();
|
||||
}).then([this] {
|
||||
return when_all_succeed(_role_manager->stop(), _authorizer->stop(), _authenticator->stop()).discard_result();
|
||||
@@ -283,21 +275,8 @@ future<> service::ensure_superuser_is_created() {
|
||||
co_await _authenticator->ensure_superuser_is_created();
|
||||
}
|
||||
|
||||
void service::update_cache_config() {
|
||||
auto db = _qp.db();
|
||||
|
||||
utils::loading_cache_config perm_cache_config;
|
||||
perm_cache_config.max_size = db.get_config().permissions_cache_max_entries();
|
||||
perm_cache_config.expiry = std::chrono::milliseconds(db.get_config().permissions_validity_in_ms());
|
||||
perm_cache_config.refresh = std::chrono::milliseconds(db.get_config().permissions_update_interval_in_ms());
|
||||
|
||||
if (!_permissions_cache->update_config(std::move(perm_cache_config))) {
|
||||
log.error("Failed to apply permissions cache changes. Please read the documentation of these parameters");
|
||||
}
|
||||
}
|
||||
|
||||
void service::reset_authorization_cache() {
|
||||
_permissions_cache->reset();
|
||||
_qp.reset_cache();
|
||||
}
|
||||
|
||||
@@ -322,7 +301,10 @@ service::get_uncached_permissions(const role_or_anonymous& maybe_role, const res
|
||||
}
|
||||
|
||||
future<permission_set> service::get_permissions(const role_or_anonymous& maybe_role, const resource& r) const {
|
||||
return _permissions_cache->get(maybe_role, r);
|
||||
if (legacy_mode(_qp) || _used_by_maintenance_socket) {
|
||||
return get_uncached_permissions(maybe_role, r);
|
||||
}
|
||||
return _cache.get_permissions(maybe_role, r);
|
||||
}
|
||||
|
||||
future<bool> service::has_superuser(std::string_view role_name, const role_set& roles) const {
|
||||
@@ -447,6 +429,11 @@ future<bool> service::exists(const resource& r) const {
|
||||
return make_ready_future<bool>(false);
|
||||
}
|
||||
|
||||
future<> service::revoke_all(const resource& r, ::service::group0_batch& mc) const {
|
||||
co_await _authorizer->revoke_all(r, mc);
|
||||
co_await _cache.prune(r);
|
||||
}
|
||||
|
||||
future<std::vector<cql3::description>> service::describe_roles(bool with_hashed_passwords) {
|
||||
std::vector<cql3::description> result{};
|
||||
|
||||
@@ -801,7 +788,7 @@ future<> revoke_permissions(
|
||||
}
|
||||
|
||||
future<> revoke_all(const service& ser, const resource& r, ::service::group0_batch& mc) {
|
||||
return ser.underlying_authorizer().revoke_all(r, mc);
|
||||
return ser.revoke_all(r, mc);
|
||||
}
|
||||
|
||||
future<std::vector<permission_details>> list_filtered_permissions(
|
||||
|
||||
@@ -20,7 +20,6 @@
|
||||
#include "auth/authenticator.hh"
|
||||
#include "auth/authorizer.hh"
|
||||
#include "auth/permission.hh"
|
||||
#include "auth/permissions_cache.hh"
|
||||
#include "auth/cache.hh"
|
||||
#include "auth/role_manager.hh"
|
||||
#include "auth/common.hh"
|
||||
@@ -75,8 +74,6 @@ public:
|
||||
/// peering_sharded_service inheritance is needed to be able to access shard local authentication service
|
||||
/// given an object from another shard. Used for bouncing lwt requests to correct shard.
|
||||
class service final : public seastar::peering_sharded_service<service> {
|
||||
utils::loading_cache_config _loading_cache_config;
|
||||
std::unique_ptr<permissions_cache> _permissions_cache;
|
||||
cache& _cache;
|
||||
|
||||
cql3::query_processor& _qp;
|
||||
@@ -94,20 +91,12 @@ class service final : public seastar::peering_sharded_service<service> {
|
||||
// Only one of these should be registered, so we end up with some unused instances. Not the end of the world.
|
||||
std::unique_ptr<::service::migration_listener> _migration_listener;
|
||||
|
||||
std::function<void(uint32_t)> _permissions_cache_cfg_cb;
|
||||
serialized_action _permissions_cache_config_action;
|
||||
|
||||
utils::observer<uint32_t> _permissions_cache_max_entries_observer;
|
||||
utils::observer<uint32_t> _permissions_cache_update_interval_in_ms_observer;
|
||||
utils::observer<uint32_t> _permissions_cache_validity_in_ms_observer;
|
||||
|
||||
maintenance_socket_enabled _used_by_maintenance_socket;
|
||||
|
||||
abort_source _as;
|
||||
|
||||
public:
|
||||
service(
|
||||
utils::loading_cache_config,
|
||||
cache& cache,
|
||||
cql3::query_processor&,
|
||||
::service::raft_group0_client&,
|
||||
@@ -123,7 +112,6 @@ public:
|
||||
/// of the instances themselves.
|
||||
///
|
||||
service(
|
||||
utils::loading_cache_config,
|
||||
cql3::query_processor&,
|
||||
::service::raft_group0_client&,
|
||||
::service::migration_notifier&,
|
||||
@@ -138,8 +126,6 @@ public:
|
||||
|
||||
future<> ensure_superuser_is_created();
|
||||
|
||||
void update_cache_config();
|
||||
|
||||
void reset_authorization_cache();
|
||||
|
||||
///
|
||||
@@ -181,6 +167,13 @@ public:
|
||||
|
||||
future<bool> exists(const resource&) const;
|
||||
|
||||
///
|
||||
/// Revoke all permissions granted to any role for a particular resource.
|
||||
///
|
||||
/// \throws \ref unsupported_authorization_operation if revoking permissions is not supported.
|
||||
///
|
||||
future<> revoke_all(const resource&, ::service::group0_batch&) const;
|
||||
|
||||
///
|
||||
/// Produces descriptions that can be used to restore the state of auth. That encompasses
|
||||
/// roles, role grants, and permission grants.
|
||||
|
||||
@@ -299,13 +299,11 @@ batch_size_fail_threshold_in_kb: 1024
|
||||
# max_hint_window_in_ms: 10800000 # 3 hours
|
||||
|
||||
|
||||
# Validity period for permissions cache (fetching permissions can be an
|
||||
# expensive operation depending on the authorizer, CassandraAuthorizer is
|
||||
# one example). Defaults to 10000, set to 0 to disable.
|
||||
# Validity period for authorized statements cache. Defaults to 10000, set to 0 to disable.
|
||||
# Will be disabled automatically for AllowAllAuthorizer.
|
||||
# permissions_validity_in_ms: 10000
|
||||
|
||||
# Refresh interval for permissions cache (if enabled).
|
||||
# Refresh interval for authorized statements cache.
|
||||
# After this interval, cache entries become eligible for refresh. Upon next
|
||||
# access, an async reload is scheduled and the old value returned until it
|
||||
# completes. If permissions_validity_in_ms is non-zero, then this also must have
|
||||
@@ -566,15 +564,16 @@ commitlog_total_space_in_mb: -1
|
||||
# prometheus_address: 1.2.3.4
|
||||
|
||||
# audit settings
|
||||
# By default, Scylla does not audit anything.
|
||||
# Table audit is enabled by default.
|
||||
# 'audit' config option controls if and where to output audited events:
|
||||
# - "none": auditing is disabled (default)
|
||||
# - "table": save audited events in audit.audit_log column family
|
||||
# - "none": auditing is disabled
|
||||
# - "table": save audited events in audit.audit_log column family (default)
|
||||
# - "syslog": send audited events via syslog (depends on OS, but usually to /dev/log)
|
||||
audit: "table"
|
||||
#
|
||||
# List of statement categories that should be audited.
|
||||
audit_categories: "DCL,DDL,AUTH,ADMIN"
|
||||
# Possible categories are: QUERY, DML, DCL, DDL, AUTH, ADMIN
|
||||
audit_categories: "DCL,AUTH,ADMIN"
|
||||
#
|
||||
# List of tables that should be audited.
|
||||
# audit_tables: "<keyspace_name>.<table_name>,<keyspace_name>.<table_name>"
|
||||
|
||||
@@ -1192,6 +1192,7 @@ scylla_core = (['message/messaging_service.cc',
|
||||
'utils/azure/identity/default_credentials.cc',
|
||||
'utils/gcp/gcp_credentials.cc',
|
||||
'utils/gcp/object_storage.cc',
|
||||
'utils/gcp/object_storage_retry_strategy.cc',
|
||||
'gms/version_generator.cc',
|
||||
'gms/versioned_value.cc',
|
||||
'gms/gossiper.cc',
|
||||
@@ -1276,7 +1277,6 @@ scylla_core = (['message/messaging_service.cc',
|
||||
'auth/passwords.cc',
|
||||
'auth/password_authenticator.cc',
|
||||
'auth/permission.cc',
|
||||
'auth/permissions_cache.cc',
|
||||
'auth/service.cc',
|
||||
'auth/standard_role_manager.cc',
|
||||
'auth/ldap_role_manager.cc',
|
||||
@@ -1646,6 +1646,7 @@ for t in sorted(perf_tests):
|
||||
|
||||
deps['test/boost/combined_tests'] += [
|
||||
'test/boost/aggregate_fcts_test.cc',
|
||||
'test/boost/auth_cache_test.cc',
|
||||
'test/boost/auth_test.cc',
|
||||
'test/boost/batchlog_manager_test.cc',
|
||||
'test/boost/cache_algorithm_test.cc',
|
||||
|
||||
@@ -69,7 +69,7 @@ float compute_cosine_similarity(std::span<const float> v1, std::span<const float
|
||||
}
|
||||
|
||||
if (squared_norm_a == 0 || squared_norm_b == 0) {
|
||||
throw exceptions::invalid_request_exception("Function system.similarity_cosine doesn't support all-zero vectors");
|
||||
return std::numeric_limits<float>::quiet_NaN();
|
||||
}
|
||||
|
||||
// The cosine similarity is in the range [-1, 1].
|
||||
|
||||
16
db/config.cc
@@ -1201,13 +1201,13 @@ db::config::config(std::shared_ptr<db::extensions> exts)
|
||||
"* org.apache.cassandra.auth.CassandraRoleManager: Stores role data in the system_auth keyspace;\n"
|
||||
"* com.scylladb.auth.LDAPRoleManager: Fetches role data from an LDAP server.")
|
||||
, permissions_validity_in_ms(this, "permissions_validity_in_ms", liveness::LiveUpdate, value_status::Used, 10000,
|
||||
"How long permissions in cache remain valid. Depending on the authorizer, such as CassandraAuthorizer, fetching permissions can be resource intensive. Permissions caching is disabled when this property is set to 0 or when AllowAllAuthorizer is used. The cached value is considered valid as long as both its value is not older than the permissions_validity_in_ms "
|
||||
"How long authorized statements cache entries remain valid. The cached value is considered valid as long as both its value is not older than the permissions_validity_in_ms "
|
||||
"and the cached value has been read at least once during the permissions_validity_in_ms time frame. If any of these two conditions doesn't hold the cached value is going to be evicted from the cache.\n"
|
||||
"\n"
|
||||
"Related information: Object permissions")
|
||||
, permissions_update_interval_in_ms(this, "permissions_update_interval_in_ms", liveness::LiveUpdate, value_status::Used, 2000,
|
||||
"Refresh interval for permissions cache (if enabled). After this interval, cache entries become eligible for refresh. An async reload is scheduled every permissions_update_interval_in_ms time period and the old value is returned until it completes. If permissions_validity_in_ms has a non-zero value, then this property must also have a non-zero value. It's recommended to set this value to be at least 3 times smaller than the permissions_validity_in_ms.")
|
||||
, permissions_cache_max_entries(this, "permissions_cache_max_entries", liveness::LiveUpdate, value_status::Used, 1000,
|
||||
"Refresh interval for authorized statements cache. After this interval, cache entries become eligible for refresh. An async reload is scheduled every permissions_update_interval_in_ms time period and the old value is returned until it completes. If permissions_validity_in_ms has a non-zero value, then this property must also have a non-zero value. It's recommended to set this value to be at least 3 times smaller than the permissions_validity_in_ms. This option additionally controls the permissions refresh interval for LDAP.")
|
||||
, permissions_cache_max_entries(this, "permissions_cache_max_entries", liveness::LiveUpdate, value_status::Unused, 1000,
|
||||
"Maximum cached permission entries. Must have a non-zero value if permissions caching is enabled (see a permissions_validity_in_ms description).")
|
||||
, server_encryption_options(this, "server_encryption_options", value_status::Used, {/*none*/},
|
||||
"Enable or disable inter-node encryption. You must also generate keys and provide the appropriate key and trust store locations and passwords. The available options are:\n"
|
||||
@@ -1272,7 +1272,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
|
||||
, ignore_dead_nodes_for_replace(this, "ignore_dead_nodes_for_replace", value_status::Used, "", "List dead nodes to ignore for replace operation using a comma-separated list of host IDs. E.g., scylla --ignore-dead-nodes-for-replace 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c,125ed9f4-7777-1dbn-mac8-43fddce9123e")
|
||||
, override_decommission(this, "override_decommission", value_status::Deprecated, false, "Set true to force a decommissioned node to join the cluster (cannot be set if consistent-cluster-management is enabled).")
|
||||
, enable_repair_based_node_ops(this, "enable_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, true, "Set true to enable repair-based node operations instead of streaming-based ones.")
|
||||
, allowed_repair_based_node_ops(this, "allowed_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, "replace,removenode,rebuild,bootstrap,decommission", "A comma separated list of node operations which are allowed to enable repair based node operations. The operations can be bootstrap, replace, removenode, decommission and rebuild.")
|
||||
, allowed_repair_based_node_ops(this, "allowed_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, "replace,removenode,rebuild", "A comma separated list of node operations which are allowed to enable repair based node operations. The operations can be bootstrap, replace, removenode, decommission and rebuild.")
|
||||
, enable_compacting_data_for_streaming_and_repair(this, "enable_compacting_data_for_streaming_and_repair", liveness::LiveUpdate, value_status::Used, true, "Enable the compacting reader, which compacts the data for streaming and repair (load'n'stream included) before sending it to, or synchronizing it with peers. Can reduce the amount of data to be processed by removing dead data, but adds CPU overhead.")
|
||||
, enable_tombstone_gc_for_streaming_and_repair(this, "enable_tombstone_gc_for_streaming_and_repair", liveness::LiveUpdate, value_status::Used, false,
|
||||
"If the compacting reader is enabled for streaming and repair (see enable_compacting_data_for_streaming_and_repair), allow it to garbage-collect tombstones."
|
||||
@@ -1527,17 +1527,21 @@ db::config::config(std::shared_ptr<db::extensions> exts)
|
||||
"Allows target tablet size to be configured. Defaults to 5G (in bytes). Maintaining tablets at reasonable sizes is important to be able to " \
|
||||
"redistribute load. A higher value means tablet migration throughput can be reduced. A lower value may cause number of tablets to increase significantly, " \
|
||||
"potentially resulting in performance drawbacks.")
|
||||
, tablet_streaming_read_concurrency_per_shard(this, "tablet_streaming_read_concurrency_per_shard", liveness::LiveUpdate, value_status::Used, 2,
|
||||
"Maximum number of tablets which may be leaving a shard at the same time. Effecting only on topology coordinator. Set to the same value on all nodes.")
|
||||
, tablet_streaming_write_concurrency_per_shard(this, "tablet_streaming_write_concurrency_per_shard", liveness::LiveUpdate, value_status::Used, 2,
|
||||
"Maximum number of tablets which may be pending on a shard at the same time. Effecting only on topology coordinator. Set to the same value on all nodes.")
|
||||
, replication_strategy_warn_list(this, "replication_strategy_warn_list", liveness::LiveUpdate, value_status::Used, {locator::replication_strategy_type::simple}, "Controls which replication strategies to warn about when creating/altering a keyspace. Doesn't affect the pre-existing keyspaces.")
|
||||
, replication_strategy_fail_list(this, "replication_strategy_fail_list", liveness::LiveUpdate, value_status::Used, {}, "Controls which replication strategies are disallowed to be used when creating/altering a keyspace. Doesn't affect the pre-existing keyspaces.")
|
||||
, service_levels_interval(this, "service_levels_interval_ms", liveness::LiveUpdate, value_status::Used, 10000, "Controls how often service levels module polls configuration table")
|
||||
|
||||
, audit(this, "audit", value_status::Used, "none",
|
||||
, audit(this, "audit", value_status::Used, "table",
|
||||
"Controls the audit feature:\n"
|
||||
"\n"
|
||||
"\tnone : No auditing enabled.\n"
|
||||
"\tsyslog : Audit messages sent to Syslog.\n"
|
||||
"\ttable : Audit messages written to column family named audit.audit_log.\n")
|
||||
, audit_categories(this, "audit_categories", liveness::LiveUpdate, value_status::Used, "DCL,DDL,AUTH", "Comma separated list of operation categories that should be audited.")
|
||||
, audit_categories(this, "audit_categories", liveness::LiveUpdate, value_status::Used, "DCL,AUTH,ADMIN", "Comma separated list of operation categories that should be audited.")
|
||||
, audit_tables(this, "audit_tables", liveness::LiveUpdate, value_status::Used, "", "Comma separated list of table names (<keyspace>.<table>) that will be audited.")
|
||||
, audit_keyspaces(this, "audit_keyspaces", liveness::LiveUpdate, value_status::Used, "", "Comma separated list of keyspaces that will be audited. All tables in those keyspaces will be audited")
|
||||
, audit_unix_socket_path(this, "audit_unix_socket_path", value_status::Used, "/dev/log", "The path to the unix socket used for writing to syslog. Only applicable when audit is set to syslog.")
|
||||
|
||||
@@ -542,6 +542,8 @@ public:
|
||||
named_value<double> tablets_initial_scale_factor;
|
||||
named_value<unsigned> tablets_per_shard_goal;
|
||||
named_value<uint64_t> target_tablet_size_in_bytes;
|
||||
named_value<unsigned> tablet_streaming_read_concurrency_per_shard;
|
||||
named_value<unsigned> tablet_streaming_write_concurrency_per_shard;
|
||||
|
||||
named_value<std::vector<enum_option<replication_strategy_restriction_t>>> replication_strategy_warn_list;
|
||||
named_value<std::vector<enum_option<replication_strategy_restriction_t>>> replication_strategy_fail_list;
|
||||
|
||||
@@ -1714,7 +1714,9 @@ std::unordered_set<dht::token> decode_tokens(const set_type_impl::native_type& t
|
||||
std::unordered_set<dht::token> tset;
|
||||
for (auto& t: tokens) {
|
||||
auto str = value_cast<sstring>(t);
|
||||
SCYLLA_ASSERT(str == dht::token::from_sstring(str).to_sstring());
|
||||
if (str != dht::token::from_sstring(str).to_sstring()) {
|
||||
on_internal_error(slogger, format("decode_tokens: invalid token string '{}'", str));
|
||||
}
|
||||
tset.insert(dht::token::from_sstring(str));
|
||||
}
|
||||
return tset;
|
||||
@@ -3191,7 +3193,7 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
|
||||
};
|
||||
}
|
||||
} else if (must_have_tokens(nstate)) {
|
||||
on_fatal_internal_error(slogger, format(
|
||||
on_internal_error(slogger, format(
|
||||
"load_topology_state: node {} in {} state but missing ring slice", host_id, nstate));
|
||||
}
|
||||
}
|
||||
@@ -3273,7 +3275,7 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
|
||||
// Currently, at most one node at a time can be in transitioning state.
|
||||
if (!map->empty()) {
|
||||
const auto& [other_id, other_rs] = *map->begin();
|
||||
on_fatal_internal_error(slogger, format(
|
||||
on_internal_error(slogger, format(
|
||||
"load_topology_state: found two nodes in transitioning state: {} in {} state and {} in {} state",
|
||||
other_id, other_rs.state, host_id, nstate));
|
||||
}
|
||||
@@ -3331,8 +3333,7 @@ future<service::topology> system_keyspace::load_topology_state(const std::unorde
|
||||
format("SELECT count(range_end) as cnt FROM {}.{} WHERE key = '{}' AND id = ?",
|
||||
NAME, CDC_GENERATIONS_V3, cdc::CDC_GENERATIONS_V3_KEY),
|
||||
gen_id.id);
|
||||
SCYLLA_ASSERT(gen_rows);
|
||||
if (gen_rows->empty()) {
|
||||
if (!gen_rows || gen_rows->empty()) {
|
||||
on_internal_error(slogger, format(
|
||||
"load_topology_state: last committed CDC generation time UUID ({}) present, but data missing", gen_id.id));
|
||||
}
|
||||
|
||||
@@ -2308,6 +2308,7 @@ future<> view_builder::drain() {
|
||||
vlogger.info("Draining view builder");
|
||||
_as.request_abort();
|
||||
co_await _mnotifier.unregister_listener(this);
|
||||
co_await _ops_gate.close();
|
||||
co_await _vug.drain();
|
||||
co_await _sem.wait();
|
||||
_sem.broken();
|
||||
@@ -2742,30 +2743,48 @@ void view_builder::on_create_view(const sstring& ks_name, const sstring& view_na
|
||||
}
|
||||
|
||||
// Do it in the background, serialized and broadcast from shard 0.
|
||||
static_cast<void>(dispatch_create_view(ks_name, view_name).handle_exception([ks_name, view_name] (std::exception_ptr ep) {
|
||||
static_cast<void>(with_gate(_ops_gate, [this, ks_name = ks_name, view_name = view_name] () mutable {
|
||||
return dispatch_create_view(std::move(ks_name), std::move(view_name));
|
||||
}).handle_exception([ks_name, view_name] (std::exception_ptr ep) {
|
||||
vlogger.warn("Failed to dispatch view creation {}.{}: {}", ks_name, view_name, ep);
|
||||
}));
|
||||
}
|
||||
|
||||
void view_builder::on_update_view(const sstring& ks_name, const sstring& view_name, bool) {
|
||||
future<> view_builder::dispatch_update_view(sstring ks_name, sstring view_name) {
|
||||
if (should_ignore_tablet_keyspace(_db, ks_name)) {
|
||||
return;
|
||||
co_return;
|
||||
}
|
||||
|
||||
[[maybe_unused]] auto sem_units = co_await get_or_adopt_view_builder_lock(std::nullopt);
|
||||
|
||||
auto view = view_ptr(_db.find_schema(ks_name, view_name));
|
||||
auto step_it = _base_to_build_step.find(view->view_info()->base_id());
|
||||
if (step_it == _base_to_build_step.end()) {
|
||||
co_return; // In case all the views for this CF have finished building already.
|
||||
}
|
||||
auto status_it = std::ranges::find_if(step_it->second.build_status, [view] (const view_build_status& bs) {
|
||||
return bs.view->id() == view->id();
|
||||
});
|
||||
if (status_it != step_it->second.build_status.end()) {
|
||||
status_it->view = std::move(view);
|
||||
}
|
||||
}
|
||||
|
||||
void view_builder::on_update_view(const sstring& ks_name, const sstring& view_name, bool) {
|
||||
// Do it in the background, serialized.
|
||||
(void)with_semaphore(_sem, view_builder_semaphore_units, [ks_name, view_name, this] {
|
||||
auto view = view_ptr(_db.find_schema(ks_name, view_name));
|
||||
auto step_it = _base_to_build_step.find(view->view_info()->base_id());
|
||||
if (step_it == _base_to_build_step.end()) {
|
||||
return;// In case all the views for this CF have finished building already.
|
||||
static_cast<void>(with_gate(_ops_gate, [this, ks_name = ks_name, view_name = view_name] () mutable {
|
||||
return dispatch_update_view(std::move(ks_name), std::move(view_name));
|
||||
}).handle_exception([ks_name, view_name] (std::exception_ptr ep) {
|
||||
try {
|
||||
std::rethrow_exception(ep);
|
||||
} catch (const seastar::gate_closed_exception&) {
|
||||
vlogger.warn("Ignoring gate_closed_exception during view update {}.{}", ks_name, view_name);
|
||||
} catch (const seastar::broken_named_semaphore&) {
|
||||
vlogger.warn("Ignoring broken_named_semaphore during view update {}.{}", ks_name, view_name);
|
||||
} catch (const replica::no_such_column_family&) {
|
||||
vlogger.warn("Ignoring no_such_column_family during view update {}.{}", ks_name, view_name);
|
||||
}
|
||||
auto status_it = std::ranges::find_if(step_it->second.build_status, [view] (const view_build_status& bs) {
|
||||
return bs.view->id() == view->id();
|
||||
});
|
||||
if (status_it != step_it->second.build_status.end()) {
|
||||
status_it->view = std::move(view);
|
||||
}
|
||||
}).handle_exception_type([] (replica::no_such_column_family&) { });
|
||||
}));
|
||||
}
|
||||
|
||||
future<> view_builder::dispatch_drop_view(sstring ks_name, sstring view_name) {
|
||||
@@ -2827,7 +2846,9 @@ void view_builder::on_drop_view(const sstring& ks_name, const sstring& view_name
|
||||
}
|
||||
|
||||
// Do it in the background, serialized and broadcast from shard 0.
|
||||
static_cast<void>(dispatch_drop_view(ks_name, view_name).handle_exception([ks_name, view_name] (std::exception_ptr ep) {
|
||||
static_cast<void>(with_gate(_ops_gate, [this, ks_name = ks_name, view_name = view_name] () mutable {
|
||||
return dispatch_drop_view(std::move(ks_name), std::move(view_name));
|
||||
}).handle_exception([ks_name, view_name] (std::exception_ptr ep) {
|
||||
vlogger.warn("Failed to dispatch view drop {}.{}: {}", ks_name, view_name, ep);
|
||||
}));
|
||||
}
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
|
||||
#include <seastar/core/abort_source.hh>
|
||||
#include <seastar/core/future.hh>
|
||||
#include <seastar/core/gate.hh>
|
||||
#include <seastar/core/semaphore.hh>
|
||||
#include <seastar/core/condition-variable.hh>
|
||||
#include <seastar/core/sharded.hh>
|
||||
@@ -190,6 +191,7 @@ class view_builder final : public service::migration_listener::only_view_notific
|
||||
// Guard the whole startup routine with a semaphore so that it's not intercepted by
|
||||
// `on_drop_view`, `on_create_view`, or `on_update_view` events.
|
||||
seastar::named_semaphore _sem{view_builder_semaphore_units, named_semaphore_exception_factory{"view builder"}};
|
||||
seastar::gate _ops_gate;
|
||||
seastar::abort_source _as;
|
||||
future<> _step_fiber = make_ready_future<>();
|
||||
// Used to coordinate between shards the conclusion of the build process for a particular view.
|
||||
@@ -284,6 +286,7 @@ private:
|
||||
future<> mark_as_built(view_ptr);
|
||||
void setup_metrics();
|
||||
future<> dispatch_create_view(sstring ks_name, sstring view_name);
|
||||
future<> dispatch_update_view(sstring ks_name, sstring view_name);
|
||||
future<> dispatch_drop_view(sstring ks_name, sstring view_name);
|
||||
future<> handle_seed_view_build_progress(const sstring& ks_name, const sstring& view_name);
|
||||
future<> handle_create_view_local(const sstring& ks_name, const sstring& view_name, view_builder_units_opt units);
|
||||
|
||||
2
dist/docker/redhat/README.md
vendored
@@ -12,7 +12,7 @@ Do the following in the top-level Scylla source directory:
|
||||
2. Run `ninja dist-dev` (with the same mode name as above) to prepare
|
||||
the distribution artifacts.
|
||||
|
||||
3. Run `./dist/docker/debian/build_docker.sh --mode dev`
|
||||
3. Run `./dist/docker/redhat/build_docker.sh --mode dev`
|
||||
|
||||
This creates a docker image as a **file**, in the OCI format, and prints
|
||||
its name, looking something like:
|
||||
|
||||
2
dist/docker/redhat/build_docker.sh
vendored
@@ -70,7 +70,7 @@ bcp() { buildah copy "$container" "$@"; }
|
||||
run() { buildah run "$container" "$@"; }
|
||||
bconfig() { buildah config "$@" "$container"; }
|
||||
|
||||
container="$(buildah from docker.io/redhat/ubi9-minimal:latest)"
|
||||
container="$(buildah from --pull=always docker.io/redhat/ubi9-minimal:latest)"
|
||||
|
||||
packages=(
|
||||
"build/dist/$config/redhat/RPMS/$arch/$product-$version-$release.$arch.rpm"
|
||||
|
||||
@@ -142,10 +142,6 @@ want modify a non-top-level attribute directly (e.g., a.b[3].c) need RMW:
|
||||
Alternator implements such requests by reading the entire top-level
|
||||
attribute a, modifying only a.b[3].c, and then writing back a.
|
||||
|
||||
Currently, Alternator doesn't use Tablets. That's because Alternator relies
|
||||
on LWT (lightweight transactions), and LWT is not supported in keyspaces
|
||||
with Tablets enabled.
|
||||
|
||||
```{eval-rst}
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
|
||||
@@ -213,3 +213,71 @@ Alternator table, the following features will not work for this table:
|
||||
* Enabling Streams with CreateTable or UpdateTable doesn't work
|
||||
(results in an error).
|
||||
See <https://github.com/scylladb/scylla/issues/23838>.
|
||||
|
||||
## Custom write timestamps
|
||||
|
||||
DynamoDB doesn't allow clients to set the write timestamp of updates. All
|
||||
updates use the current server time as their timestamp, and ScyllaDB uses
|
||||
these timestamps for last-write-wins conflict resolution when concurrent
|
||||
writes reach different replicas.
|
||||
|
||||
ScyllaDB Alternator extends this with the `system:timestamp_attribute` tag,
|
||||
which allows specifying a custom write timestamp for each PutItem,
|
||||
UpdateItem, DeleteItem, or BatchWriteItem request. To use this feature:
|
||||
|
||||
1. Tag the table (at CreateTable time or using TagResource) with
|
||||
`system:timestamp_attribute` set to the name of an attribute that will
|
||||
hold the custom write timestamp.
|
||||
|
||||
2. When performing a PutItem or UpdateItem, include the named attribute
|
||||
in the request with a numeric value. The value represents the write
|
||||
timestamp in **microseconds since the Unix epoch** (this is the same
|
||||
unit used internally by ScyllaDB for timestamps).
|
||||
For a DeleteItem or a BatchWriteItem DeleteRequest, include the named
|
||||
attribute in the `Key` parameter (it will be stripped from the key
|
||||
before use).
|
||||
|
||||
3. The named attribute is **not stored** in the item data - it only
|
||||
controls the write timestamp. If you also want to record the timestamp
|
||||
as data, use a separate attribute for that purpose.
|
||||
|
||||
4. If the named attribute is absent, the write proceeds normally using the
|
||||
current server time as the timestamp. If the named attribute is present
|
||||
but has a non-numeric value, the write is rejected with a ValidationException.
|
||||
|
||||
### Limitations
|
||||
|
||||
- **Incompatible with conditions**: If the write includes a ConditionExpression
|
||||
(or uses the `Expected` legacy condition), LWT is needed and the operation
|
||||
is rejected with a ValidationException, because LWT requires the write
|
||||
timestamp to be set by the Paxos protocol, not by the client.
|
||||
|
||||
- **Incompatible with `always` write isolation**: Tables using the `always`
|
||||
(or `always_use_lwt`) write isolation policy cannot use the timestamp
|
||||
attribute feature at all, because every write uses LWT in that mode.
|
||||
When using `system:timestamp_attribute`, consider tagging the table with
|
||||
`system:write_isolation=only_rmw_uses_lwt` (or `forbid_rmw`) so that
|
||||
unconditional writes do not use LWT.
|
||||
|
||||
### Example use case
|
||||
|
||||
This feature is useful for ingesting data from multiple sources where each
|
||||
record has a known logical timestamp. By setting the `system:timestamp_attribute`
|
||||
tag, you can ensure that the record with the highest logical timestamp always
|
||||
wins, regardless of ingestion order:
|
||||
|
||||
```python
|
||||
# Create table with timestamp attribute
|
||||
dynamodb.create_table(
|
||||
TableName='my_table',
|
||||
...
|
||||
Tags=[{'Key': 'system:timestamp_attribute', 'Value': 'write_ts'}]
|
||||
)
|
||||
|
||||
# Write a record with a specific timestamp (in microseconds since epoch)
|
||||
table.put_item(Item={
|
||||
'pk': 'my_key',
|
||||
'data': 'new_value',
|
||||
'write_ts': Decimal('1700000000000000'), # Nov 14, 2023 in microseconds
|
||||
})
|
||||
```
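For deletions (step 2 above), the same timestamp attribute goes into the `Key` parameter rather than the item body. Below is a minimal editorial sketch continuing the example above, using the same hypothetical `table` resource and `write_ts` tag:

```python
from decimal import Decimal

# Delete 'my_key' with an explicit write timestamp; 'write_ts' is read from
# Key and stripped before the key is used, so it never becomes part of the key.
table.delete_item(Key={
    'pk': 'my_key',
    'write_ts': Decimal('1700000001000000'),  # must exceed the stored timestamp to win
})
```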
|
||||
|
||||
@@ -187,6 +187,23 @@ You can create a keyspace with tablets enabled with the ``tablets = {'enabled':
|
||||
the keyspace schema with ``tablets = { 'enabled': false }`` or
|
||||
``tablets = { 'enabled': true }``.
|
||||
|
||||
.. _keyspace-rf-rack-valid-to-enforce-rack-list:
|
||||
|
||||
Enforcing Rack-List Replication for Tablet Keyspaces
|
||||
------------------------------------------------------------------
|
||||
|
||||
The ``rf_rack_valid_keyspaces`` option is a legacy setting that ensures that all keyspaces with tablets enabled are
:term:`RF-rack-valid <RF-rack-valid keyspace>`.
|
||||
|
||||
Requiring every tablet keyspace to use the rack-list replication factor exclusively is enough to guarantee that the keyspace is
:term:`RF-rack-valid <RF-rack-valid keyspace>`. It reduces restrictions and provides stronger guarantees compared
to the ``rf_rack_valid_keyspaces`` option.
|
||||
|
||||
To enforce rack-list replication in tablet keyspaces, use the ``enforce_rack_list`` option. It can be set only if all tablet keyspaces use
rack lists. To ensure that, follow the procedure for :ref:`conversion to rack-list replication factor <conversion-to-rack-list-rf>`.
After that, restart all nodes in the cluster with ``enforce_rack_list`` enabled and ``rf_rack_valid_keyspaces`` disabled. Make
sure to avoid setting or updating the replication factor (with CREATE KEYSPACE or ALTER KEYSPACE) while nodes are being restarted.
|
||||
|
||||
.. _tablets-limitations:
|
||||
|
||||
Limitations and Unsupported Features
|
||||
|
||||
@@ -200,8 +200,6 @@ for two cases. One is setting replication factor to 0, in which case the number
|
||||
The other is when the numeric replication factor is equal to the current number of replicas
|
||||
for a given datacenter, in which case the current rack list is preserved.
|
||||
|
||||
Altering from a numeric replication factor to a rack list is not supported yet.
|
||||
|
||||
Note that when ``ALTER`` ing keyspaces and supplying ``replication_factor``,
|
||||
auto-expansion will only *add* new datacenters for safety; it will not alter
|
||||
existing datacenters or remove any even if they are no longer in the cluster.
|
||||
@@ -424,6 +422,21 @@ Altering from a rack list to a numeric replication factor is not supported.
|
||||
|
||||
Keyspaces which use rack lists are :term:`RF-rack-valid <RF-rack-valid keyspace>` if each rack in the rack list contains at least one node (excluding :doc:`zero-token nodes </architecture/zero-token-nodes>`).
|
||||
|
||||
.. _conversion-to-rack-list-rf:
|
||||
|
||||
Conversion to rack-list replication factor
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
To migrate a keyspace from a numeric replication factor to a rack-list replication factor, provide the rack-list replication factor explicitly in the ``ALTER KEYSPACE`` statement. The number of racks in the list must equal the numeric replication factor. The replication factor can be converted in any number of DCs at once. A statement that converts the replication factor must not update (increase or decrease) the replication factor in any DC.
|
||||
|
||||
.. code-block:: cql
|
||||
|
||||
CREATE KEYSPACE Excelsior
|
||||
WITH replication = { 'class' : 'NetworkTopologyStrategy', 'dc1' : 3, 'dc2' : 1} AND tablets = { 'enabled': true };
|
||||
|
||||
ALTER KEYSPACE Excelsior
|
||||
WITH replication = { 'class' : 'NetworkTopologyStrategy', 'dc1' : ['RAC1', 'RAC2', 'RAC3'], 'dc2' : ['RAC4']} AND tablets = { 'enabled': true };
|
||||
|
||||
.. _drop-keyspace-statement:
|
||||
|
||||
DROP KEYSPACE
|
||||
|
||||
@@ -108,6 +108,4 @@ check the statement and throw if it is disallowed, similar to what
|
||||
|
||||
Obviously, an audit definition must survive a server restart and stay
|
||||
consistent among all nodes in a cluster. We'll accomplish both by
|
||||
storing audits in a system table. They will be cached in memory the
|
||||
same way `permissions_cache` caches table contents in `permission_set`
|
||||
objects resident in memory.
|
||||
storing audits in a system table.
|
||||
|
||||
@@ -39,6 +39,17 @@ Both client and server use the same string identifiers for the keys to determine
|
||||
negotiated extension set, judging by the presence of a particular key in the
|
||||
SUPPORTED/STARTUP messages.
|
||||
|
||||
## Client options
|
||||
|
||||
The `client_options` column in the `system.clients` table stores all the data sent by the
client in the STARTUP request, as a `map<text, text>`. This column may be useful
for debugging and monitoring purposes.
|
||||
|
||||
Drivers can send additional data in STARTUP, e.g. the load balancing policy, retry
policy, timeouts, and other configuration.
Such data should be sent under the `CLIENT_OPTIONS` key, as JSON. The recommended
structure of this JSON will be decided in the future.
|
||||
|
||||
## Intranode sharding
|
||||
|
||||
This extension allows the driver to discover how Scylla internally
|
||||
@@ -74,8 +85,6 @@ The keys and values are:
|
||||
as an indicator of which shard the client wants to connect to. The desired shard number
is calculated as: `desired_shard_no = client_port % SCYLLA_NR_SHARDS` (see the sketch after this list).
Its value is a decimal representation of type `uint16_t`, by default `19142`.
|
||||
- `CLIENT_OPTIONS` is a string containing a JSON object representation that
|
||||
contains CQL Driver configuration, e.g. load balancing policy, retry policy, timeouts, etc.
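To make the port-to-shard mapping referenced earlier in this list concrete, here is a minimal editorial sketch; `pick_source_port` and its ephemeral-port range are hypothetical names chosen for illustration, not part of any driver API:

```python
# Choose a client-side source port whose remainder modulo the shard count
# equals the shard we want to reach: port % SCYLLA_NR_SHARDS == desired_shard_no.
def pick_source_port(desired_shard_no: int, scylla_nr_shards: int,
                     first: int = 49152, last: int = 65535) -> int:
    for port in range(first, last + 1):
        if port % scylla_nr_shards == desired_shard_no:
            return port
    raise RuntimeError("no source port in the range maps to the desired shard")
```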
|
||||
|
||||
Currently, one `SCYLLA_SHARDING_ALGORITHM` is defined,
|
||||
`biased-token-round-robin`. To apply the algorithm,
|
||||
|
||||
@@ -563,17 +563,18 @@ CREATE TABLE system.clients (
|
||||
address inet,
|
||||
port int,
|
||||
client_type text,
|
||||
client_options frozen<map<text, text>>,
|
||||
connection_stage text,
|
||||
driver_name text,
|
||||
driver_version text,
|
||||
hostname text,
|
||||
protocol_version int,
|
||||
scheduling_group text,
|
||||
shard_id int,
|
||||
ssl_cipher_suite text,
|
||||
ssl_enabled boolean,
|
||||
ssl_protocol text,
|
||||
username text,
|
||||
scheduling_group text,
|
||||
PRIMARY KEY (address, port, client_type)
|
||||
) WITH CLUSTERING ORDER BY (port ASC, client_type ASC)
|
||||
~~~
|
||||
@@ -581,4 +582,7 @@ CREATE TABLE system.clients (
|
||||
Currently only CQL clients are tracked. The table used to be present on disk (in the data
directory) in version 4.5 and earlier.
|
||||
|
||||
The `client_options` column stores all the data sent by the client in the STARTUP request.
This column is useful for debugging and monitoring purposes.
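As an illustration (an editorial sketch assuming a node listening on 127.0.0.1 and the Python `cassandra-driver` package), the column can be inspected with a plain CQL query:

~~~python
from cassandra.cluster import Cluster

# Connect and print each tracked client's address, port, and STARTUP options.
cluster = Cluster(["127.0.0.1"])
session = cluster.connect()
for row in session.execute("SELECT address, port, client_options FROM system.clients"):
    print(row.address, row.port, row.client_options)
cluster.shutdown()
~~~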
|
||||
|
||||
## TODO: the rest
|
||||
|
||||
@@ -10,7 +10,6 @@ Install ScyllaDB |CURRENT_VERSION|
|
||||
/getting-started/install-scylla/launch-on-azure
|
||||
/getting-started/installation-common/scylla-web-installer
|
||||
/getting-started/install-scylla/install-on-linux
|
||||
/getting-started/installation-common/install-jmx
|
||||
/getting-started/install-scylla/run-in-docker
|
||||
/getting-started/installation-common/unified-installer
|
||||
/getting-started/installation-common/air-gapped-install
|
||||
@@ -24,9 +23,9 @@ Keep your versions up-to-date. The two latest versions are supported. Also, alwa
|
||||
:id: "getting-started"
|
||||
:class: my-panel
|
||||
|
||||
* :doc:`Launch ScyllaDB |CURRENT_VERSION| on AWS </getting-started/install-scylla/launch-on-aws>`
|
||||
* :doc:`Launch ScyllaDB |CURRENT_VERSION| on GCP </getting-started/install-scylla/launch-on-gcp>`
|
||||
* :doc:`Launch ScyllaDB |CURRENT_VERSION| on Azure </getting-started/install-scylla/launch-on-azure>`
|
||||
* :doc:`Launch ScyllaDB on AWS </getting-started/install-scylla/launch-on-aws>`
|
||||
* :doc:`Launch ScyllaDB on GCP </getting-started/install-scylla/launch-on-gcp>`
|
||||
* :doc:`Launch ScyllaDB on Azure </getting-started/install-scylla/launch-on-azure>`
|
||||
|
||||
|
||||
.. panel-box::
|
||||
@@ -35,8 +34,7 @@ Keep your versions up-to-date. The two latest versions are supported. Also, alwa
|
||||
:class: my-panel
|
||||
|
||||
* :doc:`Install ScyllaDB with Web Installer (recommended) </getting-started/installation-common/scylla-web-installer>`
|
||||
* :doc:`Install ScyllaDB |CURRENT_VERSION| Linux Packages </getting-started/install-scylla/install-on-linux>`
|
||||
* :doc:`Install scylla-jmx Package </getting-started/installation-common/install-jmx>`
|
||||
* :doc:`Install ScyllaDB Linux Packages </getting-started/install-scylla/install-on-linux>`
|
||||
* :doc:`Install ScyllaDB Without root Privileges </getting-started/installation-common/unified-installer>`
|
||||
* :doc:`Air-gapped Server Installation </getting-started/installation-common/air-gapped-install>`
|
||||
* :doc:`ScyllaDB Developer Mode </getting-started/installation-common/dev-mod>`
|
||||
|
||||
@@ -94,16 +94,6 @@ Install ScyllaDB
|
||||
|
||||
apt-get install scylla{,-server,-kernel-conf,-node-exporter,-conf,-python3,-cqlsh}=2025.3.1-0.20250907.2bbf3cf669bb-1
|
||||
|
||||
|
||||
#. (Ubuntu only) Set Java 11.
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y openjdk-11-jre-headless
|
||||
sudo update-java-alternatives --jre-headless -s java-1.11.0-openjdk-amd64
|
||||
|
||||
|
||||
.. group-tab:: Centos/RHEL
|
||||
|
||||
#. Install the EPEL repository.
|
||||
@@ -157,14 +147,6 @@ Install ScyllaDB
|
||||
|
||||
sudo yum install scylla-5.2.3
|
||||
|
||||
(Optional) Install scylla-jmx
|
||||
-------------------------------
|
||||
|
||||
scylla-jmx is an optional package and is not installed by default.
|
||||
If you need JMX server, see :doc:`Install scylla-jmx Package </getting-started/installation-common/install-jmx>`.
|
||||
|
||||
|
||||
|
||||
.. include:: /getting-started/_common/setup-after-install.rst
|
||||
|
||||
Next Steps
|
||||
|
||||
@@ -1,78 +0,0 @@
|
||||
|
||||
======================================
|
||||
Install scylla-jmx Package
|
||||
======================================
|
||||
|
||||
scylla-jmx is an optional package and is not installed by default.
|
||||
If you need JMX server, you can still install it from scylla-jmx GitHub page.
|
||||
|
||||
.. tabs::
|
||||
|
||||
.. group-tab:: Debian/Ubuntu
|
||||
#. Download .deb package from scylla-jmx page.
|
||||
|
||||
Access to https://github.com/scylladb/scylla-jmx, select latest
|
||||
release from "releases", download a file end with ".deb".
|
||||
|
||||
#. (Optional) Transfer the downloaded package to the install node.
|
||||
|
||||
If the pc from which you downloaded the package is different from
|
||||
the node where you install scylladb, you will need to transfer
|
||||
the files to the node.
|
||||
|
||||
#. Install scylla-jmx package.
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
sudo apt install -y ./scylla-jmx_<version>_all.deb
|
||||
|
||||
|
||||
.. group-tab:: Centos/RHEL
|
||||
|
||||
#. Download .rpm package from scylla-jmx page.
|
||||
|
||||
Access to https://github.com/scylladb/scylla-jmx, select latest
|
||||
release from "releases", download a file end with ".rpm".
|
||||
|
||||
#. (Optional) Transfer the downloaded package to the install node.
|
||||
|
||||
If the pc from which you downloaded the package is different from
|
||||
the node where you install scylladb, you will need to transfer
|
||||
the files to the node.
|
||||
|
||||
#. Install scylla-jmx package.
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
sudo yum install -y ./scylla-jmx-<version>.noarch.rpm
|
||||
|
||||
|
||||
.. group-tab:: Install without root privileges
|
||||
|
||||
#. Download .tar.gz package from scylla-jmx page.
|
||||
|
||||
Access to https://github.com/scylladb/scylla-jmx, select latest
|
||||
release from "releases", download a file end with ".tar.gz".
|
||||
|
||||
#. (Optional) Transfer the downloaded package to the install node.
|
||||
|
||||
If the computer from which you downloaded the package is different from
|
||||
the node where you will install ScyllaDB, you will need to transfer
|
||||
the file to that node.
|
||||
|
||||
#. Install scylla-jmx package.
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
tar xpf scylla-jmx-<version>.noarch.tar.gz
|
||||
cd scylla-jmx
|
||||
./install.sh --nonroot
|
||||
|
||||
Next Steps
|
||||
-----------
|
||||
|
||||
* :doc:`Configure ScyllaDB </getting-started/system-configuration>`
|
||||
* Manage your clusters with `ScyllaDB Manager <https://manager.docs.scylladb.com/>`_
|
||||
* Monitor your cluster and data with `ScyllaDB Monitoring <https://monitoring.docs.scylladb.com/>`_
|
||||
* Get familiar with ScyllaDB’s :doc:`command line reference guide </operating-scylla/nodetool>`.
|
||||
* Learn about ScyllaDB at `ScyllaDB University <https://university.scylladb.com/>`_
|
||||
@@ -49,11 +49,6 @@ Download and Install
|
||||
|
||||
./install.sh --nonroot --python3 ~/scylladb/python3/bin/python3
|
||||
|
||||
#. (Optional) Install scylla-jmx
|
||||
|
||||
scylla-jmx is an optional package and is not installed by default.
|
||||
If you need the JMX server, see :doc:`Install scylla-jmx Package </getting-started/installation-common/install-jmx>`.
|
||||
|
||||
Configure and Run ScyllaDB
|
||||
----------------------------
|
||||
|
||||
|
||||
@@ -1,43 +0,0 @@
|
||||
====================================================
|
||||
Increase Permission Cache to Avoid Non-paged Queries
|
||||
====================================================
|
||||
|
||||
**Topic: Mitigating non-paged queries caused by connection authentication**
|
||||
|
||||
**Audience: ScyllaDB administrators**
|
||||
|
||||
|
||||
|
||||
Issue
|
||||
-----
|
||||
|
||||
If you create many roles and grant them many permissions, your nodes might see a spike in non-paged queries.
|
||||
|
||||
Root Cause
|
||||
----------
|
||||
|
||||
``permissions_cache_max_entries`` is set to 1000 by default. This setting may not be high enough for larger deployments with many tables, users, and roles with permissions.
|
||||
|
||||
|
||||
Solution
|
||||
--------
|
||||
|
||||
Open the scylla.yaml configuration for editing and adjust the following parameters:
|
||||
``permissions_cache_max_entries`` - increase this value to suit your needs. See the example below.
|
||||
``permissions_update_interval_in_ms``
|
||||
``permissions_validity_in_ms``
|
||||
|
||||
.. note:: ``permissions_update_interval_in_ms`` and ``permissions_validity_in_ms`` can also be tuned so that authentication records are served from the cache instead of lookups, which generate non-paged queries.
|
||||
|
||||
|
||||
Example
|
||||
-------
|
||||
|
||||
Note that ``permissions_cache_max_entries`` has no maximum value; it is limited only by the available memory.
|
||||
The cache consumes memory because it stores all records from the list of users and their associated roles (similar to a Cartesian product).
|
||||
|
||||
Every combination of user, role, and permission (7 types) is cached on a per-table basis.
|
||||
|
||||
For example, if you have 1 user with 1 role and 1 table, the table has 7 permission types and therefore 7 cache entries: 1 * 1 * 1 * 7 = 7.
|
||||
Expanded to 5 users, 5 roles, and 10 tables, this becomes 5 * 5 * 10 * 7 = 1750 entries, which exceeds the default cache size of 1000. The 750 entries that do not fit in the cache will result in non-paged queries for every new connection from the client (and clients tend to reconnect often).
|
||||
In cases like this, you may want to consider trading your memory for not stressing the entire cluster with ``auth`` queries.
|
||||
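A minimal ``scylla.yaml`` sketch for such a case is shown below. The values are illustrative assumptions, not recommendations; size the cache for your own user, role, and table counts:

.. code-block:: shell

   # illustrative values only - large enough for the 1750-entry example above
   permissions_cache_max_entries: 2000
   permissions_validity_in_ms: 60000
   permissions_update_interval_in_ms: 20000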
@@ -38,7 +38,6 @@ Knowledge Base
|
||||
* :doc:`If a query does not reveal enough results </kb/cqlsh-results>`
|
||||
* :doc:`How to Change gc_grace_seconds for a Table </kb/gc-grace-seconds>` - How to change the ``gc_grace_seconds`` parameter and prevent data resurrection.
|
||||
* :doc:`How to flush old tombstones from a table </kb/tombstones-flush>` - How to remove old tombstones from SSTables.
|
||||
* :doc:`Increase Cache to Avoid Non-paged Queries </kb/increase-permission-cache>` - How to increase the ``permissions_cache_max_entries`` setting.
|
||||
* :doc:`How to Safely Increase the Replication Factor </kb/rf-increase>`
|
||||
* :doc:`Facts about TTL, Compaction, and gc_grace_seconds <ttl-facts>`
|
||||
* :doc:`Efficient Tombstone Garbage Collection in ICS <garbage-collection-ics>`
|
||||
|
||||
@@ -25,7 +25,8 @@ Before you run ``nodetool decommission``:
|
||||
starting the removal procedure.
|
||||
* Make sure that the number of nodes remaining in the DC after you decommission a node
|
||||
will be the same or higher than the Replication Factor configured for the keyspace
|
||||
in this DC. If the number of remaining nodes is lower than the RF, the decommission
|
||||
in this DC. Note that the audit feature, which is enabled by default, may require
|
||||
adjusting the ``audit`` keyspace. If the number of remaining nodes is lower than the RF, the decommission
|
||||
request may fail.
|
||||
In such a case, ALTER the keyspace to reduce the RF before running ``nodetool decommission``.
|
||||
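A minimal sketch of such an ALTER is shown below. The keyspace name, DC name, and replication factor are placeholders; the new RF must not be higher than the number of nodes that will remain in the DC:

.. code-block:: shell

   cqlsh> ALTER KEYSPACE <keyspace_name> WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', '<dc_name>' : <new_rf>};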
|
||||
|
||||
@@ -25,4 +25,8 @@ For Example:
|
||||
|
||||
nodetool rebuild <source-dc-name>
|
||||
|
||||
The ``nodetool rebuild`` command works only for vnode keyspaces. For tablet keyspaces, use ``nodetool cluster repair`` instead.
|
||||
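A minimal sketch of the tablet alternative (per the data center removal procedure, the command is run on a single, arbitrarily chosen node):

.. code-block:: console

   nodetool cluster repair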
|
||||
See :doc:`Data Distribution with Tablets </architecture/tablets/>`.
|
||||
|
||||
.. include:: nodetool-index.rst
|
||||
|
||||
@@ -155,7 +155,6 @@ Add New DC
|
||||
UN 54.235.9.159 109.75 KB 256 ? 39798227-9f6f-4868-8193-08570856c09a RACK1
|
||||
UN 54.146.228.25 128.33 KB 256 ? 7a4957a1-9590-4434-9746-9c8a6f796a0c RACK1
|
||||
|
||||
.. TODO possibly provide additional information WRT how ALTER works with tablets
|
||||
|
||||
#. When all nodes are up and running, ``ALTER`` the following keyspaces on the new nodes:
|
||||
|
||||
@@ -171,26 +170,68 @@ Add New DC
|
||||
|
||||
DESCRIBE KEYSPACE mykeyspace;
|
||||
|
||||
CREATE KEYSPACE mykeyspace WITH replication = { 'class' : 'NetworkTopologyStrategy', '<exiting_dc>' : 3};
|
||||
CREATE KEYSPACE mykeyspace WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3};
|
||||
|
||||
ALTER Command
|
||||
|
||||
.. code-block:: cql
|
||||
|
||||
ALTER KEYSPACE mykeyspace WITH replication = { 'class' : 'NetworkTopologyStrategy', '<exiting_dc>' : 3, <new_dc> : 3};
|
||||
ALTER KEYSPACE system_distributed WITH replication = { 'class' : 'NetworkTopologyStrategy', '<exiting_dc>' : 3, <new_dc> : 3};
|
||||
ALTER KEYSPACE system_traces WITH replication = { 'class' : 'NetworkTopologyStrategy', '<exiting_dc>' : 3, <new_dc> : 3};
|
||||
ALTER KEYSPACE mykeyspace WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3};
|
||||
ALTER KEYSPACE system_distributed WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3};
|
||||
ALTER KEYSPACE system_traces WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3};
|
||||
|
||||
After
|
||||
|
||||
.. code-block:: cql
|
||||
|
||||
DESCRIBE KEYSPACE mykeyspace;
|
||||
CREATE KEYSPACE mykeyspace WITH REPLICATION = {'class’: 'NetworkTopologyStrategy', <exiting_dc>:3, <new_dc>: 3};
|
||||
CREATE KEYSPACE system_distributed WITH replication = { 'class' : 'NetworkTopologyStrategy', '<exiting_dc>' : 3, <new_dc> : 3};
|
||||
CREATE KEYSPACE system_traces WITH replication = { 'class' : 'NetworkTopologyStrategy', '<exiting_dc>' : 3, <new_dc> : 3};
|
||||
CREATE KEYSPACE mykeyspace WITH REPLICATION = {'class': 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3};
|
||||
CREATE KEYSPACE system_distributed WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3};
|
||||
CREATE KEYSPACE system_traces WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3};
|
||||
|
||||
#. Run ``nodetool rebuild`` on each node in the new datacenter, specify the existing datacenter name in the rebuild command.
|
||||
For tablet keyspaces, update the replication factor one by one:
|
||||
|
||||
.. code-block:: cql
|
||||
|
||||
DESCRIBE KEYSPACE mykeyspace2;
|
||||
|
||||
CREATE KEYSPACE mykeyspace2 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3} AND tablets = { 'enabled': true };
|
||||
|
||||
.. code-block:: cql
|
||||
|
||||
ALTER KEYSPACE mykeyspace2 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 1} AND tablets = { 'enabled': true };
|
||||
ALTER KEYSPACE mykeyspace2 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 2} AND tablets = { 'enabled': true };
|
||||
ALTER KEYSPACE mykeyspace2 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : 3, '<new_dc>' : 3} AND tablets = { 'enabled': true };
|
||||
|
||||
.. note::
|
||||
If the ``rf_rack_valid_keyspaces`` option is set, a tablet keyspace needs to use a rack-list replication factor so that a new DC (rack) can be added. See :ref:`the conversion procedure <conversion-to-rack-list-rf>`. In this case, to add a datacenter:
|
||||
|
||||
Before
|
||||
|
||||
.. code-block:: cql
|
||||
|
||||
DESCRIBE KEYSPACE mykeyspace3;
|
||||
|
||||
CREATE KEYSPACE mykeyspace3 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>']} AND tablets = { 'enabled': true };
|
||||
|
||||
Add all the nodes to the new datacenter and then alter the keyspace, adding one rack at a time:
|
||||
|
||||
.. code-block:: cql
|
||||
|
||||
ALTER KEYSPACE mykeyspace3 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>'], '<new_dc>' : ['<new_rack1>']} AND tablets = { 'enabled': true };
|
||||
ALTER KEYSPACE mykeyspace3 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>'], '<new_dc>' : ['<new_rack1>', '<new_rack2>']} AND tablets = { 'enabled': true };
|
||||
ALTER KEYSPACE mykeyspace3 WITH replication = { 'class' : 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>'], '<new_dc>' : ['<new_rack1>', '<new_rack2>', '<new_rack3>']} AND tablets = { 'enabled': true };
|
||||
|
||||
After
|
||||
|
||||
.. code-block:: cql
|
||||
|
||||
DESCRIBE KEYSPACE mykeyspace3;
|
||||
CREATE KEYSPACE mykeyspace3 WITH REPLICATION = {'class': 'NetworkTopologyStrategy', '<existing_dc>' : ['<existing_rack1>', '<existing_rack2>', '<existing_rack3>'], '<new_dc>' : ['<new_rack1>', '<new_rack2>', '<new_rack3>']} AND tablets = { 'enabled': true };
|
||||
|
||||
Consider :ref:`upgrading rf_rack_valid_keyspaces option to enforce_rack_list option <keyspace-rf-rack-valid-to-enforce-rack-list>` to ensure all tablet keyspaces use rack lists.
|
||||
|
||||
#. If any vnode keyspace was altered, run ``nodetool rebuild`` on each node in the new datacenter, specifying the existing datacenter name in the rebuild command.
|
||||
|
||||
For example:
|
||||
|
||||
@@ -198,7 +239,7 @@ Add New DC
|
||||
|
||||
The rebuild ensures that the new nodes that were just added to the cluster will recognize the existing datacenters in the cluster.
|
||||
|
||||
#. Run a full cluster repair, using :doc:`nodetool repair -pr </operating-scylla/nodetool-commands/repair>` on each node, or using `ScyllaDB Manager ad-hoc repair <https://manager.docs.scylladb.com/stable/repair>`_
|
||||
#. If any vnode keyspace was altered, run a full cluster repair, using :doc:`nodetool repair -pr </operating-scylla/nodetool-commands/repair>` on each node, or using `ScyllaDB Manager ad-hoc repair <https://manager.docs.scylladb.com/stable/repair>`_
|
||||
|
||||
#. If you are using ScyllaDB Monitoring, update the `monitoring stack <https://monitoring.docs.scylladb.com/stable/install/monitoring_stack.html#configure-scylla-nodes-from-files>`_ to monitor it. If you are using ScyllaDB Manager, make sure you install the `Manager Agent <https://manager.docs.scylladb.com/stable/install-scylla-manager-agent.html>`_ and Manager can access the new DC.
|
||||
|
||||
|
||||
@@ -40,12 +40,14 @@ Prerequisites
|
||||
Procedure
|
||||
---------
|
||||
|
||||
#. Run the ``nodetool repair -pr`` command on each node in the data-center that is going to be decommissioned. This will verify that all the data is in sync between the decommissioned data-center and the other data-centers in the cluster.
|
||||
#. If there are vnode keyspaces in this DC, run the ``nodetool repair -pr`` command on each node in the data-center that is going to be decommissioned. This will verify that all the data is in sync between the decommissioned data-center and the other data-centers in the cluster.
|
||||
|
||||
For example:
|
||||
|
||||
If the ASIA-DC is to be removed, run the ``nodetool repair -pr`` command on all the nodes in ASIA-DC.
|
||||
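A minimal sketch of that step:

.. code-block:: console

   # run on every node in ASIA-DC
   nodetool repair -pr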
|
||||
#. If there are tablet keyspaces in this DC, run ``nodetool cluster repair`` on an arbitrary node. The repair ensures that any updates stored only on the about-to-be-decommissioned replicas are propagated to the other replicas before the replicas in the decommissioned datacenter are dropped.
|
||||
|
||||
#. ALTER every cluster KEYSPACE, so that the keyspaces will no longer replicate data to the decommissioned data-center.
|
||||
|
||||
For example:
|
||||
@@ -73,6 +75,44 @@ Procedure
|
||||
|
||||
cqlsh> ALTER KEYSPACE nba WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : 3, 'ASIA-DC' : 0, 'EUROPE-DC' : 3};
|
||||
|
||||
For tablet keyspaces, update the replication factor one by one:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
cqlsh> DESCRIBE nba2
|
||||
cqlsh> CREATE KEYSPACE nba2 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : 3, 'ASIA-DC' : 2, 'EUROPE-DC' : 3} AND tablets = { 'enabled': true };
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
cqlsh> ALTER KEYSPACE nba2 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : 3, 'ASIA-DC' : 1, 'EUROPE-DC' : 3} AND tablets = { 'enabled': true };
|
||||
cqlsh> ALTER KEYSPACE nba2 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : 3, 'ASIA-DC' : 0, 'EUROPE-DC' : 3} AND tablets = { 'enabled': true };
|
||||
|
||||
.. note::
|
||||
If the ``rf_rack_valid_keyspaces`` option is set, a tablet keyspace needs to use a rack-list replication factor so that the DC can be removed. See :ref:`the conversion procedure <conversion-to-rack-list-rf>`. In this case, to remove a datacenter:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
cqlsh> DESCRIBE nba3
|
||||
cqlsh> CREATE KEYSPACE nba3 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : ['RAC1', 'RAC2', 'RAC3'], 'ASIA-DC' : ['RAC4', 'RAC5'], 'EUROPE-DC' : ['RAC6', 'RAC7', 'RAC8']} AND tablets = { 'enabled': true };
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
cqlsh> ALTER KEYSPACE nba3 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : ['RAC1', 'RAC2', 'RAC3'], 'ASIA-DC' : ['RAC4'], 'EUROPE-DC' : ['RAC6', 'RAC7', 'RAC8']} AND tablets = { 'enabled': true };
|
||||
cqlsh> ALTER KEYSPACE nba3 WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : ['RAC1', 'RAC2', 'RAC3'], 'ASIA-DC' : [], 'EUROPE-DC' : ['RAC6', 'RAC7', 'RAC8']} AND tablets = { 'enabled': true };
|
||||
|
||||
Consider :ref:`upgrading rf_rack_valid_keyspaces option to enforce_rack_list option <keyspace-rf-rack-valid-to-enforce-rack-list>` to ensure all tablet keyspaces use rack lists.
|
||||
|
||||
.. note::
|
||||
|
||||
If table audit is enabled, the ``audit`` keyspace is automatically created with ``NetworkTopologyStrategy``.
|
||||
You must also alter the ``audit`` keyspace to remove replicas from the decommissioned data-center. For example:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
cqlsh> ALTER KEYSPACE audit WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'US-DC' : 3, 'ASIA-DC' : 0, 'EUROPE-DC' : 3};
|
||||
|
||||
Failure to do so will result in decommission errors such as "zero replica after the removal".
|
||||
|
||||
#. Run :doc:`nodetool decommission </operating-scylla/nodetool-commands/decommission>` on every node in the data center that is to be removed.
|
||||
Refer to :doc:`Remove a Node from a ScyllaDB Cluster - Down Scale </operating-scylla/procedures/cluster-management/remove-node>` for further information.
|
||||
|
||||
|
||||
@@ -14,11 +14,11 @@ Enable ScyllaDB :doc:`Authentication </operating-scylla/security/authentication>
|
||||
Enabling Audit
|
||||
---------------
|
||||
|
||||
By default, auditing is **enabled**. Enabling auditing is controlled by the ``audit:`` parameter in the ``scylla.yaml`` file.
|
||||
By default, table auditing is **enabled**. Enabling auditing is controlled by the ``audit:`` parameter in the ``scylla.yaml`` file.
|
||||
You can set the following options:
|
||||
|
||||
* ``none`` - Audit is disabled (default).
|
||||
* ``table`` - Audit is enabled, and messages are stored in a Scylla table.
|
||||
* ``none`` - Audit is disabled.
|
||||
* ``table`` - Audit is enabled, and messages are stored in a Scylla table (default).
|
||||
* ``syslog`` - Audit is enabled, and messages are sent to Syslog.
|
||||
* ``syslog,table`` - Audit is enabled, and messages are stored in a Scylla table and sent to Syslog.
|
||||
|
||||
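For example, to enable both backends, the ``scylla.yaml`` entry would look like the following sketch (only the ``audit`` line is shown; the remaining audit settings keep their defaults):

.. code-block:: shell

   audit: "syslog,table"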
@@ -32,7 +32,7 @@ The audit can be tuned using the following flags or ``scylla.yaml`` entries:
|
||||
================== ================================== ========================================================================================================================
|
||||
Flag Default Value Description
|
||||
================== ================================== ========================================================================================================================
|
||||
audit_categories "DCL,DDL,AUTH,ADMIN" Comma-separated list of statement categories that should be audited
|
||||
audit_categories "DCL,AUTH,ADMIN" Comma-separated list of statement categories that should be audited
|
||||
------------------ ---------------------------------- ------------------------------------------------------------------------------------------------------------------------
|
||||
audit_tables “” Comma-separated list of table names that should be audited, in the format of <keyspacename>.<tablename>
|
||||
------------------ ---------------------------------- ------------------------------------------------------------------------------------------------------------------------
|
||||
@@ -86,9 +86,7 @@ Storing Audit Messages in Syslog
|
||||
.. code-block:: shell
|
||||
|
||||
# audit setting
|
||||
# by default, Scylla does not audit anything.
|
||||
# It is possible to enable auditing to the following places:
|
||||
# - audit.audit_log column family by setting the flag to "table"
|
||||
# 'audit' config option controls if and where to output audited events:
|
||||
audit: "syslog"
|
||||
#
|
||||
# List of statement categories that should be audited.
|
||||
@@ -159,9 +157,7 @@ For example:
|
||||
.. code-block:: shell
|
||||
|
||||
# audit setting
|
||||
# by default, Scylla does not audit anything.
|
||||
# It is possible to enable auditing to the following places:
|
||||
# - audit.audit_log column family by setting the flag to "table"
|
||||
# 'audit' config option controls if and where to output audited events:
|
||||
audit: "table"
|
||||
#
|
||||
# List of statement categories that should be audited.
|
||||
@@ -215,8 +211,8 @@ Handling Audit Failures
|
||||
|
||||
In some cases, auditing may not be possible, for example, when:
|
||||
|
||||
* A table is used as the audit’s backend, and the audit partition where the audit row is saved is not available because the node that holds this partition is down.
|
||||
* Syslog is used as the audit’s backend, and the Syslog sink (a regular unix socket) is unresponsive/unavailable.
|
||||
* A table is used as the audit’s backend, and the partitions where the audit rows are saved are unavailable because the nodes holding those partitions are down or unreachable due to network issues.
|
||||
* Syslog is used as the audit’s backend, and the Syslog sink (a regular Unix socket) is unresponsive or unavailable.
|
||||
|
||||
If the audit fails and audit messages are not stored in the configured audit’s backend, you can still review the audit log in the regular ScyllaDB logs.
|
||||
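For instance, on a systemd-based installation you could search the node's regular log for audit-related messages with something like the following sketch (the ``scylla-server`` unit name is assumed here, as is the exact wording of the log lines):

.. code-block:: console

   journalctl -u scylla-server | grep -i audit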
|
||||
|
||||
@@ -199,9 +199,6 @@ You should take note of the current version in case you want to |ROLLBACK|_ the
|
||||
#. Run ``sudo /opt/scylladb/scylla-machine-image/scylla_cloud_io_setup``.
|
||||
|
||||
|
||||
If you need JMX server, see
|
||||
:doc:`Install scylla-jmx Package </getting-started/installation-common/install-jmx>`
|
||||
and get new version.
|
||||
|
||||
Start the node
|
||||
--------------
|
||||
|
||||
@@ -284,6 +284,7 @@ future<rjson::value> encryption::gcp_host::impl::gcp_auth_post_with_retry(std::s
|
||||
}
|
||||
[[fallthrough]];
|
||||
case httpclient::reply_status::request_timeout:
|
||||
case httpclient::reply_status::too_many_requests:
|
||||
if (retry < max_retries) {
|
||||
// service unavailable etc -> backoff + retry
|
||||
do_backoff = true;
|
||||
|
||||
@@ -23,11 +23,11 @@ static_assert(std::is_nothrow_move_constructible_v<gms::inet_address>);
|
||||
|
||||
future<gms::inet_address> gms::inet_address::lookup(sstring name, opt_family family, opt_family preferred) {
|
||||
return seastar::net::dns::get_host_by_name(std::move(name), family).then([preferred](seastar::net::hostent&& h) {
|
||||
for (auto& addr : h.addr_list) {
|
||||
if (!preferred || addr.in_family() == preferred) {
|
||||
return gms::inet_address(addr);
|
||||
for (auto& ent : h.addr_entries) {
|
||||
if (!preferred || ent.addr.in_family() == preferred) {
|
||||
return gms::inet_address(ent.addr);
|
||||
}
|
||||
}
|
||||
return gms::inet_address(h.addr_list.front());
|
||||
return gms::inet_address(h.addr_entries.front().addr);
|
||||
});
|
||||
}
|
||||
|
||||
lang/lua.cc
@@ -8,6 +8,7 @@
|
||||
|
||||
#include <boost/date_time/gregorian/greg_date.hpp>
|
||||
#include <boost/date_time/posix_time/posix_time.hpp>
|
||||
#include <random>
|
||||
#include "lua.hh"
|
||||
#include "lang/lua_scylla_types.hh"
|
||||
#include "exceptions/exceptions.hh"
|
||||
@@ -28,6 +29,14 @@
|
||||
# define LUA_504_PLUS(x...)
|
||||
#endif
|
||||
|
||||
// Lua 5.5 added a seed parameter to lua_newstate
|
||||
|
||||
#if LUA_VERSION_NUM >= 505
|
||||
# define LUA_505_PLUS(x...) x
|
||||
#else
|
||||
# define LUA_505_PLUS(x...)
|
||||
#endif
|
||||
|
||||
using namespace seastar;
|
||||
using namespace lua;
|
||||
|
||||
@@ -126,7 +135,11 @@ static void debug_hook(lua_State* l, lua_Debug* ar) {
|
||||
|
||||
static lua_slice_state new_lua(const lua::runtime_config& cfg) {
|
||||
auto a_state = std::make_unique<alloc_state>(cfg.max_bytes, cfg.max_contiguous);
|
||||
std::unique_ptr<lua_State, lua_closer> l{lua_newstate(lua_alloc, a_state.get())};
|
||||
#if LUA_VERSION_NUM >= 505
|
||||
static thread_local std::default_random_engine rng{std::random_device{}()};
|
||||
auto seed = rng();
|
||||
#endif
|
||||
std::unique_ptr<lua_State, lua_closer> l{lua_newstate(lua_alloc, a_state.get() LUA_505_PLUS(, seed))};
|
||||
if (!l) {
|
||||
throw std::runtime_error("could not create lua state");
|
||||
}
|
||||
@@ -270,17 +283,6 @@ concept CanHandleLuaTypes = requires(Func f) {
|
||||
{ f(*static_cast<const lua_table*>(nullptr)) } -> std::same_as<lua_visit_ret_type<Func>>;
|
||||
};
|
||||
|
||||
// This is used to test if a double fits in a long long, so
|
||||
// we expect overflows. Prevent the sanitizer from complaining.
|
||||
#ifdef __clang__
|
||||
[[clang::no_sanitize("undefined")]]
|
||||
#endif
|
||||
static
|
||||
long long
|
||||
cast_to_long_long_allow_overflow(double v) {
|
||||
return (long long)v;
|
||||
}
|
||||
|
||||
template <typename Func>
|
||||
requires CanHandleLuaTypes<Func>
|
||||
static auto visit_lua_value(lua_State* l, int index, Func&& f) {
|
||||
@@ -291,9 +293,10 @@ static auto visit_lua_value(lua_State* l, int index, Func&& f) {
|
||||
auto operator()(const long long& v) { return f(utils::multiprecision_int(v)); }
|
||||
auto operator()(const utils::multiprecision_int& v) { return f(v); }
|
||||
auto operator()(const double& v) {
|
||||
long long v2 = cast_to_long_long_allow_overflow(v);
|
||||
if (v2 == v) {
|
||||
return (*this)(v2);
|
||||
auto min = double(std::numeric_limits<long long>::min());
|
||||
auto max = double(std::numeric_limits<long long>::max());
|
||||
if (min <= v && v <= max && std::trunc(v) == v) {
|
||||
return (*this)((long long)v);
|
||||
}
|
||||
// FIXME: We could use frexp to produce a decimal instead of a double
|
||||
return f(v);
|
||||
|
||||
@@ -616,12 +616,16 @@ tablet_replica tablet_map::get_primary_replica(tablet_id id, const locator::topo
|
||||
return maybe_get_primary_replica(id, replicas, topo, [&] (const auto& _) { return true; }).value();
|
||||
}
|
||||
|
||||
tablet_replica tablet_map::get_secondary_replica(tablet_id id) const {
|
||||
if (get_tablet_info(id).replicas.size() < 2) {
|
||||
tablet_replica tablet_map::get_secondary_replica(tablet_id id, const locator::topology& topo) const {
|
||||
const auto& orig_replicas = get_tablet_info(id).replicas;
|
||||
if (orig_replicas.size() < 2) {
|
||||
throw std::runtime_error(format("No secondary replica for tablet id {}", id));
|
||||
}
|
||||
const auto& replicas = get_tablet_info(id).replicas;
|
||||
return replicas.at((size_t(id)+1) % replicas.size());
|
||||
tablet_replica_set replicas = orig_replicas;
|
||||
std::ranges::sort(replicas, tablet_replica_comparator(topo));
|
||||
// This formula must match the one in get_primary_replica(),
|
||||
// just with + 1.
|
||||
return replicas.at((size_t(id) + size_t(id) / replicas.size() + 1) % replicas.size());
|
||||
}
|
||||
|
||||
std::optional<tablet_replica> tablet_map::maybe_get_selected_replica(tablet_id id, const topology& topo, const tablet_task_info& tablet_task_info) const {
|
||||
|
||||
@@ -648,9 +648,10 @@ public:
|
||||
/// Returns the primary replica for the tablet
|
||||
tablet_replica get_primary_replica(tablet_id id, const locator::topology& topo) const;
|
||||
|
||||
/// Returns the secondary replica for the tablet, which is assumed to be directly following the primary replica in the replicas vector
|
||||
/// Returns the secondary replica for the tablet: the replica that immediately follows the primary
|
||||
/// replica in the topology-sorted replica list.
|
||||
/// \throws std::runtime_error if the tablet has less than 2 replicas.
|
||||
tablet_replica get_secondary_replica(tablet_id id) const;
|
||||
tablet_replica get_secondary_replica(tablet_id id, const locator::topology& topo) const;
|
||||
|
||||
// Returns the replica that matches hosts and dcs filters for tablet_task_info.
|
||||
std::optional<tablet_replica> maybe_get_selected_replica(tablet_id id, const topology& topo, const tablet_task_info& tablet_task_info) const;
|
||||
|
||||
@@ -1170,6 +1170,17 @@ token_metadata::set_version_tracker(version_tracker_t tracker) {
|
||||
_impl->set_version_tracker(std::move(tracker));
|
||||
}
|
||||
|
||||
version_tracker::version_tracker(utils::phased_barrier::operation op, const token_metadata& tm)
|
||||
: _op(std::move(op))
|
||||
, _version(tm.get_version())
|
||||
, _tm(&tm)
|
||||
{
|
||||
}
|
||||
|
||||
long version_tracker::version_use_count() const {
|
||||
return _tm->use_count();
|
||||
}
|
||||
|
||||
version_tracker::~version_tracker() {
|
||||
if (_expired_at) {
|
||||
auto now = std::chrono::steady_clock::now();
|
||||
@@ -1181,8 +1192,8 @@ version_tracker::~version_tracker() {
|
||||
}
|
||||
}
|
||||
|
||||
version_tracker shared_token_metadata::new_tracker(token_metadata::version_t version) {
|
||||
auto tracker = version_tracker(_versions_barrier.start(), version);
|
||||
version_tracker shared_token_metadata::new_tracker(const token_metadata& tm) {
|
||||
auto tracker = version_tracker(_versions_barrier.start(), tm);
|
||||
_trackers.push_front(tracker);
|
||||
return tracker;
|
||||
}
|
||||
@@ -1198,6 +1209,18 @@ void shared_token_metadata::clear_and_dispose(std::unique_ptr<token_metadata_imp
|
||||
}
|
||||
}
|
||||
|
||||
std::unordered_map<service::topology::version_t, int> shared_token_metadata::describe_stale_versions() {
|
||||
std::unordered_map<service::topology::version_t, int> result;
|
||||
const auto active_version = _shared.get()->get_version();
|
||||
for (const auto& t: _trackers) {
|
||||
const auto v = t.version();
|
||||
if (v < active_version) {
|
||||
result.emplace(v, t.version_use_count());
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
void shared_token_metadata::set(mutable_token_metadata_ptr tmptr) noexcept {
|
||||
if (_shared->get_ring_version() >= tmptr->get_ring_version()) {
|
||||
on_internal_error(tlogger, format("shared_token_metadata: must not set non-increasing ring_version: {} -> {}", _shared->get_ring_version(), tmptr->get_ring_version()));
|
||||
@@ -1211,7 +1234,7 @@ void shared_token_metadata::set(mutable_token_metadata_ptr tmptr) noexcept {
|
||||
|
||||
tmptr->set_shared_token_metadata(*this);
|
||||
_shared = std::move(tmptr);
|
||||
_shared->set_version_tracker(new_tracker(_shared->get_version()));
|
||||
_shared->set_version_tracker(new_tracker(*_shared));
|
||||
|
||||
for (auto&& v : _trackers) {
|
||||
if (v.version() != _shared->get_version()) {
|
||||
|
||||
@@ -112,6 +112,7 @@ public:
|
||||
private:
|
||||
utils::phased_barrier::operation _op;
|
||||
service::topology::version_t _version;
|
||||
const token_metadata* _tm = nullptr;
|
||||
link_type _link;
|
||||
|
||||
// When engaged it means the version is no longer latest and should be released soon as to
|
||||
@@ -120,8 +121,7 @@ private:
|
||||
std::chrono::steady_clock::duration _log_threshold;
|
||||
public:
|
||||
version_tracker() = default;
|
||||
version_tracker(utils::phased_barrier::operation op, service::topology::version_t version)
|
||||
: _op(std::move(op)), _version(version) {}
|
||||
version_tracker(utils::phased_barrier::operation op, const token_metadata& tm);
|
||||
version_tracker(version_tracker&&) noexcept = default;
|
||||
version_tracker& operator=(version_tracker&& o) noexcept {
|
||||
if (this != &o) {
|
||||
@@ -137,6 +137,8 @@ public:
|
||||
return _version;
|
||||
}
|
||||
|
||||
long version_use_count() const;
|
||||
|
||||
void mark_expired(std::chrono::steady_clock::duration log_threshold) {
|
||||
if (!_expired_at) {
|
||||
_expired_at = std::chrono::steady_clock::now();
|
||||
@@ -172,7 +174,7 @@ private:
|
||||
friend class token_metadata_impl;
|
||||
};
|
||||
|
||||
class token_metadata final {
|
||||
class token_metadata final: public enable_lw_shared_from_this<token_metadata> {
|
||||
shared_token_metadata* _shared_token_metadata = nullptr;
|
||||
std::unique_ptr<token_metadata_impl> _impl;
|
||||
private:
|
||||
@@ -410,7 +412,7 @@ class shared_token_metadata : public peering_sharded_service<shared_token_metada
|
||||
boost::intrusive::constant_time_size<false>>;
|
||||
version_tracker_list_type _trackers;
|
||||
private:
|
||||
version_tracker new_tracker(token_metadata::version_t);
|
||||
version_tracker new_tracker(const token_metadata& tm);
|
||||
public:
|
||||
// used to construct the shared object as a sharded<> instance
|
||||
// lock_func returns semaphore_units<>
|
||||
@@ -419,7 +421,7 @@ public:
|
||||
, _lock_func(std::move(lock_func))
|
||||
, _versions_barrier("shared_token_metadata::versions_barrier")
|
||||
{
|
||||
_shared->set_version_tracker(new_tracker(_shared->get_version()));
|
||||
_shared->set_version_tracker(new_tracker(*_shared));
|
||||
}
|
||||
|
||||
shared_token_metadata(const shared_token_metadata& x) = delete;
|
||||
@@ -446,6 +448,9 @@ public:
|
||||
_stall_detector_threshold = threshold;
|
||||
}
|
||||
|
||||
// Returns a map version -> use_count
|
||||
std::unordered_map<service::topology::version_t, int> describe_stale_versions();
|
||||
|
||||
future<> stale_versions_in_use() const {
|
||||
return _stale_versions_in_use.get_future();
|
||||
}
|
||||
|
||||
main.cc
@@ -571,7 +571,7 @@ sharded<service::storage_proxy> *the_storage_proxy;
|
||||
// This is used by perf-alternator to allow running scylla together with the tool
|
||||
// in a single process, so that it's easier to measure internals. It's not added
|
||||
// to main_func_type to avoid complicating the common flow, as no other tool needs such logic.
|
||||
std::function<void(lw_shared_ptr<db::config>)> after_init_func;
|
||||
std::function<future<>(lw_shared_ptr<db::config>, sharded<abort_source>&)> after_init_func;
|
||||
|
||||
static locator::host_id initialize_local_info_thread(sharded<db::system_keyspace>& sys_ks,
|
||||
sharded<locator::snitch_ptr>& snitch,
|
||||
@@ -2071,11 +2071,6 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
gossiper.local().unregister_(mm.local().shared_from_this()).get();
|
||||
});
|
||||
|
||||
utils::loading_cache_config perm_cache_config;
|
||||
perm_cache_config.max_size = cfg->permissions_cache_max_entries();
|
||||
perm_cache_config.expiry = std::chrono::milliseconds(cfg->permissions_validity_in_ms());
|
||||
perm_cache_config.refresh = std::chrono::milliseconds(cfg->permissions_update_interval_in_ms());
|
||||
|
||||
auto start_auth_service = [&mm] (sharded<auth::service>& auth_service, std::any& stop_auth_service, const char* what) {
|
||||
auth_service.invoke_on_all(&auth::service::start, std::ref(mm), std::ref(sys_ks)).get();
|
||||
|
||||
@@ -2104,7 +2099,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
maintenance_auth_config.authenticator_java_name = sstring{auth::allow_all_authenticator_name};
|
||||
maintenance_auth_config.role_manager_java_name = sstring{auth::maintenance_socket_role_manager_name};
|
||||
|
||||
maintenance_auth_service.start(perm_cache_config, std::ref(qp), std::ref(group0_client), std::ref(mm_notifier), std::ref(mm), maintenance_auth_config, maintenance_socket_enabled::yes, std::ref(auth_cache)).get();
|
||||
maintenance_auth_service.start(std::ref(qp), std::ref(group0_client), std::ref(mm_notifier), std::ref(mm), maintenance_auth_config, maintenance_socket_enabled::yes, std::ref(auth_cache)).get();
|
||||
|
||||
cql_maintenance_server_ctl.emplace(maintenance_auth_service, mm_notifier, gossiper, qp, service_memory_limiter, sl_controller, lifecycle_notifier, *cfg, maintenance_cql_sg_stats_key, maintenance_socket_enabled::yes, dbcfg.statement_scheduling_group);
|
||||
|
||||
@@ -2371,7 +2366,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
auth_config.authenticator_java_name = qualified_authenticator_name;
|
||||
auth_config.role_manager_java_name = qualified_role_manager_name;
|
||||
|
||||
auth_service.start(std::move(perm_cache_config), std::ref(qp), std::ref(group0_client), std::ref(mm_notifier), std::ref(mm), auth_config, maintenance_socket_enabled::no, std::ref(auth_cache)).get();
|
||||
auth_service.start(std::ref(qp), std::ref(group0_client), std::ref(mm_notifier), std::ref(mm), auth_config, maintenance_socket_enabled::no, std::ref(auth_cache)).get();
|
||||
|
||||
std::any stop_auth_service;
|
||||
// Has to be called after node joined the cluster (join_cluster())
|
||||
@@ -2581,11 +2576,13 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
supervisor::notify("serving");
|
||||
|
||||
startlog.info("Scylla version {} initialization completed.", scylla_version());
|
||||
future<> after_init_fut = make_ready_future<>();
|
||||
if (after_init_func) {
|
||||
after_init_func(cfg);
|
||||
after_init_fut = after_init_func(cfg, stop_signal.as_sharded_abort_source());
|
||||
}
|
||||
stop_signal.wait().get();
|
||||
startlog.info("Signal received; shutting down");
|
||||
std::move(after_init_fut).get();
|
||||
// At this point, all objects destructors and all shutdown hooks registered with defer() are executed
|
||||
} catch (const sleep_aborted&) {
|
||||
startlog.info("Startup interrupted");
|
||||
|
||||
@@ -272,25 +272,27 @@ private:
|
||||
|
||||
bool can_purge_tombstone(const tombstone& t, is_shadowable is_shadowable, const gc_clock::time_point deletion_time) {
|
||||
max_purgeable::can_purge_result purge_res { };
|
||||
std::optional<bool> expired;
|
||||
|
||||
if (_tombstone_gc_state.cheap_to_get_gc_before(_schema)) {
|
||||
// if retrieval of grace period is cheap, can_gc() will only be
|
||||
// called for tombstones that are older than grace period, in
|
||||
// order to avoid unnecessary bloom filter checks when calculating
|
||||
// max purgeable timestamp.
|
||||
purge_res.can_purge = satisfy_grace_period(deletion_time);
|
||||
expired = purge_res.can_purge = satisfy_grace_period(deletion_time);
|
||||
if (purge_res.can_purge) {
|
||||
purge_res = can_gc(t, is_shadowable);
|
||||
}
|
||||
} else {
|
||||
purge_res = can_gc(t, is_shadowable);
|
||||
if (purge_res.can_purge) {
|
||||
purge_res.can_purge = satisfy_grace_period(deletion_time);
|
||||
expired = purge_res.can_purge = satisfy_grace_period(deletion_time);
|
||||
}
|
||||
}
|
||||
|
||||
if constexpr (sstable_compaction()) {
|
||||
if (!_tombstone_stats || !t) {
|
||||
// Tombstone GC stats only account for expired tombstones (those eligible for GC).
|
||||
if (!_tombstone_stats || !t || !expired.value_or(satisfy_grace_period(deletion_time))) {
|
||||
return purge_res.can_purge;
|
||||
}
|
||||
|
||||
|
||||
@@ -459,7 +459,7 @@ future<> server_impl::wait_for_state_change(seastar::abort_source* as) {
|
||||
}
|
||||
|
||||
try {
|
||||
return as ? _state_change_promise->get_shared_future(*as) : _state_change_promise->get_shared_future();
|
||||
co_await (as ? _state_change_promise->get_shared_future(*as) : _state_change_promise->get_shared_future());
|
||||
} catch (abort_requested_exception&) {
|
||||
throw request_aborted(fmt::format(
|
||||
"Aborted while waiting for state change on server: {}, latest applied entry: {}, current state: {}", _id, _applied_idx, _fsm->current_state()));
|
||||
|
||||
@@ -252,6 +252,10 @@ public:
|
||||
//
|
||||
// The caller may pass a pointer to an abort_source to make the function abortable.
|
||||
// If it passes nullptr, the function is unabortable.
|
||||
//
|
||||
// Exceptions:
|
||||
// raft::request_aborted
|
||||
// Thrown if abort is requested before the operation finishes.
|
||||
virtual future<> wait_for_state_change(seastar::abort_source* as) = 0;
|
||||
|
||||
// The returned future is resolved when a leader is elected for the current term.
|
||||
@@ -262,6 +266,10 @@ public:
|
||||
//
|
||||
// The caller may pass a pointer to an abort_source to make the function abortable.
|
||||
// If it passes nullptr, the function is unabortable.
|
||||
//
|
||||
// Exceptions:
|
||||
// raft::request_aborted
|
||||
// Thrown if abort is requested before the operation finishes.
|
||||
virtual future<> wait_for_leader(seastar::abort_source* as) = 0;
|
||||
|
||||
// Manually trigger snapshot creation and log truncation.
|
||||
|
||||
@@ -1503,12 +1503,10 @@ keyspace::make_column_family_config(const schema& s, const database& db) const {
|
||||
cfg.compaction_concurrency_semaphore = _config.compaction_concurrency_semaphore;
|
||||
cfg.cf_stats = _config.cf_stats;
|
||||
cfg.enable_incremental_backups = _config.enable_incremental_backups;
|
||||
cfg.compaction_scheduling_group = _config.compaction_scheduling_group;
|
||||
cfg.memory_compaction_scheduling_group = _config.memory_compaction_scheduling_group;
|
||||
cfg.memtable_scheduling_group = _config.memtable_scheduling_group;
|
||||
cfg.memtable_to_cache_scheduling_group = _config.memtable_to_cache_scheduling_group;
|
||||
cfg.streaming_scheduling_group = _config.streaming_scheduling_group;
|
||||
cfg.statement_scheduling_group = _config.statement_scheduling_group;
|
||||
cfg.enable_metrics_reporting = db_config.enable_keyspace_column_family_metrics();
|
||||
cfg.enable_node_aggregated_table_metrics = db_config.enable_node_aggregated_table_metrics();
|
||||
cfg.tombstone_warn_threshold = db_config.tombstone_warn_threshold();
|
||||
@@ -2452,12 +2450,10 @@ database::make_keyspace_config(const keyspace_metadata& ksm, system_keyspace is_
|
||||
cfg.cf_stats = &_cf_stats;
|
||||
cfg.enable_incremental_backups = _enable_incremental_backups;
|
||||
|
||||
cfg.compaction_scheduling_group = _dbcfg.compaction_scheduling_group;
|
||||
cfg.memory_compaction_scheduling_group = _dbcfg.memory_compaction_scheduling_group;
|
||||
cfg.memtable_scheduling_group = _dbcfg.memtable_scheduling_group;
|
||||
cfg.memtable_to_cache_scheduling_group = _dbcfg.memtable_to_cache_scheduling_group;
|
||||
cfg.streaming_scheduling_group = _dbcfg.streaming_scheduling_group;
|
||||
cfg.statement_scheduling_group = _dbcfg.statement_scheduling_group;
|
||||
cfg.enable_metrics_reporting = _cfg.enable_keyspace_column_family_metrics();
|
||||
|
||||
cfg.view_update_memory_semaphore_limit = max_memory_pending_view_updates();
|
||||
@@ -3781,7 +3777,7 @@ future<utils::chunked_vector<temporary_buffer<char>>> database::sample_data_file
|
||||
&result,
|
||||
chunk_size
|
||||
] (database& local_db, state_by_shard& local_state) -> future<> {
|
||||
auto ticket = get_units(local_db._sample_data_files_local_concurrency_limiter, 1);
|
||||
auto ticket = co_await get_units(local_db._sample_data_files_local_concurrency_limiter, 1);
|
||||
|
||||
// In `chosen_chunks`, the sorted array of chosen chunk offsets (in the "global chunk list"),
|
||||
// find the range of offsets which belongs to us.
|
||||
|
||||
@@ -466,9 +466,7 @@ public:
|
||||
replica::cf_stats* cf_stats = nullptr;
|
||||
seastar::scheduling_group memtable_scheduling_group;
|
||||
seastar::scheduling_group memtable_to_cache_scheduling_group;
|
||||
seastar::scheduling_group compaction_scheduling_group;
|
||||
seastar::scheduling_group memory_compaction_scheduling_group;
|
||||
seastar::scheduling_group statement_scheduling_group;
|
||||
seastar::scheduling_group streaming_scheduling_group;
|
||||
bool enable_metrics_reporting = false;
|
||||
bool enable_node_aggregated_table_metrics = true;
|
||||
@@ -1405,9 +1403,7 @@ public:
|
||||
replica::cf_stats* cf_stats = nullptr;
|
||||
seastar::scheduling_group memtable_scheduling_group;
|
||||
seastar::scheduling_group memtable_to_cache_scheduling_group;
|
||||
seastar::scheduling_group compaction_scheduling_group;
|
||||
seastar::scheduling_group memory_compaction_scheduling_group;
|
||||
seastar::scheduling_group statement_scheduling_group;
|
||||
seastar::scheduling_group streaming_scheduling_group;
|
||||
bool enable_metrics_reporting = false;
|
||||
size_t view_update_memory_semaphore_limit;
|
||||
@@ -1795,8 +1791,6 @@ public:
|
||||
return &_cf_stats;
|
||||
}
|
||||
|
||||
seastar::scheduling_group get_streaming_scheduling_group() const { return _dbcfg.streaming_scheduling_group; }
|
||||
|
||||
seastar::scheduling_group get_gossip_scheduling_group() const { return _dbcfg.gossip_scheduling_group; }
|
||||
|
||||
compaction::compaction_manager& get_compaction_manager() {
|
||||
|
||||
@@ -2754,7 +2754,7 @@ public:
|
||||
return _cg.get_backlog_tracker();
|
||||
}
|
||||
const std::string get_group_id() const noexcept override {
|
||||
return fmt::format("{}", _cg.group_id());
|
||||
return fmt::to_string(_cg.group_id());
|
||||
}
|
||||
|
||||
seastar::condition_variable& get_staging_done_condition() noexcept override {
|
||||
@@ -4964,7 +4964,6 @@ future<> table::cleanup_tablet(database& db, db::system_keyspace& sys_ks, locato
|
||||
co_await stop_compaction_groups(sg);
|
||||
co_await utils::get_local_injector().inject("delay_tablet_compaction_groups_cleanup", std::chrono::seconds(5));
|
||||
co_await cleanup_compaction_groups(db, sys_ks, tid, sg);
|
||||
co_await utils::get_local_injector().inject("tablet_cleanup_completion_wait", utils::wait_for_message(std::chrono::seconds(5)));
|
||||
}
|
||||
|
||||
future<> table::cleanup_tablet_without_deallocation(database& db, db::system_keyspace& sys_ks, locator::tablet_id tid) {
|
||||
|
||||
@@ -72,7 +72,7 @@ void group0_state_id_handler::refresh() {
|
||||
const auto min_state_id = std::ranges::min(group0_members_state_ids, [](auto a, auto b) {
|
||||
if (!a || !b) {
|
||||
// This should never happen, but if it does, it's a bug.
|
||||
on_fatal_internal_error(slogger, "unexpected empty state_id");
|
||||
on_internal_error(slogger, "unexpected empty state_id");
|
||||
}
|
||||
return utils::timeuuid_tri_compare(a, b) < 0;
|
||||
});
|
||||
|
||||
@@ -149,19 +149,31 @@ public:
|
||||
const auto& node = nodes_info.at(voter_id);
|
||||
|
||||
if (node.is_alive) {
|
||||
SCYLLA_ASSERT(_alive_nodes_remaining > 0);
|
||||
if (_alive_nodes_remaining == 0) {
|
||||
on_internal_error(rvlogger,
|
||||
format("rack_info: no alive nodes remaining, but node {} is alive", voter_id));
|
||||
}
|
||||
--_alive_nodes_remaining;
|
||||
if (node.is_leader) {
|
||||
SCYLLA_ASSERT(_owns_alive_leader);
|
||||
if (!_owns_alive_leader) {
|
||||
on_internal_error(rvlogger,
|
||||
format("rack_info: rack doesn't own a live leader, but leader {} is alive", voter_id));
|
||||
}
|
||||
_owns_alive_leader = false;
|
||||
}
|
||||
}
|
||||
if (node.is_voter) {
|
||||
if (node.is_alive) {
|
||||
SCYLLA_ASSERT(_existing_alive_voters_remaining > 0);
|
||||
if (_existing_alive_voters_remaining == 0) {
|
||||
on_internal_error(rvlogger,
|
||||
format("rack_info: no live voters remaining, but voter {} is alive", voter_id));
|
||||
}
|
||||
--_existing_alive_voters_remaining;
|
||||
} else {
|
||||
SCYLLA_ASSERT(_existing_dead_voters_remaining > 0);
|
||||
if (_existing_dead_voters_remaining == 0) {
|
||||
on_internal_error(rvlogger,
|
||||
format("rack_info: no dead voters remaining, but voter {} is dead", voter_id));
|
||||
}
|
||||
--_existing_dead_voters_remaining;
|
||||
}
|
||||
}
|
||||
@@ -279,16 +291,25 @@ public:
|
||||
|
||||
if (node.is_alive) {
|
||||
if (node.is_voter) {
|
||||
SCYLLA_ASSERT(_existing_alive_voters_remaining > 0);
|
||||
if (_existing_alive_voters_remaining == 0) {
|
||||
on_internal_error(rvlogger,
|
||||
format("datacenter_info: no live voters remaining, but voter {} is alive", *voter_id));
|
||||
}
|
||||
--_existing_alive_voters_remaining;
|
||||
}
|
||||
if (node.is_leader) {
|
||||
SCYLLA_ASSERT(_owns_alive_leader);
|
||||
if (!_owns_alive_leader) {
|
||||
on_internal_error(rvlogger,
|
||||
format("datacenter_info: DC doesn't own a live leader, but leader {} is alive", *voter_id));
|
||||
}
|
||||
_owns_alive_leader = false;
|
||||
}
|
||||
}
|
||||
|
||||
SCYLLA_ASSERT(_nodes_remaining > 0);
|
||||
if (_nodes_remaining == 0) {
|
||||
on_internal_error(rvlogger,
|
||||
format("datacenter_info: no nodes remaining, but voter {} belongs to this DC", *voter_id));
|
||||
}
|
||||
|
||||
--_nodes_remaining;
|
||||
++_assigned_voters_count;
|
||||
|
||||
@@ -612,8 +612,13 @@ private:
|
||||
try {
|
||||
// FIXME: get_schema_for_write() doesn't timeout
|
||||
schema_ptr s = co_await get_schema_for_write(schema_version, reply_to_host_id, shard, timeout);
|
||||
|
||||
// This erm ensures that tablet migrations wait for replica requests,
|
||||
// even if the coordinator is no longer available.
|
||||
const auto erm = s->table().get_effective_replication_map();
|
||||
|
||||
// Note: blocks due to execution_stage in replica::database::apply()
|
||||
co_await p->run_fenceable_write(s->table().get_effective_replication_map()->get_replication_strategy(),
|
||||
co_await p->run_fenceable_write(erm->get_replication_strategy(),
|
||||
fence, src_addr,
|
||||
[&] { return apply_fn(p, trace_state_ptr, std::move(s), m, timeout); });
|
||||
// We wait for send_mutation_done to complete, otherwise, if reply_to is busy, we will accumulate
|
||||
@@ -863,6 +868,10 @@ private:
|
||||
slogger.info("storage_proxy::handle_read injection done");
|
||||
});
|
||||
|
||||
// This erm ensures that tablet migrations wait for replica requests,
|
||||
// even if the coordinator is no longer available.
|
||||
auto erm = s->table().get_effective_replication_map();
|
||||
|
||||
auto pr2 = ::compat::unwrap(std::move(pr), *s);
|
||||
auto do_query = [&]() {
|
||||
if constexpr (verb == read_verb::read_data) {
|
||||
@@ -870,7 +879,6 @@ private:
|
||||
// this function assumes singular queries but doesn't validate
|
||||
throw std::runtime_error("READ_DATA called with wrapping range");
|
||||
}
|
||||
auto erm = s->table().get_effective_replication_map();
|
||||
p->get_stats().replica_data_reads++;
|
||||
if (!oda) {
|
||||
throw std::runtime_error("READ_DATA called without digest algorithm");
|
||||
@@ -987,6 +995,12 @@ private:
|
||||
|
||||
auto schema = co_await get_schema_for_read(cmd.schema_version, src_addr, src_shard, *timeout);
|
||||
dht::token token = dht::get_token(*schema, key);
|
||||
|
||||
// This guard ensures that tablet migrations wait for replica requests,
|
||||
// even if the LWT coordinator is no longer available.
|
||||
locator::token_metadata_guard guard(schema->table(), token);
|
||||
co_await _sp.apply_fence(fence_opt, src_addr);
|
||||
|
||||
unsigned shard = schema->table().shard_for_reads(token);
|
||||
bool local = shard == this_shard_id();
|
||||
_sp.get_stats().replica_cross_shard_ops += !local;
|
||||
@@ -1027,6 +1041,12 @@ private:
|
||||
});
|
||||
auto schema = co_await get_schema_for_read(proposal.update.schema_version(), src_addr, src_shard, *timeout);
|
||||
dht::token token = proposal.update.decorated_key(*schema).token();
|
||||
|
||||
// This guard ensures that tablet migrations wait for replica requests,
|
||||
// even if the LWT coordinator is no longer available.
|
||||
locator::token_metadata_guard guard(schema->table(), token);
|
||||
co_await _sp.apply_fence(fence_opt, src_addr);
|
||||
|
||||
unsigned shard = schema->table().shard_for_reads(token);
|
||||
bool local = shard == this_shard_id();
|
||||
_sp.get_stats().replica_cross_shard_ops += !local;
|
||||
@@ -1068,6 +1088,12 @@ private:
|
||||
auto d = defer([] { pruning--; });
|
||||
auto schema = co_await get_schema_for_read(schema_id, src_addr, src_shard, *timeout);
|
||||
dht::token token = dht::get_token(*schema, key);
|
||||
|
||||
// This guard ensures that tablet migrations wait for replica requests,
|
||||
// even if the LWT coordinator is no longer available.
|
||||
locator::token_metadata_guard guard(schema->table(), token);
|
||||
co_await _sp.apply_fence(fence_opt, src_addr);
|
||||
|
||||
unsigned shard = schema->table().shard_for_reads(token);
|
||||
bool local = shard == this_shard_id();
|
||||
_sp.get_stats().replica_cross_shard_ops += !local;
|
||||
|
||||
@@ -632,7 +632,9 @@ future<storage_service::nodes_to_notify_after_sync> storage_service::sync_raft_t
|
||||
co_await update_topology_change_info(tmptr, ::format("{} {}/{}", rs.state, id, ip));
|
||||
break;
|
||||
case node_state::replacing: {
|
||||
SCYLLA_ASSERT(_topology_state_machine._topology.req_param.contains(id));
|
||||
if (!_topology_state_machine._topology.req_param.contains(id)) {
|
||||
on_internal_error(rtlogger, format("No request parameters for replacing node {}", id));
|
||||
}
|
||||
auto replaced_id = std::get<replace_param>(_topology_state_machine._topology.req_param[id]).replaced_id;
|
||||
auto existing_ip = _address_map.find(locator::host_id{replaced_id.uuid()});
|
||||
const auto replaced_host_id = locator::host_id(replaced_id.uuid());
|
||||
@@ -649,7 +651,7 @@ future<storage_service::nodes_to_notify_after_sync> storage_service::sync_raft_t
|
||||
co_await process_normal_node(id, host_id, ip, rs);
|
||||
break;
|
||||
default:
|
||||
on_fatal_internal_error(rtlogger, ::format("Unexpected state {} for node {}", rs.state, id));
|
||||
on_internal_error(rtlogger, ::format("Unexpected state {} for node {}", rs.state, id));
|
||||
}
|
||||
};
|
||||
|
||||
@@ -6140,7 +6142,7 @@ future<> storage_service::snitch_reconfigured() {
|
||||
}
|
||||
}
|
||||
|
||||
future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft::term_t term, uint64_t cmd_index, const raft_topology_cmd& cmd) {
|
||||
future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft::term_t term, uint64_t cmd_index, raft_topology_cmd cmd) {
|
||||
raft_topology_cmd_result result;
|
||||
rtlogger.info("topology cmd rpc {} is called index={}", cmd.cmd, cmd_index);
|
||||
|
||||
@@ -6225,8 +6227,9 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
|
||||
}
|
||||
co_await container().invoke_on_all([version] (storage_service& ss) -> future<> {
|
||||
const auto current_version = ss._shared_token_metadata.get()->get_version();
|
||||
rtlogger.info("Got raft_topology_cmd::barrier_and_drain, version {}, current version {}",
|
||||
version, current_version);
|
||||
rtlogger.info("Got raft_topology_cmd::barrier_and_drain, version {}, "
|
||||
"current version {}, stale versions (version: use_count): {}",
|
||||
version, current_version, ss._shared_token_metadata.describe_stale_versions());
|
||||
|
||||
// This shouldn't happen under normal operation, it's only plausible
|
||||
// if the topology change coordinator has
|
||||
@@ -6270,7 +6273,11 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
|
||||
break;
|
||||
case raft_topology_cmd::command::stream_ranges: {
|
||||
co_await with_scheduling_group(_stream_manager.local().get_scheduling_group(), coroutine::lambda([&] () -> future<> {
|
||||
const auto rs = _topology_state_machine._topology.find(id)->second;
|
||||
const auto* server_rs = _topology_state_machine._topology.find(id);
|
||||
if (!server_rs) {
|
||||
on_internal_error(rtlogger, format("Got {} request for node {} not found in topology", cmd.cmd, id));
|
||||
}
|
||||
const auto& rs = server_rs->second;
|
||||
auto tstate = _topology_state_machine._topology.tstate;
|
||||
auto session = _topology_state_machine._topology.session;
|
||||
if (!rs.ring || rs.ring->tokens.empty()) {
|
||||
@@ -7322,11 +7329,15 @@ future<locator::load_stats> storage_service::load_stats_for_tablet_based_tables(
|
||||
|
||||
const locator::host_id this_host = _db.local().get_token_metadata().get_my_id();
|
||||
|
||||
uint64_t sum_tablet_sizes = 0;
|
||||
// Align to 64 bytes to avoid cache line ping-pong when updating size in map_reduce0() below
|
||||
struct alignas(64) aligned_tablet_size {
|
||||
uint64_t size = 0;
|
||||
};
|
||||
std::vector<aligned_tablet_size> tablet_sizes_per_shard(smp::count);
|
||||
|
||||
// Each node combines a per-table load map from all of its shards and returns it to the coordinator.
|
||||
// So if there are 1k nodes, there will be 1k RPCs in total.
|
||||
auto load_stats = co_await _db.map_reduce0([&table_ids, &this_host, &sum_tablet_sizes] (replica::database& db) -> future<locator::load_stats> {
|
||||
auto load_stats = co_await _db.map_reduce0([&table_ids, &this_host, &tablet_sizes_per_shard] (replica::database& db) -> future<locator::load_stats> {
|
||||
locator::load_stats load_stats{};
|
||||
auto& tables_metadata = db.get_tables_metadata();
|
||||
|
||||
@@ -7364,7 +7375,7 @@ future<locator::load_stats> storage_service::load_stats_for_tablet_based_tables(
|
||||
|
||||
locator::combined_load_stats combined_ls { table->table_load_stats(tablet_filter) };
|
||||
load_stats.tables.emplace(id, std::move(combined_ls.table_ls));
|
||||
sum_tablet_sizes += load_stats.tablet_stats[this_host].add_tablet_sizes(combined_ls.tablet_ls);
|
||||
tablet_sizes_per_shard[this_shard_id()].size += load_stats.tablet_stats[this_host].add_tablet_sizes(combined_ls.tablet_ls);
|
||||
|
||||
co_await coroutine::maybe_yield();
|
||||
}
|
||||
@@ -7383,6 +7394,10 @@ future<locator::load_stats> storage_service::load_stats_for_tablet_based_tables(
|
||||
if (config_capacity != 0) {
|
||||
tls.effective_capacity = config_capacity;
|
||||
} else {
|
||||
uint64_t sum_tablet_sizes = 0;
|
||||
for (const auto& ts : tablet_sizes_per_shard) {
|
||||
sum_tablet_sizes += ts.size;
|
||||
}
|
||||
tls.effective_capacity = si.available + sum_tablet_sizes;
|
||||
}
|
||||
|
||||
|
||||
@@ -974,7 +974,7 @@ private:
|
||||
|
||||
std::unordered_set<raft::server_id> find_raft_nodes_from_hoeps(const locator::host_id_or_endpoint_list& hoeps) const;
|
||||
|
||||
future<raft_topology_cmd_result> raft_topology_cmd_handler(raft::term_t term, uint64_t cmd_index, const raft_topology_cmd& cmd);
|
||||
future<raft_topology_cmd_result> raft_topology_cmd_handler(raft::term_t term, uint64_t cmd_index, raft_topology_cmd cmd);
|
||||
|
||||
future<> raft_decommission();
|
||||
future<> raft_removenode(locator::host_id host_id, locator::host_id_or_endpoint_list ignore_nodes_params);
|
||||
|
||||
@@ -887,8 +887,8 @@ class load_balancer {
|
||||
//
|
||||
// We allow at least two sessions per shard so that there is less chance for idling until load balancer
|
||||
// makes the next decision after streaming is finished.
|
||||
const size_t max_write_streaming_load = 2;
|
||||
const size_t max_read_streaming_load = 4;
|
||||
size_t max_write_streaming_load;
|
||||
size_t max_read_streaming_load;
|
||||
|
||||
replica::database& _db;
|
||||
token_metadata_ptr _tm;
|
||||
@@ -1024,6 +1024,8 @@ public:
|
||||
lblogger.info("Size based load balancing cluster feature disabled; forcing capacity based balancing");
|
||||
_force_capacity_based_balancing = true;
|
||||
}
|
||||
max_read_streaming_load = db.get_config().tablet_streaming_read_concurrency_per_shard();
|
||||
max_write_streaming_load = db.get_config().tablet_streaming_write_concurrency_per_shard();
|
||||
}
|
||||
|
||||
bool ongoing_rack_list_colocation() const {
|
||||
@@ -1109,6 +1111,11 @@ public:
|
||||
if (!is_auto_repair_enabled(config)) {
|
||||
co_return false;
|
||||
}
|
||||
auto size = info.replicas.size();
|
||||
if (size <= 1) {
|
||||
lblogger.debug("Skipped auto repair for tablet={} replicas={}", gid, size);
|
||||
co_return false;
|
||||
}
|
||||
auto threshold = _db.get_config().auto_repair_threshold_default_in_seconds();
|
||||
auto repair_time_threshold = std::chrono::seconds(threshold);
|
||||
auto& last_repair_time = info.repair_time;
|
||||
@@ -2163,7 +2170,7 @@ public:
|
||||
continue;
|
||||
}
|
||||
auto load = nodes[r.host].shards[r.shard].streaming_read_load;
|
||||
if (load + info.stream_weight > max_read_streaming_load) {
|
||||
if (load > 0 && load + info.stream_weight > max_read_streaming_load) {
|
||||
lblogger.debug("Migration skipped because of read load limit on {} ({})", r, load);
|
||||
return false;
|
||||
}
|
||||
@@ -2173,7 +2180,7 @@ public:
|
||||
continue;
|
||||
}
|
||||
auto load = nodes[r.host].shards[r.shard].streaming_write_load;
|
||||
if (load + info.stream_weight > max_write_streaming_load) {
|
||||
if (load > 0 && load + info.stream_weight > max_write_streaming_load) {
|
||||
lblogger.debug("Migration skipped because of write load limit on {} ({})", r, load);
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -331,12 +331,17 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
|
||||
auto [id, req] = *next_req;
|
||||
|
||||
auto* server_rs = topo.find(id);
|
||||
if (!server_rs) {
|
||||
on_internal_error(rtlogger, format("Node {} has a pending {} request but is not found in topology", id, req));
|
||||
}
|
||||
|
||||
if (cleanup_needed && (req == topology_request::remove || req == topology_request::leave)) {
|
||||
// If the highest prio request is removenode or decommission we need to start cleanup if one is needed
|
||||
return start_vnodes_cleanup(std::move(guard), req, id);
|
||||
}
|
||||
|
||||
return node_to_work_on(std::move(guard), &topo, id, &topo.find(id)->second, req, get_request_param(id));
|
||||
return node_to_work_on(std::move(guard), &topo, id, &server_rs->second, req, get_request_param(id));
|
||||
};
|
||||
|
||||
node_to_work_on get_node_to_work_on(group0_guard guard) const {
|
||||
@@ -373,7 +378,9 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
auto& topo = _topo_sm._topology;
|
||||
|
||||
auto it = topo.find(id);
|
||||
SCYLLA_ASSERT(it);
|
||||
if (!it) {
|
||||
on_internal_error(rtlogger, format("retake_node: node {} not found in topology", id));
|
||||
}
|
||||
|
||||
std::optional<topology_request> req;
|
||||
auto rit = topo.requests.find(id);
|
||||
@@ -410,7 +417,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
return service::topology::parse_replaced_node(req_param);
|
||||
}
|
||||
|
||||
future<> exec_direct_command_helper(raft::server_id id, uint64_t cmd_index, const raft_topology_cmd& cmd) {
|
||||
future<> exec_direct_command_helper(raft::server_id id, uint64_t cmd_index, raft_topology_cmd cmd) {
|
||||
rtlogger.debug("send {} command with term {} and index {} to {}",
|
||||
cmd.cmd, _term, cmd_index, id);
|
||||
_topology_cmd_rpc_tracker.active_dst.emplace(id);
|
||||
@@ -426,7 +433,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
}
|
||||
};
|
||||
|
||||
future<node_to_work_on> exec_direct_command(node_to_work_on&& node, const raft_topology_cmd& cmd) {
|
||||
future<node_to_work_on> exec_direct_command(node_to_work_on&& node, raft_topology_cmd cmd) {
|
||||
auto id = node.id;
|
||||
release_node(std::move(node));
|
||||
const auto cmd_index = ++_last_cmd_index;
|
||||
@@ -436,7 +443,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
co_return retake_node(co_await start_operation(), id);
|
||||
};
|
||||
|
||||
future<> exec_global_command_helper(auto nodes, const raft_topology_cmd& cmd) {
|
||||
future<> exec_global_command_helper(auto nodes, raft_topology_cmd cmd) {
|
||||
const auto cmd_index = ++_last_cmd_index;
|
||||
_topology_cmd_rpc_tracker.current = cmd.cmd;
|
||||
_topology_cmd_rpc_tracker.index = cmd_index;
|
||||
@@ -453,7 +460,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
};
|
||||
|
||||
future<group0_guard> exec_global_command(
|
||||
group0_guard guard, const raft_topology_cmd& cmd,
|
||||
group0_guard guard, raft_topology_cmd cmd,
|
||||
const std::unordered_set<raft::server_id>& exclude_nodes,
|
||||
drop_guard_and_retake drop_and_retake = drop_guard_and_retake::yes) {
|
||||
rtlogger.info("executing global topology command {}, excluded nodes: {}", cmd.cmd, exclude_nodes);
|
||||
@@ -1208,13 +1215,16 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
rtlogger.info("enabled features: {}", features_to_enable);
|
||||
}
|
||||
|
||||
future<group0_guard> global_token_metadata_barrier(group0_guard&& guard, std::unordered_set<raft::server_id> exclude_nodes = {}, bool* fenced = nullptr) {
|
||||
future<group0_guard> global_token_metadata_barrier(group0_guard&& guard, std::unordered_set<raft::server_id> exclude_nodes = {}, bool* fenced = nullptr, bool drain_all_nodes = false) {
|
||||
auto version = _topo_sm._topology.version;
|
||||
bool drain_failed = false;
|
||||
try {
|
||||
guard = co_await exec_global_command(std::move(guard), raft_topology_cmd::command::barrier_and_drain, exclude_nodes, drop_guard_and_retake::yes);
|
||||
} catch (...) {
|
||||
rtlogger.warn("drain rpc failed, proceed to fence old writes: {}", std::current_exception());
|
||||
if (drain_all_nodes) {
|
||||
throw;
|
||||
}
|
||||
drain_failed = true;
|
||||
}
|
||||
if (drain_failed) {
|
||||
@@ -1240,7 +1250,30 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
|
||||
future<group0_guard> global_tablet_token_metadata_barrier(group0_guard guard) {
|
||||
// FIXME: Don't require all nodes to be up, only tablet replicas.
|
||||
return global_token_metadata_barrier(std::move(guard), _topo_sm._topology.ignored_nodes);
|
||||
|
||||
// Let x be the current topology version, post-conditions of this barrier:
|
||||
// * there are no coordinators with versions < x and no such coordinators
|
||||
// are possible in the future
|
||||
// * no replicas are currently executing requests with versions < x - 1
|
||||
// and no new such requests are possible in the future
|
||||
// Why? The barrier_and_drain handler runs group0.read_barrier() first,
|
||||
// which guarantees that the new version and the previous fence_version are
|
||||
// published on all shards before we drain them. After that we drain all
|
||||
// requests with versions < x ==> no current or future requests are possible
// with versions < x - 1 since the fence for x - 1 is set. Future stale
// requests with version x - 1 are still possible until the next
|
||||
// global barrier.
|
||||
// * a quorum of replicas doesn't allow new requests with versions < x,
|
||||
// but there could be arbitrary number of already running read or mutation
|
||||
// requests with version x - 1 on those replicas
|
||||
// * some replicas could still be accepting new requests with versions == x - 1
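// As a concrete instance of the above (derived from these post-conditions,
// not additional guarantees): if the current topology version x is 7, then
// after this barrier no coordinator can operate with version < 7, no replica
// is or will be executing a request with version < 6, and requests tagged
// with version 6 may still be in flight until the next global barrier.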
|
||||
|
||||
bool* const fenced = nullptr;
|
||||
const auto drain_all_nodes = true;
|
||||
return global_token_metadata_barrier(std::move(guard),
|
||||
_topo_sm._topology.ignored_nodes,
|
||||
fenced,
|
||||
drain_all_nodes);
|
||||
}
|
||||
|
||||
// Represents a two-state state machine which changes monotonically
|
||||
@@ -2496,7 +2529,9 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
|
||||
switch (node.rs->state) {
|
||||
case node_state::bootstrapping: {
|
||||
SCYLLA_ASSERT(!node.rs->ring);
|
||||
if (node.rs->ring) {
|
||||
on_internal_error(rtlogger, format("Bootstrapping node {} owns tokens", node.id));
|
||||
}
|
||||
auto num_tokens = std::get<join_param>(node.req_param.value()).num_tokens;
|
||||
auto tokens_string = std::get<join_param>(node.req_param.value()).tokens_string;
|
||||
|
||||
@@ -2552,11 +2587,23 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
}
|
||||
break;
|
||||
case node_state::replacing: {
|
||||
SCYLLA_ASSERT(!node.rs->ring);
|
||||
if (node.rs->ring) {
|
||||
on_internal_error(rtlogger, format("Replacing node {} owns tokens", node.id));
|
||||
}
|
||||
auto replaced_id = std::get<replace_param>(node.req_param.value()).replaced_id;
|
||||
auto it = _topo_sm._topology.normal_nodes.find(replaced_id);
|
||||
SCYLLA_ASSERT(it != _topo_sm._topology.normal_nodes.end());
|
||||
SCYLLA_ASSERT(it->second.ring && it->second.state == node_state::normal);
|
||||
if (it == _topo_sm._topology.normal_nodes.end()) {
|
||||
on_internal_error(rtlogger,
|
||||
format("Node {} being replaced by {} not found in normal nodes", replaced_id, node.id));
|
||||
}
|
||||
if (!it->second.ring) {
|
||||
on_internal_error(rtlogger,
|
||||
format("Node {} being replaced by {} is missing tokens", replaced_id, node.id));
|
||||
}
|
||||
if (it->second.state != node_state::normal) {
|
||||
on_internal_error(rtlogger,
|
||||
format("Node {} being replaced by {} is not in normal state", replaced_id, node.id));
|
||||
}
|
||||
|
||||
topology_mutation_builder builder(node.guard.write_timestamp());
|
||||
|
||||
@@ -2955,7 +3002,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
}
|
||||
break;
|
||||
default:
|
||||
on_fatal_internal_error(rtlogger, ::format(
|
||||
on_internal_error(rtlogger, ::format(
|
||||
"Ring state on node {} is write_both_read_new while the node is in state {}",
|
||||
node.id, node.rs->state));
|
||||
}
|
||||
@@ -3272,7 +3319,9 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
rtbuilder.set("start_time", db_clock::now());
|
||||
switch (node.request.value()) {
|
||||
case topology_request::join: {
|
||||
SCYLLA_ASSERT(!node.rs->ring);
|
||||
if (node.rs->ring) {
|
||||
on_internal_error(rtlogger, ::format("Joining node {} owns tokens", node.id));
|
||||
}
|
||||
// Write chosen tokens through raft.
|
||||
builder.set_transition_state(topology::transition_state::join_group0)
|
||||
.with_node(node.id)
|
||||
@@ -3284,7 +3333,9 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
break;
|
||||
}
|
||||
case topology_request::leave: {
|
||||
SCYLLA_ASSERT(node.rs->ring);
|
||||
if (!node.rs->ring) {
|
||||
on_internal_error(rtlogger, ::format("Leaving node {} doesn't own tokens", node.id));
|
||||
}
|
||||
|
||||
auto validation_result = validate_removing_node(_db, to_host_id(node.id));
|
||||
if (std::holds_alternative<node_validation_failure>(validation_result)) {
|
||||
@@ -3315,7 +3366,9 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
break;
|
||||
}
|
||||
case topology_request::remove: {
|
||||
SCYLLA_ASSERT(node.rs->ring);
|
||||
if (!node.rs->ring) {
|
||||
on_internal_error(rtlogger, ::format("Node {} being removed doesn't own tokens", node.id));
|
||||
}
|
||||
|
||||
auto validation_result = validate_removing_node(_db, to_host_id(node.id));
|
||||
if (std::holds_alternative<node_validation_failure>(validation_result)) {
|
||||
@@ -3343,7 +3396,9 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
break;
|
||||
}
|
||||
case topology_request::replace: {
|
||||
SCYLLA_ASSERT(!node.rs->ring);
|
||||
if (node.rs->ring) {
|
||||
on_internal_error(rtlogger, ::format("Replacing node {} owns tokens", node.id));
|
||||
}
|
||||
|
||||
builder.set_transition_state(topology::transition_state::join_group0)
|
||||
.with_node(node.id)
|
||||
@@ -3400,12 +3455,12 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
case node_state::removing:
|
||||
case node_state::replacing:
|
||||
// Should not get here
|
||||
on_fatal_internal_error(rtlogger, ::format(
|
||||
on_internal_error(rtlogger, ::format(
|
||||
"Found node {} in state {} but there is no ongoing topology transition",
|
||||
node.id, node.rs->state));
|
||||
case node_state::left:
|
||||
// Should not get here
|
||||
on_fatal_internal_error(rtlogger, ::format(
|
||||
on_internal_error(rtlogger, ::format(
|
||||
"Topology coordinator is called for node {} in state 'left'", node.id));
|
||||
break;
|
||||
}
|
||||
@@ -3467,7 +3522,9 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
|
||||
auto id = node.id;
|
||||
|
||||
SCYLLA_ASSERT(!_topo_sm._topology.transition_nodes.empty());
|
||||
if (_topo_sm._topology.transition_nodes.empty()) {
|
||||
on_internal_error(rtlogger, format("transition nodes are empty while accepting node {}", node.id));
|
||||
}
|
||||
|
||||
release_node(std::move(node));
|
||||
|
||||
|
||||
@@ -80,7 +80,7 @@ struct term_changed_error : public std::runtime_error {
|
||||
future<> wait_for_gossiper(raft::server_id id, const gms::gossiper& g, seastar::abort_source& as);
|
||||
|
||||
using raft_topology_cmd_handler_type = noncopyable_function<future<raft_topology_cmd_result>(
|
||||
raft::term_t, uint64_t, const raft_topology_cmd&)>;
|
||||
raft::term_t, uint64_t, raft_topology_cmd)>;
|
||||
|
||||
struct topology_coordinator_cmd_rpc_tracker {
|
||||
raft_topology_cmd::command current;
|
||||
|
||||
@@ -20,6 +20,8 @@ namespace db {
|
||||
|
||||
namespace service {
|
||||
|
||||
extern logging::logger rtlogger;
|
||||
|
||||
topology_mutation_builder::topology_mutation_builder(api::timestamp_type ts) :
|
||||
_s(db::system_keyspace::topology()),
|
||||
_m(_s, partition_key::from_singular(*_s, db::system_keyspace::TOPOLOGY)),
|
||||
@@ -35,7 +37,9 @@ topology_node_mutation_builder::topology_node_mutation_builder(topology_mutation
|
||||
template<typename Builder>
|
||||
Builder& topology_mutation_builder_base<Builder>::apply_atomic(const char* cell, const data_value& value) {
|
||||
const column_definition* cdef = self().schema().get_column_definition(cell);
|
||||
SCYLLA_ASSERT(cdef);
|
||||
if (!cdef) {
|
||||
on_internal_error(rtlogger, format("column {} not found in the topology table", cell));
|
||||
}
|
||||
self().row().apply(*cdef, atomic_cell::make_live(*cdef->type, self().timestamp(), cdef->type->decompose(value), self().ttl()));
|
||||
return self();
|
||||
}
|
||||
@@ -45,7 +49,9 @@ template<std::ranges::range C>
|
||||
requires std::convertible_to<std::ranges::range_value_t<C>, data_value>
|
||||
Builder& topology_mutation_builder_base<Builder>::apply_set(const char* cell, collection_apply_mode apply_mode, const C& c) {
|
||||
const column_definition* cdef = self().schema().get_column_definition(cell);
|
||||
SCYLLA_ASSERT(cdef);
|
||||
if (!cdef) {
|
||||
on_internal_error(rtlogger, format("column {} not found in the topology table", cell));
|
||||
}
|
||||
auto vtype = static_pointer_cast<const set_type_impl>(cdef->type)->get_elements_type();
|
||||
|
||||
std::set<bytes, serialized_compare> cset(vtype->as_less_comparator());
|
||||
@@ -70,7 +76,9 @@ Builder& topology_mutation_builder_base<Builder>::apply_set(const char* cell, co
|
||||
template<typename Builder>
|
||||
Builder& topology_mutation_builder_base<Builder>::del(const char* cell) {
|
||||
auto cdef = self().schema().get_column_definition(cell);
|
||||
SCYLLA_ASSERT(cdef);
|
||||
if (!cdef) {
|
||||
on_internal_error(rtlogger, format("column {} not found in the topology table", cell));
|
||||
}
|
||||
if (!cdef->type->is_multi_cell()) {
|
||||
self().row().apply(*cdef, atomic_cell::make_dead(self().timestamp(), gc_clock::now()));
|
||||
} else {
|
||||
|
||||
@@ -62,7 +62,7 @@ future<> sstable_dict_autotrainer::tick() {
|
||||
continue;
|
||||
}
|
||||
auto params = s->get_compressor_params();
|
||||
auto ticket = get_units(_ss.get_do_sample_sstables_concurrency_limiter(), 1);
|
||||
auto ticket = co_await get_units(_ss.get_do_sample_sstables_concurrency_limiter(), 1);
|
||||
// When we sample a block from a set of files, we can think of this block's
|
||||
// compression ratio as a random variable, with a value in [0;1] and
|
||||
// some underlying distribution.
|
||||
|
||||
356
test/alternator/test_timestamp_attribute.py
Normal file
@@ -0,0 +1,356 @@
|
||||
# Copyright 2025-present ScyllaDB
|
||||
#
|
||||
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
|
||||
# Tests for the system:timestamp_attribute Scylla-specific feature.
|
||||
# This feature allows users to control the write timestamp of PutItem and
|
||||
# UpdateItem operations by specifying an attribute name in the table's
|
||||
# system:timestamp_attribute tag. When that attribute is present in the
|
||||
# write request with a numeric value (microseconds since Unix epoch), it
|
||||
# is used as the write timestamp. The attribute itself is not stored in
|
||||
# the item data.
|
||||
#
|
||||
# This is a Scylla-specific feature and is not tested on DynamoDB.
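#
# A minimal usage sketch (illustrative only; the key names and values below are
# hypothetical): on a table tagged with system:timestamp_attribute='ts', a
# write such as
#     table.put_item(Item={'p': 'key1', 'val': 'v1', 'ts': Decimal('1700000000000000')})
# is applied with write timestamp 1700000000000000 (microseconds since the
# Unix epoch), and the 'ts' attribute itself is not stored in the item.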
|
||||
|
||||
import time
|
||||
import pytest
|
||||
from botocore.exceptions import ClientError
|
||||
from decimal import Decimal
|
||||
|
||||
from .util import create_test_table, random_string
|
||||
|
||||
# A large timestamp in microseconds (far future, year ~2033)
|
||||
LARGE_TS = Decimal('2000000000000000')
|
||||
# A medium timestamp in microseconds (year ~2001)
|
||||
MEDIUM_TS = Decimal('1000000000000000')
|
||||
# A small timestamp in microseconds (year ~1970+)
|
||||
SMALL_TS = Decimal('100000000000000')
|
||||
|
||||
# Fixtures for tables with the system:timestamp_attribute tag. The tables
|
||||
# are created once per module and shared between all tests that use them,
|
||||
# to avoid the overhead of creating and deleting tables for each test.
|
||||
# Because system:timestamp_attribute is a Scylla-only feature, all tests
|
||||
# using these fixtures are implicitly Scylla-only (via scylla_only parameter).
|
||||
|
||||
# A table with only a hash key and system:timestamp_attribute='ts' tag.
|
||||
# We explicitly set write isolation to only_rmw_uses_lwt so the tests remain
|
||||
# correct even if the server default changes to always_use_lwt in the future.
|
||||
@pytest.fixture(scope="module")
|
||||
def test_table_ts(scylla_only, dynamodb):
|
||||
table = create_test_table(dynamodb,
|
||||
Tags=[{'Key': 'system:timestamp_attribute', 'Value': 'ts'},
|
||||
{'Key': 'system:write_isolation', 'Value': 'only_rmw_uses_lwt'}],
|
||||
KeySchema=[{'AttributeName': 'p', 'KeyType': 'HASH'}],
|
||||
AttributeDefinitions=[{'AttributeName': 'p', 'AttributeType': 'S'}])
|
||||
yield table
|
||||
table.delete()
|
||||
|
||||
# A table with hash (string) and range (string) keys and system:timestamp_attribute='ts' tag.
|
||||
# We explicitly set write isolation to only_rmw_uses_lwt so the tests remain
|
||||
# correct even if the server default changes to always_use_lwt in the future.
|
||||
@pytest.fixture(scope="module")
|
||||
def test_table_ts_ss(scylla_only, dynamodb):
|
||||
table = create_test_table(dynamodb,
|
||||
Tags=[{'Key': 'system:timestamp_attribute', 'Value': 'ts'},
|
||||
{'Key': 'system:write_isolation', 'Value': 'only_rmw_uses_lwt'}],
|
||||
KeySchema=[
|
||||
{'AttributeName': 'p', 'KeyType': 'HASH'},
|
||||
{'AttributeName': 'c', 'KeyType': 'RANGE'},
|
||||
],
|
||||
AttributeDefinitions=[
|
||||
{'AttributeName': 'p', 'AttributeType': 'S'},
|
||||
{'AttributeName': 'c', 'AttributeType': 'S'},
|
||||
])
|
||||
yield table
|
||||
table.delete()
|
||||
|
||||
# A table with hash key, system:timestamp_attribute='ts' tag, and
|
||||
# system:write_isolation='always' to test rejection in LWT_ALWAYS mode.
|
||||
# In always_use_lwt mode, every write uses LWT, so the timestamp attribute
|
||||
# feature cannot be used at all.
|
||||
@pytest.fixture(scope="module")
|
||||
def test_table_ts_lwt(scylla_only, dynamodb):
|
||||
table = create_test_table(dynamodb,
|
||||
Tags=[{'Key': 'system:timestamp_attribute', 'Value': 'ts'},
|
||||
{'Key': 'system:write_isolation', 'Value': 'always'}],
|
||||
KeySchema=[{'AttributeName': 'p', 'KeyType': 'HASH'}],
|
||||
AttributeDefinitions=[{'AttributeName': 'p', 'AttributeType': 'S'}])
|
||||
yield table
|
||||
table.delete()
|
||||
|
||||
# Test that PutItem with the timestamp attribute uses the given numeric
|
||||
# value as the write timestamp, and the timestamp attribute is NOT stored
|
||||
# in the item.
|
||||
def test_timestamp_attribute_put_item_basic(test_table_ts):
|
||||
p = random_string()
|
||||
# Put an item with the timestamp attribute
|
||||
test_table_ts.put_item(Item={'p': p, 'val': 'hello', 'ts': LARGE_TS})
|
||||
# Read the item back
|
||||
item = test_table_ts.get_item(Key={'p': p}, ConsistentRead=True)['Item']
|
||||
# 'val' should be stored normally
|
||||
assert item['val'] == 'hello'
|
||||
# 'ts' (the timestamp attribute) should NOT be stored in the item
|
||||
assert 'ts' not in item
|
||||
|
||||
# Test that PutItem respects the write timestamp ordering: a write with a
|
||||
# larger timestamp should win over a write with a smaller timestamp,
|
||||
# regardless of wall-clock order.
|
||||
def test_timestamp_attribute_put_item_ordering(test_table_ts):
|
||||
p = random_string()
|
||||
# First, write item with a LARGE timestamp
|
||||
test_table_ts.put_item(Item={'p': p, 'val': 'large_ts', 'ts': LARGE_TS})
|
||||
# Then write item with a SMALL timestamp (should lose since SMALL < LARGE)
|
||||
test_table_ts.put_item(Item={'p': p, 'val': 'small_ts', 'ts': SMALL_TS})
|
||||
# The item with the larger timestamp (val='large_ts') should win
|
||||
item = test_table_ts.get_item(Key={'p': p}, ConsistentRead=True)['Item']
|
||||
assert item['val'] == 'large_ts'
|
||||
|
||||
# Now try to overwrite with a LARGER timestamp (should win)
|
||||
test_table_ts.put_item(Item={'p': p, 'val': 'latest', 'ts': LARGE_TS + 1})
|
||||
item = test_table_ts.get_item(Key={'p': p}, ConsistentRead=True)['Item']
|
||||
assert item['val'] == 'latest'
|
||||
|
||||
# Test that UpdateItem with the timestamp attribute in AttributeUpdates
|
||||
# uses the given numeric value as the write timestamp, and the timestamp
|
||||
# attribute is NOT stored in the item.
|
||||
def test_timestamp_attribute_update_item_attribute_updates(test_table_ts):
|
||||
p = random_string()
|
||||
# Use UpdateItem with AttributeUpdates, setting 'val' and 'ts'
|
||||
test_table_ts.update_item(
|
||||
Key={'p': p},
|
||||
AttributeUpdates={
|
||||
'val': {'Value': 'hello', 'Action': 'PUT'},
|
||||
'ts': {'Value': LARGE_TS, 'Action': 'PUT'},
|
||||
}
|
||||
)
|
||||
item = test_table_ts.get_item(Key={'p': p}, ConsistentRead=True)['Item']
|
||||
assert item['val'] == 'hello'
|
||||
# 'ts' should NOT be stored in the item
|
||||
assert 'ts' not in item
|
||||
|
||||
# Update with a smaller timestamp - should NOT overwrite
|
||||
test_table_ts.update_item(
|
||||
Key={'p': p},
|
||||
AttributeUpdates={
|
||||
'val': {'Value': 'overwritten', 'Action': 'PUT'},
|
||||
'ts': {'Value': SMALL_TS, 'Action': 'PUT'},
|
||||
}
|
||||
)
|
||||
item = test_table_ts.get_item(Key={'p': p}, ConsistentRead=True)['Item']
|
||||
# The item with the larger timestamp should still win
|
||||
assert item['val'] == 'hello'
|
||||
|
||||
# Test that UpdateItem with the timestamp attribute in UpdateExpression
|
||||
# uses the given numeric value as the write timestamp, and the timestamp
|
||||
# attribute is NOT stored in the item.
|
||||
def test_timestamp_attribute_update_item_update_expression(test_table_ts):
|
||||
p = random_string()
|
||||
# Use UpdateItem with UpdateExpression to set 'val' and 'ts'
|
||||
test_table_ts.update_item(
|
||||
Key={'p': p},
|
||||
UpdateExpression='SET val = :v, ts = :t',
|
||||
ExpressionAttributeValues={':v': 'hello', ':t': LARGE_TS}
|
||||
)
|
||||
item = test_table_ts.get_item(Key={'p': p}, ConsistentRead=True)['Item']
|
||||
assert item['val'] == 'hello'
|
||||
# 'ts' should NOT be stored in the item
|
||||
assert 'ts' not in item
|
||||
|
||||
# Update with a smaller timestamp - should NOT overwrite
|
||||
test_table_ts.update_item(
|
||||
Key={'p': p},
|
||||
UpdateExpression='SET val = :v, ts = :t',
|
||||
ExpressionAttributeValues={':v': 'overwritten', ':t': SMALL_TS}
|
||||
)
|
||||
item = test_table_ts.get_item(Key={'p': p}, ConsistentRead=True)['Item']
|
||||
# The item with the larger timestamp should still win
|
||||
assert item['val'] == 'hello'
|
||||
|
||||
# Test that when the timestamp attribute is not present in the write request,
|
||||
# the operation behaves normally (no custom timestamp is applied).
|
||||
def test_timestamp_attribute_absent(test_table_ts):
|
||||
p = random_string()
|
||||
# Put item without the timestamp attribute
|
||||
test_table_ts.put_item(Item={'p': p, 'val': 'hello'})
|
||||
item = test_table_ts.get_item(Key={'p': p}, ConsistentRead=True)['Item']
|
||||
assert item['val'] == 'hello'
|
||||
# No 'ts' attribute expected either
|
||||
assert 'ts' not in item
|
||||
|
||||
# Test that using a condition expression (which requires LWT) together with
|
||||
# the timestamp attribute is rejected.
|
||||
def test_timestamp_attribute_with_condition_rejected(test_table_ts):
|
||||
p = random_string()
|
||||
# Put an initial item (no timestamp attribute, so LWT is ok)
|
||||
test_table_ts.put_item(Item={'p': p, 'val': 'initial'})
|
||||
# Try to put with a ConditionExpression and a timestamp - should be rejected
|
||||
with pytest.raises(ClientError, match='ValidationException'):
|
||||
test_table_ts.put_item(
|
||||
Item={'p': p, 'val': 'updated', 'ts': LARGE_TS},
|
||||
ConditionExpression='attribute_exists(p)'
|
||||
)
|
||||
|
||||
# Test that using the timestamp attribute with the 'always' write isolation
|
||||
# policy is rejected, because in always_use_lwt mode every write uses LWT
|
||||
# (including unconditional ones), which is incompatible with custom timestamps.
|
||||
def test_timestamp_attribute_lwt_always_rejected(test_table_ts_lwt):
|
||||
p = random_string()
|
||||
# Even a plain PutItem with a timestamp is rejected in LWT_ALWAYS mode
|
||||
with pytest.raises(ClientError, match='ValidationException'):
|
||||
test_table_ts_lwt.put_item(Item={'p': p, 'val': 'hello', 'ts': LARGE_TS})
|
||||
|
||||
# Test that when the timestamp attribute has a non-numeric value, the write
|
||||
# is rejected with a ValidationException.
|
||||
def test_timestamp_attribute_non_numeric(test_table_ts):
|
||||
p = random_string()
|
||||
# Put item with the timestamp attribute as a string (non-numeric) - should fail
|
||||
with pytest.raises(ClientError, match='ValidationException'):
|
||||
test_table_ts.put_item(Item={'p': p, 'val': 'hello', 'ts': 'not_a_number'})
|
||||
|
||||
# Test that the timestamp attribute tag can be set on a table with a sort key.
|
||||
def test_timestamp_attribute_with_range_key(test_table_ts_ss):
|
||||
p = random_string()
|
||||
c = random_string()
|
||||
# Write with a large timestamp
|
||||
test_table_ts_ss.put_item(Item={'p': p, 'c': c, 'val': 'large', 'ts': LARGE_TS})
|
||||
# Write with a small timestamp (should lose)
|
||||
test_table_ts_ss.put_item(Item={'p': p, 'c': c, 'val': 'small', 'ts': SMALL_TS})
|
||||
item = test_table_ts_ss.get_item(Key={'p': p, 'c': c}, ConsistentRead=True)['Item']
|
||||
assert item['val'] == 'large'
|
||||
assert 'ts' not in item
|
||||
|
||||
# Test that the timestamp attribute value is interpreted in microseconds since
|
||||
# the Unix epoch, and that writes with and without explicit timestamps interact
|
||||
# correctly.
|
||||
def test_timestamp_attribute_microseconds(test_table_ts):
|
||||
# Get current time in microseconds from the Python client side.
|
||||
now_us = int(time.time() * 1_000_000)
|
||||
one_hour_us = 3600 * 1_000_000
|
||||
|
||||
# Part 1: write with the current time as the explicit timestamp, then
|
||||
# overwrite without an explicit timestamp. The second write uses the
|
||||
# server's current time (which is >= now_us), so it should win.
|
||||
p = random_string()
|
||||
test_table_ts.put_item(Item={'p': p, 'val': 'old', 'ts': Decimal(str(now_us))})
|
||||
test_table_ts.put_item(Item={'p': p, 'val': 'new'})
|
||||
item = test_table_ts.get_item(Key={'p': p}, ConsistentRead=True)['Item']
|
||||
assert item['val'] == 'new'
|
||||
|
||||
# Part 2: write with a timestamp one hour in the future, then overwrite
|
||||
# without an explicit timestamp. The server's current time (≈ now_us) is
|
||||
# much less than now_us + one_hour_us, so the first write should win.
|
||||
p = random_string()
|
||||
future_us = now_us + one_hour_us
|
||||
test_table_ts.put_item(Item={'p': p, 'val': 'future', 'ts': Decimal(str(future_us))})
|
||||
test_table_ts.put_item(Item={'p': p, 'val': 'now'})
|
||||
item = test_table_ts.get_item(Key={'p': p}, ConsistentRead=True)['Item']
|
||||
assert item['val'] == 'future'
|
||||
|
||||
# Test that BatchWriteItem also respects the timestamp attribute.
|
||||
def test_timestamp_attribute_batch_write(test_table_ts):
|
||||
p = random_string()
|
||||
# Write item via BatchWriteItem with a large timestamp
|
||||
with test_table_ts.batch_writer() as batch:
|
||||
batch.put_item(Item={'p': p, 'val': 'large_ts', 'ts': LARGE_TS})
|
||||
# Write item via BatchWriteItem with a small timestamp (should lose)
|
||||
with test_table_ts.batch_writer() as batch:
|
||||
batch.put_item(Item={'p': p, 'val': 'small_ts', 'ts': SMALL_TS})
|
||||
item = test_table_ts.get_item(Key={'p': p}, ConsistentRead=True)['Item']
|
||||
assert item['val'] == 'large_ts'
|
||||
assert 'ts' not in item
|
||||
|
||||
# Test that DeleteItem respects the timestamp attribute: a delete with a
|
||||
# smaller timestamp than the item's write timestamp should not take effect.
|
||||
def test_timestamp_attribute_delete_item(test_table_ts):
|
||||
p = random_string()
|
||||
# Write an item with a large timestamp
|
||||
test_table_ts.put_item(Item={'p': p, 'val': 'hello', 'ts': LARGE_TS})
|
||||
item = test_table_ts.get_item(Key={'p': p}, ConsistentRead=True)['Item']
|
||||
assert item['val'] == 'hello'
|
||||
# Delete with a small timestamp - the delete should lose (item still exists)
|
||||
test_table_ts.delete_item(Key={'p': p, 'ts': SMALL_TS})
|
||||
item = test_table_ts.get_item(Key={'p': p}, ConsistentRead=True).get('Item')
|
||||
assert item is not None and item['val'] == 'hello'
|
||||
# Delete with a large timestamp - the delete should win (item is removed)
|
||||
test_table_ts.delete_item(Key={'p': p, 'ts': LARGE_TS + 1})
|
||||
item = test_table_ts.get_item(Key={'p': p}, ConsistentRead=True).get('Item')
|
||||
assert item is None
|
||||
|
||||
# Test that DeleteItem without the timestamp attribute in the key behaves
|
||||
# normally (no custom timestamp is applied).
|
||||
def test_timestamp_attribute_delete_item_no_ts(test_table_ts):
|
||||
p = random_string()
|
||||
# Use SMALL_TS so the delete (which uses the current server time) wins.
|
||||
# If we used LARGE_TS (far future), the delete without an explicit timestamp
|
||||
# would use current time which is smaller than LARGE_TS and the delete would lose.
|
||||
test_table_ts.put_item(Item={'p': p, 'val': 'hello', 'ts': SMALL_TS})
|
||||
# Delete without a timestamp attribute - should succeed normally
|
||||
test_table_ts.delete_item(Key={'p': p})
|
||||
assert 'Item' not in test_table_ts.get_item(Key={'p': p}, ConsistentRead=True)
|
||||
# Verify that an item written with a far-future timestamp is NOT deleted by
|
||||
# a delete without an explicit timestamp (server time < LARGE_TS).
|
||||
p = random_string()
|
||||
test_table_ts.put_item(Item={'p': p, 'val': 'hello', 'ts': LARGE_TS})
|
||||
test_table_ts.delete_item(Key={'p': p})
|
||||
item = test_table_ts.get_item(Key={'p': p}, ConsistentRead=True).get('Item')
|
||||
assert item is not None and item['val'] == 'hello'
|
||||
|
||||
# Test that DeleteItem with a non-numeric timestamp attribute is rejected.
|
||||
def test_timestamp_attribute_delete_item_non_numeric(test_table_ts):
|
||||
p = random_string()
|
||||
with pytest.raises(ClientError, match='ValidationException'):
|
||||
test_table_ts.delete_item(Key={'p': p, 'ts': 'not_a_number'})
|
||||
|
||||
# Test that BatchWriteItem DeleteRequest also respects the timestamp attribute.
|
||||
def test_timestamp_attribute_batch_delete(test_table_ts):
|
||||
p = random_string()
|
||||
# Write an item with a large timestamp
|
||||
test_table_ts.put_item(Item={'p': p, 'val': 'hello', 'ts': LARGE_TS})
|
||||
# Delete via BatchWriteItem with a small timestamp - delete should lose
|
||||
test_table_ts.meta.client.batch_write_item(RequestItems={
|
||||
test_table_ts.name: [{'DeleteRequest': {'Key': {'p': p, 'ts': SMALL_TS}}}]
|
||||
})
|
||||
item = test_table_ts.get_item(Key={'p': p}, ConsistentRead=True).get('Item')
|
||||
assert item is not None and item['val'] == 'hello'
|
||||
# Delete via BatchWriteItem with a large timestamp - delete should win
|
||||
test_table_ts.meta.client.batch_write_item(RequestItems={
|
||||
test_table_ts.name: [{'DeleteRequest': {'Key': {'p': p, 'ts': LARGE_TS + 1}}}]
|
||||
})
|
||||
assert 'Item' not in test_table_ts.get_item(Key={'p': p}, ConsistentRead=True)
|
||||
|
||||
# Test that DeleteItem with a ConditionExpression and a custom timestamp is
|
||||
# rejected, because conditional writes require LWT which is incompatible with
|
||||
# custom timestamps.
|
||||
def test_timestamp_attribute_delete_item_condition_rejected(test_table_ts):
|
||||
p = random_string()
|
||||
test_table_ts.put_item(Item={'p': p, 'val': 'hello'})
|
||||
with pytest.raises(ClientError, match='ValidationException'):
|
||||
test_table_ts.delete_item(
|
||||
Key={'p': p, 'ts': SMALL_TS},
|
||||
ConditionExpression='attribute_exists(p)'
|
||||
)
|
||||
|
||||
# Test that DeleteItem with a custom timestamp is rejected when the table uses
|
||||
# always_use_lwt isolation, because every write uses LWT in that mode.
|
||||
def test_timestamp_attribute_delete_item_lwt_always_rejected(test_table_ts_lwt):
|
||||
p = random_string()
|
||||
with pytest.raises(ClientError, match='ValidationException'):
|
||||
test_table_ts_lwt.delete_item(Key={'p': p, 'ts': SMALL_TS})
|
||||
|
||||
# Test that BatchWriteItem PutRequest with a custom timestamp is rejected when
|
||||
# the table uses always_use_lwt isolation.
|
||||
def test_timestamp_attribute_batch_put_lwt_always_rejected(test_table_ts_lwt):
|
||||
p = random_string()
|
||||
with pytest.raises(ClientError, match='ValidationException'):
|
||||
test_table_ts_lwt.meta.client.batch_write_item(RequestItems={
|
||||
test_table_ts_lwt.name: [{'PutRequest': {'Item': {'p': p, 'val': 'v', 'ts': SMALL_TS}}}]
|
||||
})
|
||||
|
||||
# Test that BatchWriteItem DeleteRequest with a custom timestamp is rejected
|
||||
# when the table uses always_use_lwt isolation.
|
||||
def test_timestamp_attribute_batch_delete_lwt_always_rejected(test_table_ts_lwt):
|
||||
p = random_string()
|
||||
with pytest.raises(ClientError, match='ValidationException'):
|
||||
test_table_ts_lwt.meta.client.batch_write_item(RequestItems={
|
||||
test_table_ts_lwt.name: [{'DeleteRequest': {'Key': {'p': p, 'ts': SMALL_TS}}}]
|
||||
})
|
||||
153
test/boost/auth_cache_test.cc
Normal file
@@ -0,0 +1,153 @@
|
||||
/*
|
||||
* Copyright (C) 2026-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
*/
|
||||
|
||||
#include <seastar/core/future.hh>
|
||||
|
||||
#undef SEASTAR_TESTING_MAIN
|
||||
#include <seastar/testing/test_case.hh>
|
||||
|
||||
#include "auth/cache.hh"
|
||||
#include "auth/permission.hh"
|
||||
#include "auth/resource.hh"
|
||||
#include "auth/role_or_anonymous.hh"
|
||||
#include "auth/service.hh"
|
||||
#include "db/config.hh"
|
||||
#include "test/lib/cql_test_env.hh"
|
||||
|
||||
BOOST_AUTO_TEST_SUITE(auth_cache_test)
|
||||
|
||||
static cql_test_config auth_on_config() {
|
||||
cql_test_config cfg;
|
||||
cfg.db_config->authorizer("CassandraAuthorizer");
|
||||
cfg.db_config->authenticator("PasswordAuthenticator");
|
||||
return cfg;
|
||||
}
|
||||
|
||||
// Tests whether removing a role drops it from the cache.
|
||||
SEASTAR_TEST_CASE(test_update_cache_for_dropped_role) {
|
||||
return do_with_cql_env([](cql_test_env& env) -> future<> {
|
||||
co_await env.execute_cql("CREATE ROLE to_drop WITH LOGIN = false AND PASSWORD = 'x'");
|
||||
|
||||
auto& cache = env.auth_cache().local();
|
||||
co_await cache.load_all();
|
||||
|
||||
BOOST_REQUIRE(cache.get("to_drop"));
|
||||
|
||||
co_await env.execute_cql("DROP ROLE to_drop");
|
||||
co_await cache.load_roles({"to_drop"});
|
||||
|
||||
BOOST_REQUIRE(!cache.get("to_drop"));
|
||||
}, auth_on_config());
|
||||
}
|
||||
|
||||
// Tests whether altering the base role's permissions refreshes
// all descendant roles' permissions, and whether dropping the
// base role also drops the permissions inherited from it by descendants.
|
||||
SEASTAR_TEST_CASE(test_three_level_role_hierarchy) {
|
||||
return do_with_cql_env([](cql_test_env& env) -> future<> {
|
||||
co_await env.execute_cql("CREATE TABLE ks.tbl_a (pk int PRIMARY KEY, v int)");
|
||||
co_await env.execute_cql("CREATE TABLE ks.tbl_b (pk int PRIMARY KEY, v int)");
|
||||
|
||||
co_await env.execute_cql("CREATE ROLE r_base WITH LOGIN = false");
|
||||
co_await env.execute_cql("CREATE ROLE r_mid WITH LOGIN = false");
|
||||
co_await env.execute_cql("CREATE ROLE r_bottom WITH LOGIN = true AND PASSWORD = 'x'");
|
||||
|
||||
co_await env.execute_cql("GRANT r_base TO r_mid");
|
||||
co_await env.execute_cql("GRANT r_mid TO r_bottom");
|
||||
|
||||
co_await env.execute_cql("GRANT SELECT ON ks.tbl_a TO r_base");
|
||||
co_await env.execute_cql("GRANT MODIFY ON ks.tbl_b TO r_mid");
|
||||
co_await env.execute_cql("GRANT CREATE ON KEYSPACE ks TO r_bottom");
|
||||
co_await env.execute_cql("GRANT MODIFY ON ks.tbl_a TO r_bottom");
|
||||
|
||||
auto& cache = env.auth_cache().local();
|
||||
co_await cache.load_all();
|
||||
|
||||
auto base = auth::role_or_anonymous("r_base");
|
||||
auto mid = auth::role_or_anonymous("r_mid");
|
||||
auto bottom = auth::role_or_anonymous("r_bottom");
|
||||
auto res_tbl_a = auth::make_data_resource("ks", "tbl_a");
|
||||
auto res_tbl_b = auth::make_data_resource("ks", "tbl_b");
|
||||
|
||||
// Initial check of relationships.
|
||||
BOOST_REQUIRE(cache.get("r_base")->member_of.empty());
|
||||
BOOST_REQUIRE(cache.get("r_mid")->member_of.contains("r_base"));
|
||||
BOOST_REQUIRE(cache.get("r_bottom")->member_of.contains("r_mid"));
|
||||
|
||||
// Update base permissions.
|
||||
co_await env.execute_cql("REVOKE SELECT ON ks.tbl_a FROM r_base");
|
||||
co_await env.execute_cql("GRANT ALTER ON ks.tbl_a TO r_base");
|
||||
co_await cache.load_roles({"r_base"});
|
||||
|
||||
auto perms_mid_a = co_await cache.get_permissions(mid, res_tbl_a);
|
||||
BOOST_REQUIRE(!perms_mid_a.contains(auth::permission::SELECT));
|
||||
BOOST_REQUIRE(perms_mid_a.contains(auth::permission::ALTER));
|
||||
|
||||
auto perms_bottom_a = co_await cache.get_permissions(bottom, res_tbl_a);
|
||||
BOOST_REQUIRE(!perms_bottom_a.contains(auth::permission::SELECT));
|
||||
BOOST_REQUIRE(perms_bottom_a.contains(auth::permission::ALTER));
|
||||
|
||||
// Drop base role.
|
||||
co_await env.execute_cql("DROP ROLE r_base");
|
||||
co_await cache.load_roles({"r_base"});
|
||||
|
||||
perms_mid_a = co_await cache.get_permissions(mid, res_tbl_a);
|
||||
BOOST_REQUIRE_EQUAL(perms_mid_a.mask(), auth::permissions::NONE.mask());
|
||||
|
||||
perms_bottom_a = co_await cache.get_permissions(bottom, res_tbl_a);
|
||||
auth::permission_set expected;
|
||||
expected.set(auth::permission::MODIFY);
|
||||
BOOST_REQUIRE(perms_bottom_a.mask() == expected.mask());
|
||||
}, auth_on_config());
|
||||
}
|
||||
|
||||
// Tests whether dropping a resource properly removes the permissions granted on it.
|
||||
SEASTAR_TEST_CASE(test_invalidate_permissions_via_drop) {
|
||||
return do_with_cql_env([](cql_test_env& env) -> future<> {
|
||||
co_await env.execute_cql("CREATE TABLE ks.t (pk int PRIMARY KEY)");
|
||||
co_await env.execute_cql("CREATE ROLE user");
|
||||
co_await env.execute_cql("GRANT SELECT ON ks.t TO user");
|
||||
|
||||
auto& cache = env.auth_cache().local();
|
||||
co_await cache.load_all();
|
||||
|
||||
auto r = auth::make_data_resource("ks", "t");
|
||||
auto role = auth::role_or_anonymous("user");
|
||||
|
||||
auto perms_before = co_await cache.get_permissions(role, r);
|
||||
BOOST_REQUIRE(perms_before.contains(auth::permission::SELECT));
|
||||
|
||||
co_await env.execute_cql("DROP TABLE ks.t");
|
||||
co_await cache.prune(r);
|
||||
|
||||
auto perms_after = co_await cache.get_permissions(role, r);
|
||||
BOOST_REQUIRE(!perms_after.contains(auth::permission::SELECT));
|
||||
}, auth_on_config());
|
||||
}
|
||||
|
||||
// Checks that permissions are not accidentally granted to the anonymous role.
|
||||
SEASTAR_TEST_CASE(test_anonymous_not_granted_other_roles_permissions) {
|
||||
return do_with_cql_env([](cql_test_env& env) -> future<> {
|
||||
co_await env.execute_cql("CREATE TABLE ks.t (pk int PRIMARY KEY)");
|
||||
co_await env.execute_cql("CREATE ROLE granted_role");
|
||||
co_await env.execute_cql("GRANT SELECT ON ks.t TO granted_role");
|
||||
|
||||
auto& cache = env.auth_cache().local();
|
||||
co_await cache.load_all();
|
||||
|
||||
auto r = auth::make_data_resource("ks", "t");
|
||||
|
||||
auto role_perms = co_await cache.get_permissions(auth::role_or_anonymous("granted_role"), r);
|
||||
BOOST_REQUIRE(role_perms.contains(auth::permission::SELECT));
|
||||
|
||||
auto anon_perms = co_await cache.get_permissions(auth::role_or_anonymous(), r);
|
||||
BOOST_REQUIRE(!anon_perms.contains(auth::permission::SELECT));
|
||||
}, auth_on_config());
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_SUITE_END()
|
||||
@@ -1650,6 +1650,21 @@ future<> handle_resize_finalize(cql_test_env& e, group0_guard& guard, const migr
|
||||
}
|
||||
}
|
||||
|
||||
static future<> apply_repair_transitions(token_metadata& tm, const migration_plan& plan) {
|
||||
for (const auto& repair : plan.repair_plan().repairs()) {
|
||||
co_await tm.tablets().mutate_tablet_map_async(repair.table, [&] (tablet_map& tmap) {
|
||||
auto tablet_info = tmap.get_tablet_info(repair.tablet);
|
||||
tmap.set_tablet_transition_info(repair.tablet, tablet_transition_info{
|
||||
tablet_transition_stage::repair,
|
||||
tablet_transition_kind::repair,
|
||||
tablet_info.replicas,
|
||||
std::nullopt,
|
||||
});
|
||||
return make_ready_future();
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Reflects the plan in a given token metadata as if the migrations were fully executed.
|
||||
static
|
||||
future<> apply_plan(token_metadata& tm, const migration_plan& plan, service::topology& topology, shared_load_stats* load_stats) {
|
||||
@@ -1674,6 +1689,7 @@ future<> apply_plan(token_metadata& tm, const migration_plan& plan, service::top
|
||||
if (auto request_id = plan.rack_list_colocation_plan().request_to_resume(); request_id) {
|
||||
topology.paused_rf_change_requests.erase(request_id);
|
||||
}
|
||||
co_await apply_repair_transitions(tm, plan);
|
||||
}
|
||||
|
||||
// Reflects the plan in a given token metadata as if the migrations were started but not yet executed.
|
||||
@@ -5995,4 +6011,168 @@ SEASTAR_THREAD_TEST_CASE(test_tablets_describe_ring) {
|
||||
}, cfg).get();
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_tablet_auto_repair_rf1) {
|
||||
cql_test_config cfg_in;
|
||||
cfg_in.db_config->auto_repair_enabled_default(true);
|
||||
cfg_in.db_config->auto_repair_threshold_default_in_seconds(1);
|
||||
do_with_cql_env_thread([] (auto& e) {
|
||||
topology_builder topo(e);
|
||||
|
||||
unsigned shard_count = 1;
|
||||
auto dc1 = topo.dc();
|
||||
auto rack1 = topo.rack();
|
||||
[[maybe_unused]] auto host1 = topo.add_node(node_state::normal, shard_count);
|
||||
auto rack2 = topo.start_new_rack();
|
||||
[[maybe_unused]] auto host2 = topo.add_node(node_state::normal, shard_count);
|
||||
|
||||
auto ks_name = add_keyspace(e, {{dc1, 1}}, 1);
|
||||
auto table1 = add_table(e, ks_name).get();
|
||||
|
||||
tablet_id tablet{0};
|
||||
mutate_tablets(e, [&] (tablet_metadata& tmeta) -> future<> {
|
||||
tablet_map tmap(1);
|
||||
auto tid = tmap.first_tablet();
|
||||
tablet = tid;
|
||||
tmap.set_tablet(tid, tablet_info {
|
||||
tablet_replica_set {
|
||||
tablet_replica{host1, 0},
|
||||
}
|
||||
});
|
||||
tmeta.set_tablet_map(table1, std::move(tmap));
|
||||
co_return;
|
||||
});
|
||||
|
||||
auto& stm = e.shared_token_metadata().local();
|
||||
bool once = false;
|
||||
rebalance_tablets(e, nullptr, {}, [&once] (const migration_plan& plan) { return std::exchange(once, true); });
|
||||
BOOST_REQUIRE(stm.get()->tablets().get_tablet_map(table1).get_tablet_transition_info(tablet) == nullptr);
|
||||
}, std::move(cfg_in)).get();
|
||||
}
|
||||
|
||||
void run_tablet_manual_repair_rf1(cql_test_env& e) {
|
||||
topology_builder topo(e);
|
||||
|
||||
unsigned shard_count = 1;
|
||||
auto dc1 = topo.dc();
|
||||
auto rack1 = topo.rack();
|
||||
[[maybe_unused]] auto host1 = topo.add_node(node_state::normal, shard_count);
|
||||
auto rack2 = topo.start_new_rack();
|
||||
[[maybe_unused]] auto host2 = topo.add_node(node_state::normal, shard_count);
|
||||
|
||||
auto ks_name = add_keyspace(e, {{dc1, 1}}, 1);
|
||||
auto table1 = add_table(e, ks_name).get();
|
||||
|
||||
tablet_id tablet{0};
|
||||
mutate_tablets(e, [&] (tablet_metadata& tmeta) -> future<> {
|
||||
tablet_map tmap(1);
|
||||
auto tid = tmap.first_tablet();
|
||||
tablet = tid;
|
||||
tablet_info ti{
|
||||
tablet_replica_set {
|
||||
tablet_replica{host1, 0},
|
||||
}
|
||||
};
|
||||
ti.repair_task_info = ti.repair_task_info.make_user_repair_request();
|
||||
tmap.set_tablet(tid, std::move(ti));
|
||||
tmeta.set_tablet_map(table1, std::move(tmap));
|
||||
co_return;
|
||||
});
|
||||
|
||||
auto& stm = e.shared_token_metadata().local();
|
||||
bool once = false;
|
||||
rebalance_tablets(e, nullptr, {}, [&once] (const migration_plan& plan) { return std::exchange(once, true); });
|
||||
BOOST_REQUIRE(stm.get()->tablets().get_tablet_map(table1).get_tablet_transition_info(tablet)->transition == tablet_transition_kind::repair);
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_tablet_manual_repair_rf1_auto_repair_off) {
|
||||
cql_test_config cfg_in;
|
||||
cfg_in.db_config->auto_repair_enabled_default(false);
|
||||
do_with_cql_env_thread(run_tablet_manual_repair_rf1, std::move(cfg_in)).get();
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_tablet_manual_repair_rf1_auto_repair_on) {
|
||||
cql_test_config cfg_in;
|
||||
cfg_in.db_config->auto_repair_enabled_default(true);
|
||||
do_with_cql_env_thread(run_tablet_manual_repair_rf1, std::move(cfg_in)).get();
|
||||
}
|
||||
|
||||
// Test for tablet_map::get_secondary_replica() and specifically how it
|
||||
// relates to get_primary_replica().
|
||||
// We never officially documented, given a list of replicas, which replica
// is to be considered the "primary" - it's not simply the first replica in
// the list but the first in some reshuffling of the list, and the details
// of that reshuffling changed in commits like 817fdad and d88036d. So this patch doesn't
|
||||
// enshrine what get_primary_replica() or get_secondary_replica() should
|
||||
// return. It just verifies that get_secondary_replica() returns a *different*
|
||||
// replica than get_primary_replica() if there are 2 or more replicas, or
|
||||
// throws an error when there's just one replica.
|
||||
// Reproduces SCYLLADB-777.
|
||||
SEASTAR_THREAD_TEST_CASE(test_get_secondary_replica) {
|
||||
auto h1 = host_id(utils::UUID_gen::get_time_UUID());
|
||||
auto h2 = host_id(utils::UUID_gen::get_time_UUID());
|
||||
auto h3 = host_id(utils::UUID_gen::get_time_UUID());
|
||||
|
||||
locator::topology::config cfg = {
|
||||
.this_endpoint = inet_address("127.0.0.1"),
|
||||
.this_host_id = h1,
|
||||
.local_dc_rack = endpoint_dc_rack::default_location,
|
||||
};
|
||||
auto topo = locator::topology(cfg);
|
||||
topo.add_or_update_endpoint(h1, endpoint_dc_rack::default_location, node::state::normal);
|
||||
topo.add_or_update_endpoint(h2, endpoint_dc_rack::default_location, node::state::normal);
|
||||
topo.add_or_update_endpoint(h3, endpoint_dc_rack::default_location, node::state::normal);
|
||||
|
||||
// With 1 replica, get_secondary_replica should throw.
|
||||
{
|
||||
tablet_map tmap(1);
|
||||
auto tid = tmap.first_tablet();
|
||||
tmap.set_tablet(tid, tablet_info {
|
||||
tablet_replica_set {
|
||||
tablet_replica {h1, 0},
|
||||
}
|
||||
});
|
||||
BOOST_REQUIRE_THROW(tmap.get_secondary_replica(tid, topo), std::runtime_error);
|
||||
}
|
||||
|
||||
// With 2 replicas, get_secondary_replica should return a different replica
|
||||
// than get_primary_replica for every tablet.
|
||||
{
|
||||
tablet_map tmap(4);
|
||||
for (auto tid : tmap.tablet_ids()) {
|
||||
tmap.set_tablet(tid, tablet_info {
|
||||
tablet_replica_set {
|
||||
tablet_replica {h1, 0},
|
||||
tablet_replica {h2, 0},
|
||||
}
|
||||
});
|
||||
}
|
||||
for (auto tid : tmap.tablet_ids()) {
|
||||
auto primary = tmap.get_primary_replica(tid, topo);
|
||||
auto secondary = tmap.get_secondary_replica(tid, topo);
|
||||
BOOST_REQUIRE(primary != secondary);
|
||||
}
|
||||
}
|
||||
|
||||
// With 3 replicas, same check.
|
||||
{
|
||||
tablet_map tmap(4);
|
||||
for (auto tid : tmap.tablet_ids()) {
|
||||
tmap.set_tablet(tid, tablet_info {
|
||||
tablet_replica_set {
|
||||
tablet_replica {h1, 0},
|
||||
tablet_replica {h2, 0},
|
||||
tablet_replica {h3, 0},
|
||||
}
|
||||
});
|
||||
}
|
||||
for (auto tid : tmap.tablet_ids()) {
|
||||
auto primary = tmap.get_primary_replica(tid, topo);
|
||||
auto secondary = tmap.get_secondary_replica(tid, topo);
|
||||
BOOST_REQUIRE(primary != secondary);
|
||||
}
|
||||
}
|
||||
|
||||
topo.clear_gently().get();
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_SUITE_END()
|
||||
|
||||
90
test/cluster/auth_cluster/test_auth_cache_metrics.py
Normal file
@@ -0,0 +1,90 @@
|
||||
#
|
||||
# Copyright (C) 2026-present ScyllaDB
|
||||
#
|
||||
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
#
|
||||
import pytest
|
||||
import logging
|
||||
|
||||
from test.pylib.manager_client import ManagerClient
|
||||
from test.cluster.auth_cluster import extra_scylla_config_options as auth_config
|
||||
from cassandra.auth import PlainTextAuthProvider
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_auth_cache_metrics(manager: ManagerClient):
|
||||
"""
|
||||
Verify that auth cache metrics correctly track roles and permissions
|
||||
"""
|
||||
smp = 4
|
||||
servers = await manager.servers_add(1, cmdline=[f'--smp={smp}'], config=auth_config)
|
||||
cql, _ = await manager.get_ready_cql(servers)
|
||||
node = servers[0]
|
||||
|
||||
async def get_metric(node_ip, metric_name, sum_all_shards=False):
|
||||
metrics = await manager.metrics.query(node_ip)
|
||||
shard_zero_val = metrics.get(metric_name, {'shard': '0'})
|
||||
total = 0.0
|
||||
all_vals = [total]
|
||||
for shard in range(smp):
|
||||
shard_val = metrics.get(metric_name, {'shard': str(shard)})
|
||||
if not sum_all_shards:
|
||||
assert shard_val == shard_zero_val, f"metric {metric_name} differs for shard {shard}"
|
||||
else:
|
||||
total += float(shard_val)
|
||||
if sum_all_shards:
|
||||
return total
|
||||
else:
|
||||
return shard_zero_val
|
||||
|
||||
initial_roles = await get_metric(node.ip_addr, "scylla_auth_cache_roles")
|
||||
initial_perms = await get_metric(node.ip_addr, "scylla_auth_cache_permissions", sum_all_shards=True)
|
||||
logger.info(f"Initial metrics - Roles: {initial_roles}, Permissions: {initial_perms}")
|
||||
|
||||
assert initial_roles == 1
|
||||
assert initial_perms == 0
|
||||
|
||||
new_roles_count = 10
|
||||
roles = [f"metric_test_role_{i}" for i in range(new_roles_count)]
|
||||
|
||||
logger.info(f"Creating new roles")
|
||||
for role_name in roles:
|
||||
await cql.run_async(f"CREATE ROLE {role_name} WITH PASSWORD = 'password' AND LOGIN = true")
|
||||
# This should result in adding 2 permissions (1 for each of the 2 resources)
|
||||
await cql.run_async(f"GRANT SELECT ON KEYSPACE system TO {role_name}")
|
||||
await cql.run_async(f"GRANT MODIFY ON ALL KEYSPACES TO {role_name}")
|
||||
|
||||
|
||||
logger.info(f"Log in to each role to lazy cache permissions")
|
||||
for role_name in roles:
|
||||
await manager.driver_connect(auth_provider=PlainTextAuthProvider(username=role_name, password="password"))
|
||||
cql = manager.get_cql()
|
||||
# This loads permissions for system keyspace resource, on a single shard
|
||||
await cql.run_async(f"SELECT * FROM system.roles")
|
||||
|
||||
logger.info(f"Log in back to cassandra")
|
||||
await manager.driver_connect(auth_provider=PlainTextAuthProvider(username="cassandra", password="cassandra"))
|
||||
cql = manager.get_cql()
|
||||
|
||||
current_roles = await get_metric(node.ip_addr, "scylla_auth_cache_roles")
|
||||
current_perms = await get_metric(node.ip_addr, "scylla_auth_cache_permissions", sum_all_shards=True)
|
||||
|
||||
logger.info(f"After addition - Roles: {current_roles}, Permissions: {current_perms}")
|
||||
|
||||
cassandra_perms_count = 2 # Creating roles causes data and data/system resources to be added
|
||||
|
||||
assert current_roles == initial_roles + new_roles_count
|
||||
assert current_perms == initial_perms + 2*new_roles_count + cassandra_perms_count
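# With new_roles_count == 10, initial_perms == 0 (asserted above) and
# cassandra_perms_count == 2, this expects 2*10 + 2 == 22 cached permission entries.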
|
||||
|
||||
logger.info(f"Dropping created roles")
|
||||
for role_name in roles:
|
||||
await cql.run_async(f"DROP ROLE {role_name}")
|
||||
|
||||
final_roles = await get_metric(node.ip_addr, "scylla_auth_cache_roles")
|
||||
final_perms = await get_metric(node.ip_addr, "scylla_auth_cache_permissions", sum_all_shards=True)
|
||||
|
||||
logger.info(f"After cleanup - Roles: {final_roles}, Permissions: {final_perms}")
|
||||
|
||||
assert final_roles == initial_roles
|
||||
assert final_perms == initial_perms
|
||||
92
test/cluster/auth_cluster/test_startup_response.py
Normal file
@@ -0,0 +1,92 @@
|
||||
#
|
||||
# Copyright (C) 2026-present ScyllaDB
|
||||
#
|
||||
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
#
|
||||
|
||||
import pytest
|
||||
import logging
|
||||
import sys
|
||||
import asyncio
|
||||
import concurrent.futures
|
||||
import time
|
||||
from unittest import mock
|
||||
from cassandra.cluster import Cluster, DefaultConnection, NoHostAvailable
|
||||
from cassandra import connection
|
||||
from cassandra.auth import PlainTextAuthProvider
|
||||
from test.pylib.manager_client import ManagerClient
|
||||
from test.cluster.auth_cluster import extra_scylla_config_options as auth_config
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_startup_no_auth_response(manager: ManagerClient, build_mode):
|
||||
"""
|
||||
Test behavior when client hangs on startup auth response.
|
||||
This is stressing uninitialized_connections_semaphore_cpu_concurrency
|
||||
switching between CPU and Network states (1 or 0 semaphore units taken
|
||||
per connection).
|
||||
The test is probabilistic in the sense that it triggers the bug reliably
only with a sufficiently large `num_connections`, but empirically the
required number has proven to be very low.
|
||||
"""
|
||||
server = await manager.server_add(config=auth_config)
|
||||
|
||||
# Define a custom connection class that hangs on startup response
|
||||
class NoOpConnection(DefaultConnection):
|
||||
def _handle_startup_response(self, startup_response):
|
||||
pass
|
||||
|
||||
auth_provider = PlainTextAuthProvider(username='cassandra', password='cassandra')
|
||||
|
||||
connections_observed = False
|
||||
num_connections = 500
|
||||
timeout = 180
|
||||
|
||||
def attempt_bad_connection():
|
||||
c = Cluster([server.ip_addr], port=9042, auth_provider=auth_provider, connect_timeout=timeout, connection_class=NoOpConnection)
|
||||
try:
|
||||
c.connect()
|
||||
pytest.fail("Should not connect")
|
||||
except Exception:
|
||||
# We expect failure or timeout
|
||||
pass
|
||||
finally:
|
||||
c.shutdown()
|
||||
|
||||
def attempt_good_connection():
|
||||
nonlocal connections_observed
|
||||
c = Cluster([server.ip_addr], port=9042, auth_provider=auth_provider, connect_timeout=timeout/3)
|
||||
try:
|
||||
session = c.connect()
|
||||
res = session.execute("SELECT COUNT(*) FROM system.clients WHERE connection_stage = 'AUTHENTICATING' ALLOW FILTERING;")
|
||||
count = res[0][0]
|
||||
logging.info(f"Observed {count} AUTHENTICATING connections...")
|
||||
if count >= num_connections/2:
|
||||
connections_observed = True
|
||||
finally:
|
||||
c.shutdown()
|
||||
|
||||
loop = asyncio.get_running_loop()
|
||||
|
||||
logging.info("Attempting concurrent connections with custom hanging connection class...")
|
||||
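# One worker per hanging connection, plus one for the good-connection probe issued by verify_loop().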
executor = concurrent.futures.ThreadPoolExecutor(max_workers=num_connections + 1)
|
||||
|
||||
async def verify_loop():
|
||||
logging.info("Verifying server availability concurrently...")
|
||||
start_time = time.time()
|
||||
while time.time() - start_time < timeout:
|
||||
logging.info(f"Good connection attempt at delta {time.time() - start_time:.2f}s")
|
||||
try:
|
||||
await loop.run_in_executor(executor, attempt_good_connection)
|
||||
except Exception as e:
|
||||
logging.info(f"Good connection attempt failed: {e}")
|
||||
if connections_observed:
|
||||
break
|
||||
await asyncio.sleep(0.1)
|
||||
logging.info("Verification loop completed")
|
||||
|
||||
good_future = asyncio.create_task(verify_loop())
|
||||
bad_futures = [loop.run_in_executor(executor, attempt_bad_connection) for _ in range(num_connections)]
|
||||
|
||||
await good_future
|
||||
executor.shutdown(wait=False, cancel_futures=True)
|
||||
assert connections_observed
|
||||
@@ -782,12 +782,12 @@ class TestCQLAudit(AuditTester):
|
||||
|
||||
def test_audit_type_none(self):
|
||||
"""
|
||||
'audit': None
|
||||
'audit': none
|
||||
CREATE KEYSPACE, USE KEYSPACE, ALTER KEYSPACE, DROP KEYSPACE statements
|
||||
check audit KS not created
|
||||
"""
|
||||
|
||||
audit_settings = {"audit": None, "audit_categories": "ADMIN,AUTH,QUERY,DML,DDL,DCL", "audit_keyspaces": "ks"}
|
||||
audit_settings = {"audit": "none", "audit_categories": "ADMIN,AUTH,QUERY,DML,DDL,DCL", "audit_keyspaces": "ks"}
|
||||
|
||||
session = self.prepare(create_keyspace=False, audit_settings=audit_settings)
|
||||
|
||||
|
||||
@@ -1,102 +0,0 @@
|
||||
#
|
||||
# Copyright (C) 2026-present ScyllaDB
|
||||
#
|
||||
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
#
|
||||
|
||||
import logging
|
||||
|
||||
import pytest
|
||||
from cassandra.cluster import Session
|
||||
from cassandra.protocol import ConfigurationException, InvalidRequest
|
||||
|
||||
from dtest_class import Tester
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def create_ks_and_assert_warning(session, query, ks_name, key_warn_msg_words):
|
||||
ret = session.execute_async(query)
|
||||
_ = ret.result()
|
||||
found = False
|
||||
if len(key_warn_msg_words) > 0:
|
||||
assert len(ret.warnings) >= 1, "Expected RF guardrail warning"
|
||||
for warning in ret.warnings:
|
||||
found = found or all(word in warning.lower() for word in key_warn_msg_words)
|
||||
assert found, "Didn't match all required keywords"
|
||||
session.execute(f"USE {ks_name}")
|
||||
|
||||
|
||||
def assert_creating_ks_fails(session, query, ks_name):
|
||||
with pytest.raises(ConfigurationException):
|
||||
session.execute(query)
|
||||
with pytest.raises(InvalidRequest):
|
||||
session.execute(f"USE {ks_name}")
|
||||
|
||||
|
||||
@pytest.mark.next_gating
|
||||
class TestGuardrails(Tester):
|
||||
def test_default_rf(self):
|
||||
"""
|
||||
As of now, the only RF guardrail enabled is a soft limit checking that RF >= 3. Not complying to this soft limit
|
||||
results in a CQL being executed, but with a warning. Also, whatever the guardrails' values, RF = 0 is always OK.
|
||||
"""
|
||||
cluster = self.cluster
|
||||
|
||||
# FIXME: This test verifies that guardrails work. However, if we set `rf_rack_valid_keyspaces` to true,
|
||||
# we'll get a different error, so let's disable it for now. For more context, see issues:
|
||||
# scylladb/scylladb#23071 and scylladb/scylla-dtest#5633.
|
||||
cluster.set_configuration_options(values={"rf_rack_valid_keyspaces": False})
|
||||
|
||||
cluster.populate([1, 1, 1]).start(wait_other_notice=True)
|
||||
session_dc1: Session = self.patient_cql_connection(cluster.nodelist()[0])
|
||||
|
||||
ks_name = "ks"
|
||||
rf = {"dc1": 2, "dc2": 3, "dc3": 0}
|
||||
query = "CREATE KEYSPACE %s WITH REPLICATION={%s}"
|
||||
options = ", ".join(["'%s':%d" % (dc_value, rf_value) for dc_value, rf_value in rf.items()])
|
||||
query = query % (ks_name, "'class':'NetworkTopologyStrategy', %s" % options)
|
||||
create_ks_and_assert_warning(session_dc1, query, ks_name, ["warn", "min", "replication", "factor", "3", "dc1", "2"])
|
||||
|
||||
def test_all_rf_limits(self):
|
||||
"""
|
||||
There're 4 limits for RF: soft/hard min and soft/hard max limits. Breaking soft limits issues a warning,
|
||||
breaking the hard limits prevents the query from being executed.
|
||||
"""
|
||||
cluster = self.cluster
|
||||
|
||||
MIN_FAIL_THRESHOLD = 2
|
||||
MIN_WARN_THRESHOLD = 3
|
||||
MAX_WARN_THRESHOLD = 4
|
||||
MAX_FAIL_THRESHOLD = 5
|
||||
|
||||
# FIXME: This test verifies that guardrails work. However, if we set `rf_rack_valid_keyspaces` to true,
|
||||
# we'll get a different error, so let's disable it for now. For more context, see issues:
|
||||
# scylladb/scylladb#23071 and scylladb/scylla-dtest#5633.
|
||||
cluster.set_configuration_options(values={"rf_rack_valid_keyspaces": False})
|
||||
|
||||
cluster.set_configuration_options(
|
||||
values={
|
||||
"minimum_replication_factor_fail_threshold": MIN_FAIL_THRESHOLD, "minimum_replication_factor_warn_threshold": MIN_WARN_THRESHOLD, "maximum_replication_factor_warn_threshold": MAX_WARN_THRESHOLD,
|
||||
"maximum_replication_factor_fail_threshold": MAX_FAIL_THRESHOLD
|
||||
}
|
||||
)
|
||||
|
||||
query = "CREATE KEYSPACE %s WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'dc1': %s}"
|
||||
cluster.populate([1]).start()
|
||||
node = cluster.nodelist()[0]
|
||||
session = self.patient_cql_connection(node)
|
||||
|
||||
def test_rf(rf):
|
||||
ks_name = f"ks_{rf}"
|
||||
if rf < MIN_FAIL_THRESHOLD or rf > MAX_FAIL_THRESHOLD:
|
||||
assert_creating_ks_fails(session, query % (ks_name, rf), ks_name)
|
||||
elif rf < MIN_WARN_THRESHOLD:
|
||||
create_ks_and_assert_warning(session, query % (ks_name, rf), ks_name, ["warn", "min", "replication", "factor", str(MIN_WARN_THRESHOLD), "dc1", "2"])
|
||||
elif rf > MAX_WARN_THRESHOLD:
|
||||
create_ks_and_assert_warning(session, query % (ks_name, rf), ks_name, ["warn", "max", "replication", "factor", str(MAX_WARN_THRESHOLD), "dc1", "5"])
|
||||
else:
|
||||
create_ks_and_assert_warning(session, query % (ks_name, rf), ks_name, [])
|
||||
|
||||
for rf in range(MIN_FAIL_THRESHOLD - 1, MAX_FAIL_THRESHOLD + 1):
|
||||
test_rf(rf)
|
||||
@@ -61,8 +61,8 @@ async def test_mv_build_during_shutdown(manager: ManagerClient):
|
||||
# Start building two views. The first is delayed by the injection, and the second
|
||||
# view build is queued, waiting on the view builder semaphore.
|
||||
await manager.api.enable_injection(server.ip_addr, "delay_before_get_view_natural_endpoint", one_shot=True)
|
||||
create_task1 = cql.run_async(f"CREATE materialized view {ks}.t_view1 AS select pk, v from {ks}.t where v is not null primary key (v, pk)")
|
||||
create_task2 = cql.run_async(f"CREATE materialized view {ks}.t_view2 AS select pk, v from {ks}.t where v is not null primary key (v, pk)")
|
||||
await cql.run_async(f"CREATE materialized view {ks}.t_view1 AS select pk, v from {ks}.t where v is not null primary key (v, pk)")
|
||||
await cql.run_async(f"CREATE materialized view {ks}.t_view2 AS select pk, v from {ks}.t where v is not null primary key (v, pk)")
|
||||
|
||||
log = await manager.server_open_log(server.server_id)
|
||||
mark = await log.mark()
|
||||
@@ -80,4 +80,4 @@ async def test_mv_build_during_shutdown(manager: ManagerClient):
|
||||
# For dropping the keyspace
|
||||
await manager.server_start(server.server_id)
|
||||
await reconnect_driver(manager)
|
||||
asyncio.gather(create_task1, create_task2)
|
||||
|
||||
|
||||
@@ -394,7 +394,8 @@ async def test_mv_first_replica_in_dc(manager: ManagerClient, delayed_replica: s
|
||||
@pytest.mark.parametrize("migration_type", ["tablets_internode", "tablets_intranode", "vnodes"])
|
||||
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
|
||||
async def test_mv_write_during_migration(manager: ManagerClient, migration_type: str):
|
||||
cmdline = ['--smp', '2', '--logger-log-level', 'raft_topology=debug']
|
||||
# RF=1 and fast boot options with streaming don't play well together, so force RBNO for bootstrap
|
||||
cmdline = ['--smp', '2', '--logger-log-level', 'raft_topology=debug', "--allowed-repair-based-node-ops", "replace,removenode,rebuild,bootstrap,decommission"]
|
||||
|
||||
servers = await manager.servers_add(3, cmdline=cmdline)
|
||||
cql = manager.get_cql()
|
||||
|
||||
@@ -30,6 +30,11 @@ def format_tuples(tuples=None, **kwargs):
|
||||
return f'{{ {body} }}'
|
||||
|
||||
|
||||
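# Helper used by the tests below: builds the WITH clause (NetworkTopologyStrategy
# replication plus STORAGE options pointing at the given object storage endpoint)
# that gets passed to new_test_keyspace().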
def keyspace_options(object_storage, rf=1):
|
||||
storage_opts = format_tuples(type=f'{object_storage.type}', endpoint=object_storage.address, bucket=object_storage.bucket_name)
|
||||
return f"WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': {rf}}} AND STORAGE = {storage_opts}"
|
||||
|
||||
|
||||
class S3_Server:
|
||||
def __init__(self, tempdir: str, address: str, port: int, acc_key: str, secret_key: str, region: str, bucket_name):
|
||||
self.tempdir = tempdir
|
||||
|
||||
@@ -10,7 +10,6 @@ import pytest
|
||||
import time
|
||||
import random
|
||||
|
||||
from test.cqlpy.util import local_process_id
|
||||
from test.pylib.manager_client import ManagerClient
|
||||
from test.cluster.object_store.conftest import format_tuples
|
||||
from test.cluster.util import wait_for_cql_and_get_hosts, get_replication, new_test_keyspace
|
||||
@@ -55,8 +54,8 @@ async def prepare_snapshot_for_backup(manager: ManagerClient, server, snap_name=
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
|
||||
async def test_simple_backup(manager: ManagerClient, object_storage):
|
||||
@pytest.mark.parametrize("move_files", [False, True])
|
||||
async def test_simple_backup(manager: ManagerClient, object_storage, move_files):
|
||||
'''check that backing up a snapshot for a keyspace works'''
|
||||
|
||||
objconf = object_storage.create_endpoint_conf()
|
||||
@@ -76,7 +75,7 @@ async def test_simple_backup(manager: ManagerClient, object_storage):
|
||||
|
||||
print('Backup snapshot')
|
||||
prefix = f'{cf}/backup'
|
||||
tid = await manager.api.backup(server.ip_addr, ks, cf, 'backup', object_storage.address, object_storage.bucket_name, prefix)
|
||||
tid = await manager.api.backup(server.ip_addr, ks, cf, 'backup', object_storage.address, object_storage.bucket_name, prefix, move_files=move_files)
|
||||
print(f'Started task {tid}')
|
||||
status = await manager.api.get_task_status(server.ip_addr, tid)
|
||||
print(f'Status: {status}, waiting to finish')
|
||||
@@ -84,6 +83,9 @@ async def test_simple_backup(manager: ManagerClient, object_storage):
|
||||
assert (status is not None) and (status['state'] == 'done')
|
||||
assert (status['progress_total'] > 0) and (status['progress_completed'] == status['progress_total'])
|
||||
|
||||
# all components in the "backup" snapshot should have been moved into bucket if move_files
|
||||
assert len(os.listdir(f'{workdir}/data/{ks}/{cf_dir}/snapshots/backup')) == (0 if move_files else len(files))
|
||||
|
||||
objects = set(o.key for o in object_storage.get_resource().Bucket(object_storage.bucket_name).objects.all())
|
||||
for f in files:
|
||||
print(f'Check {f} is in backup')
|
||||
@@ -95,41 +97,6 @@ async def test_simple_backup(manager: ManagerClient, object_storage):
|
||||
assert len(res) == 1 and res[0][1].group(1) == 'strm'
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("move_files", [False, True])
|
||||
async def test_backup_move(manager: ManagerClient, object_storage, move_files):
|
||||
'''check that backing up a snapshot by _moving_ sstable to object storage'''
|
||||
|
||||
objconf = object_storage.create_endpoint_conf()
|
||||
cfg = {'enable_user_defined_functions': False,
|
||||
'object_storage_endpoints': objconf,
|
||||
'experimental_features': ['keyspace-storage-options'],
|
||||
'task_ttl_in_seconds': 300
|
||||
}
|
||||
cmd = ['--logger-log-level', 'snapshots=trace:task_manager=trace:api=info']
|
||||
server = await manager.server_add(config=cfg, cmdline=cmd)
|
||||
ks, cf = await prepare_snapshot_for_backup(manager, server)
|
||||
|
||||
workdir = await manager.server_get_workdir(server.server_id)
|
||||
cf_dir = os.listdir(f'{workdir}/data/{ks}')[0]
|
||||
files = set(os.listdir(f'{workdir}/data/{ks}/{cf_dir}/snapshots/backup'))
|
||||
assert len(files) > 0
|
||||
|
||||
print('Backup snapshot')
|
||||
prefix = f'{cf}/backup'
|
||||
tid = await manager.api.backup(server.ip_addr, ks, cf, 'backup', object_storage.address, object_storage.bucket_name, prefix,
|
||||
move_files=move_files)
|
||||
print(f'Started task {tid}')
|
||||
status = await manager.api.get_task_status(server.ip_addr, tid)
|
||||
print(f'Status: {status}, waiting to finish')
|
||||
status = await manager.api.wait_task(server.ip_addr, tid)
|
||||
assert (status is not None) and (status['state'] == 'done')
|
||||
assert (status['progress_total'] > 0) and (status['progress_completed'] == status['progress_total'])
|
||||
|
||||
# all components in the "backup" snapshot should have been moved into bucket if move_files
|
||||
assert len(os.listdir(f'{workdir}/data/{ks}/{cf_dir}/snapshots/backup')) == 0 if move_files else len(files)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("ne_parameter", [ "endpoint", "bucket", "snapshot" ])
|
||||
async def test_backup_with_non_existing_parameters(manager: ManagerClient, object_storage, ne_parameter):
|
||||
@@ -271,7 +238,9 @@ async def test_backup_is_abortable_in_s3_client(manager: ManagerClient, object_s
|
||||
await do_test_backup_abort(manager, object_storage, breakpoint_name="backup_task_pre_upload", min_files=0, max_files=1)
|
||||
|
||||
|
||||
async def do_test_simple_backup_and_restore(manager: ManagerClient, object_storage, tmpdir, do_encrypt = False, do_abort = False):
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize(("do_encrypt", "do_abort"), [(False, False), (False, True), (True, False)])
|
||||
async def test_simple_backup_and_restore(manager: ManagerClient, object_storage, tmpdir, do_encrypt, do_abort):
|
||||
'''check that restoring from backed up snapshot for a keyspace:table works'''
|
||||
|
||||
objconf = object_storage.create_endpoint_conf()
|
||||
@@ -383,17 +352,6 @@ async def do_test_simple_backup_and_restore(manager: ManagerClient, object_stora
|
||||
post_objects = set(o.key for o in object_storage.get_resource().Bucket(object_storage.bucket_name).objects.filter(Prefix=prefix))
|
||||
assert objects == post_objects
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_simple_backup_and_restore(manager: ManagerClient, object_storage, tmp_path):
|
||||
'''check that restoring from backed up snapshot for a keyspace:table works'''
|
||||
await do_test_simple_backup_and_restore(manager, object_storage, tmp_path, False, False)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_abort_simple_backup_and_restore(manager: ManagerClient, object_storage, tmp_path):
|
||||
'''check that restoring from backed up snapshot for a keyspace:table works'''
|
||||
await do_test_simple_backup_and_restore(manager, object_storage, tmp_path, False, True)
|
||||
|
||||
|
||||
|
||||
async def do_abort_restore(manager: ManagerClient, object_storage):
|
||||
# Define configuration for the servers.
|
||||
@@ -517,12 +475,6 @@ async def test_abort_restore_with_rpc_error(manager: ManagerClient, object_stora
|
||||
await do_abort_restore(manager, object_storage)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
|
||||
async def test_simple_backup_and_restore_with_encryption(manager: ManagerClient, object_storage, tmp_path):
|
||||
'''check that restoring from backed up snapshot for a keyspace:table works'''
|
||||
await do_test_simple_backup_and_restore(manager, object_storage, tmp_path, True, False)
|
||||
|
||||
# Helper class to parametrize the test below
|
||||
class topo:
|
||||
def __init__(self, rf, nodes, racks, dcs):
|
||||
@@ -607,15 +559,7 @@ async def take_snapshot(ks, servers, manager, logger):
|
||||
|
||||
return snap_name,sstables
|
||||
|
||||
async def check_data_is_back(manager, logger, cql, ks, cf, keys, servers, topology, host_ids, scope, primary_replica_only, log_marks, different_min_tablet_count=False):
|
||||
logger.info(f'Check the data is back')
|
||||
|
||||
await check_mutation_replicas(cql, manager, servers, keys, topology, logger, ks, cf, scope, primary_replica_only)
|
||||
|
||||
if different_min_tablet_count:
|
||||
logger.info(f'Skipping streaming directions checks, we restored with a different min_tablet_count, so streaming is not predictable')
|
||||
return
|
||||
|
||||
async def check_streaming_directions(logger, servers, topology, host_ids, scope, primary_replica_only, log_marks):
|
||||
host_ids_per_dc = defaultdict(list)
|
||||
host_ids_per_dc_rack = dict()
|
||||
servers_by_host_id = dict()
|
||||
@@ -790,11 +734,12 @@ async def test_restore_with_streaming_scopes(build_mode: str, manager: ManagerCl
|
||||
cf = 'cf'
|
||||
|
||||
num_keys = 10
|
||||
original_min_tablet_count=5
|
||||
|
||||
scopes = ['rack', 'dc'] if build_mode == 'debug' else ['all', 'dc', 'rack', 'node']
|
||||
restored_min_tablet_counts = [5] if build_mode == 'debug' else [2, 5, 10]
|
||||
restored_min_tablet_counts = [original_min_tablet_count] if build_mode == 'debug' else [2, original_min_tablet_count, 10]
|
||||
|
||||
schema, keys, replication_opts = await create_dataset(manager, ks, cf, topology, logger, num_keys=num_keys, min_tablet_count=5)
|
||||
schema, keys, replication_opts = await create_dataset(manager, ks, cf, topology, logger, num_keys=num_keys, min_tablet_count=original_min_tablet_count)
|
||||
|
||||
# validate replicas assertions hold on fresh dataset
|
||||
await check_mutation_replicas(cql, manager, servers, keys, topology, logger, ks, cf, scope=None, primary_replica_only=False, expected_replicas = None)
|
||||
@@ -824,7 +769,9 @@ async def test_restore_with_streaming_scopes(build_mode: str, manager: ManagerCl
|
||||
|
||||
await do_load_sstables(ks, cf, servers, topology, sstables, scope, manager, logger, prefix=prefix, object_storage=object_storage, primary_replica_only=pro)
|
||||
|
||||
await check_data_is_back(manager, logger, cql, ks, cf, keys, servers, topology, host_ids, scope, primary_replica_only=pro, log_marks=log_marks, different_min_tablet_count=(restored_min_tablet_count != 512))
|
||||
await check_mutation_replicas(cql, manager, servers, keys, topology, logger, ks, cf, scope, primary_replica_only=pro)
|
||||
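# Streaming directions are only predictable when restoring with the original
# min_tablet_count; with a different tablet count the tablet layout changes,
# so skip the streaming-directions check.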
if restored_min_tablet_count == original_min_tablet_count:
|
||||
await check_streaming_directions(logger, servers, topology, host_ids, scope, pro, log_marks)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_restore_with_non_existing_sstable(manager: ManagerClient, object_storage):
|
||||
@@ -866,21 +813,7 @@ async def test_backup_broken_streaming(manager: ManagerClient, s3_storage):
|
||||
|
||||
# Obtain the CQL interface from the manager.
|
||||
cql = manager.get_cql()
|
||||
|
||||
pid = local_process_id(cql)
|
||||
if not pid:
|
||||
pytest.skip("Can't find local Scylla process")
|
||||
# Now that we know the process id, use /proc to find the executable.
|
||||
try:
|
||||
scylla_path = os.readlink(f'/proc/{pid}/exe')
|
||||
except:
|
||||
pytest.skip("Can't find local Scylla executable")
|
||||
# Confirm that this executable is a real tool-providing Scylla by trying
|
||||
# to run it with the "--list-tools" option
|
||||
try:
|
||||
subprocess.check_output([scylla_path, '--list-tools'])
|
||||
except:
|
||||
pytest.skip("Local server isn't Scylla")
|
||||
scylla_path = await manager.server_get_exe(server.server_id)
|
||||
|
||||
async with new_test_keyspace(manager,
|
||||
"WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1}") as keyspace:
|
||||
@@ -946,14 +879,16 @@ async def test_backup_broken_streaming(manager: ManagerClient, s3_storage):
|
||||
await log.wait_for("partially contained SSTables", timeout=10)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_restore_primary_replica_same_rack_scope_rack(manager: ManagerClient, object_storage):
|
||||
'''Check that restoring with primary_replica_only and scope rack streams only to primary replica in the same rack.
|
||||
The test checks that each mutation exists exactly 2 times within the cluster, once in each rack
|
||||
(each restoring node streams to one primary replica in its rack. Without primary_replica_only we'd see 4 replicas, 2 in each rack).
|
||||
The test also checks that the logs of each restoring node shows streaming to a single node, which is the primary replica within the same rack.'''
|
||||
@pytest.mark.parametrize("domain", ['rack', 'dc'])
|
||||
async def test_restore_primary_replica_same_domain(manager: ManagerClient, object_storage, domain):
|
||||
'''Check that restoring with primary_replica_only and domain scope streams only to primary replica in the same domain.
|
||||
The test checks that each mutation exists exactly 2 times within the cluster, once in each domain
|
||||
(each restoring node streams to one primary replica in its domain. Without primary_replica_only we'd see 4 replicas, 2 in each domain).
|
||||
The test also checks that the logs of each restoring node show streaming to a single node, which is the primary replica within the same domain.'''
|
||||
|
||||
topology = topo(rf = 4, nodes = 8, racks = 2, dcs = 1)
|
||||
scope = "rack"
|
||||
dcs = 1 if domain == 'rack' else 2
|
||||
topology = topo(rf = 4, nodes = 8, racks = 2, dcs = dcs)
|
||||
scope = domain
|
||||
ks = 'ks'
|
||||
cf = 'cf'
|
||||
|
||||
@@ -989,27 +924,37 @@ async def test_restore_primary_replica_same_rack_scope_rack(manager: ManagerClie
|
||||
for r in res:
|
||||
nodes_by_operation[r[1].group(1)].append(r[1].group(2))
|
||||
|
||||
scope_nodes = set([ str(host_ids[s.server_id]) for s in servers if s.rack == servers[i].rack ])
|
||||
def same_domain(s1, s2):
|
||||
if domain == 'rack':
|
||||
return s1.rack == s2.rack
|
||||
else:
|
||||
return s1.datacenter == s2.datacenter
|
||||
|
||||
scope_nodes = set([ str(host_ids[s.server_id]) for s in servers if same_domain(s, servers[i]) ])
|
||||
for op, nodes in nodes_by_operation.items():
|
||||
logger.info(f'Operation {op} streamed to nodes {nodes}')
|
||||
assert len(nodes) == 1, "Each streaming operation should stream to exactly one primary replica"
|
||||
assert nodes[0] in scope_nodes, f"Primary replica should be within the scope {scope}"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_restore_primary_replica_different_rack_scope_dc(manager: ManagerClient, object_storage):
|
||||
'''Check that restoring with primary_replica_only and scope dc permits cross-rack streaming.
|
||||
The test checks that each mutation exists exactly 1 time within the cluster, in one of the racks.
|
||||
(each restoring node would pick the same primary replica, one would pick it within its own rack(itself), one would pick it from the other rack.
|
||||
Without primary_replica_only we'd see 2 replicas, 1 in each rack).
|
||||
The test also checks that the logs of each restoring node shows streaming to two nodes because cross-rack streaming is allowed
|
||||
@pytest.mark.parametrize("domain", ['rack', 'dc'])
|
||||
async def test_restore_primary_replica_different_domain(manager: ManagerClient, object_storage, domain):
|
||||
'''Check that restoring with primary_replica_only and wider scope permits cross-domain streaming.
|
||||
The test checks that each mutation exists exactly 1 time within the cluster, in one of the domains.
|
||||
(each restoring node would pick the same primary replica, one would pick it within its own domain(itself), one would pick it from the other domain.
|
||||
Without primary_replica_only we'd see 2 replicas, 1 in each domain).
|
||||
The test also checks that the logs of each restoring node show streaming to two nodes because cross-domain streaming is allowed
|
||||
and eventually one node, depending on tablet_id of mutations, will end up choosing either of the two nodes as primary replica.'''
|
||||
|
||||
topology = topo(rf = 2, nodes = 2, racks = 2, dcs = 1)
|
||||
scope = "dc"
|
||||
dcs = 1 if domain == 'rack' else 2
|
||||
racks = 2 if domain == 'rack' else 1
|
||||
rf = 2 if domain == 'rack' else 1
|
||||
topology = topo(rf = rf, nodes = 2, racks = racks, dcs = dcs)
|
||||
scope = "dc" if domain == 'rack' else "all"
|
||||
ks = 'ks'
|
||||
cf = 'cf'
|
||||
|
||||
servers, host_ids = await create_cluster(topology, True, manager, logger, object_storage)
|
||||
servers, host_ids = await create_cluster(topology, True if domain == 'rack' else False, manager, logger, object_storage)
|
||||
|
||||
await manager.disable_tablet_balancing()
|
||||
cql = manager.get_cql()
|
||||
@@ -1040,101 +985,3 @@ async def test_restore_primary_replica_different_rack_scope_dc(manager: ManagerC
|
||||
streamed_to = set([ r[1].group(1) for r in res ])
|
||||
logger.info(f'{s.ip_addr} {host_ids[s.server_id]} streamed to {streamed_to}')
|
||||
assert len(streamed_to) == 2
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_restore_primary_replica_same_dc_scope_dc(manager: ManagerClient, object_storage):
|
||||
'''Check that restoring with primary_replica_only and scope dc streams only to primary replica in the local dc.
|
||||
The test checks that each mutation exists exactly 2 times within the cluster, once in each dc
|
||||
(each restoring node streams to one primary replica in its dc. Without primary_replica_only we'd see 4 replicas, 2 in each dc).
|
||||
The test also checks that the logs of each restoring node shows streaming to a single node, which is the primary replica within the same dc.'''
|
||||
|
||||
topology = topo(rf = 4, nodes = 8, racks = 2, dcs = 2)
|
||||
scope = "dc"
|
||||
ks = 'ks'
|
||||
cf = 'cf'
|
||||
|
||||
servers, host_ids = await create_cluster(topology, False, manager, logger, object_storage)
|
||||
|
||||
await manager.disable_tablet_balancing()
|
||||
cql = manager.get_cql()
|
||||
|
||||
schema, keys, replication_opts = await create_dataset(manager, ks, cf, topology, logger)
|
||||
|
||||
# validate replicas assertions hold on fresh dataset
|
||||
await check_mutation_replicas(cql, manager, servers, keys, topology, logger, ks, cf)
|
||||
|
||||
snap_name, sstables = await take_snapshot(ks, servers, manager, logger)
|
||||
prefix = f'{cf}/{snap_name}'
|
||||
|
||||
await asyncio.gather(*(do_backup(s, snap_name, prefix, ks, cf, object_storage, manager, logger) for s in servers))
|
||||
|
||||
logger.info(f'Re-initialize keyspace')
|
||||
cql.execute(f'DROP KEYSPACE {ks}')
|
||||
cql.execute((f"CREATE KEYSPACE {ks} WITH REPLICATION = {replication_opts};"))
|
||||
cql.execute(schema)
|
||||
|
||||
await asyncio.gather(*(do_restore_server(manager, logger, ks, cf, s, sstables[s], scope, True, prefix, object_storage) for s in servers))
|
||||
|
||||
await check_mutation_replicas(cql, manager, servers, keys, topology, logger, ks, cf, scope, primary_replica_only=True, expected_replicas=2)
|
||||
|
||||
logger.info(f'Validate streaming directions')
|
||||
for i, s in enumerate(servers):
|
||||
log = await manager.server_open_log(s.server_id)
|
||||
res = await log.grep(r'INFO.*sstables_loader - load_and_stream: ops_uuid=([0-9a-z-]+).*target_node=([0-9a-z-]+),.*num_bytes_sent=([0-9]+)')
|
||||
nodes_by_operation = defaultdict(list)
|
||||
for r in res:
|
||||
nodes_by_operation[r[1].group(1)].append(r[1].group(2))
|
||||
|
||||
scope_nodes = set([ str(host_ids[s.server_id]) for s in servers if s.datacenter == servers[i].datacenter ])
|
||||
for op, nodes in nodes_by_operation.items():
|
||||
logger.info(f'Operation {op} streamed to nodes {nodes}')
|
||||
assert len(nodes) == 1, "Each streaming operation should stream to exactly one primary replica"
|
||||
assert nodes[0] in scope_nodes, f"Primary replica should be within the scope {scope}"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_restore_primary_replica_different_dc_scope_all(manager: ManagerClient, object_storage):
|
||||
'''Check that restoring with primary_replica_only and scope all permits cross-dc streaming.
|
||||
The test checks that each mutation exists exactly 1 time within the cluster, in only one of the dcs.
|
||||
(each restoring node would pick the same primary replica, one would pick it within its own dc(itself), one would pick it from the other dc.
|
||||
Without primary_replica_only, we'd see 2 replicas, 1 in each dc).
|
||||
The test also checks that the logs of each restoring node shows streaming to two nodes because cross-dc streaming is allowed
|
||||
and eventually one node, depending on tablet_id of mutations, will end up choosing either of the two nodes as primary replica.'''
|
||||
|
||||
topology = topo(rf = 1, nodes = 2, racks = 1, dcs = 2)
|
||||
scope = "all"
|
||||
ks = 'ks'
|
||||
cf = 'cf'
|
||||
|
||||
servers, host_ids = await create_cluster(topology, False, manager, logger, object_storage)
|
||||
|
||||
await manager.disable_tablet_balancing()
|
||||
cql = manager.get_cql()
|
||||
|
||||
schema, keys, replication_opts = await create_dataset(manager, ks, cf, topology, logger)
|
||||
|
||||
# validate replicas assertions hold on fresh dataset
|
||||
await check_mutation_replicas(cql, manager, servers, keys, topology, logger, ks, cf, expected_replicas=2)
|
||||
|
||||
snap_name, sstables = await take_snapshot(ks, servers, manager, logger)
|
||||
prefix = f'{cf}/{snap_name}'
|
||||
|
||||
await asyncio.gather(*(do_backup(s, snap_name, prefix, ks, cf, object_storage, manager, logger) for s in servers))
|
||||
|
||||
logger.info(f'Re-initialize keyspace')
|
||||
cql.execute(f'DROP KEYSPACE {ks}')
|
||||
cql.execute((f"CREATE KEYSPACE {ks} WITH REPLICATION = {replication_opts};"))
|
||||
cql.execute(schema)
|
||||
|
||||
r_servers = servers
|
||||
|
||||
await asyncio.gather(*(do_restore_server(manager, logger, ks, cf, s, sstables[s], scope, True, prefix, object_storage) for s in r_servers))
|
||||
|
||||
await check_mutation_replicas(cql, manager, servers, keys, topology, logger, ks, cf, scope, primary_replica_only=True, expected_replicas=1)
|
||||
|
||||
logger.info(f'Validate streaming directions')
|
||||
for i, s in enumerate(r_servers):
|
||||
log = await manager.server_open_log(s.server_id)
|
||||
res = await log.grep(r'INFO.*sstables_loader - load_and_stream:.*target_node=([0-9a-z-]+),.*num_bytes_sent=([0-9]+)')
|
||||
streamed_to = set([ r[1].group(1) for r in res ])
|
||||
logger.info(f'{s.ip_addr} {host_ids[s.server_id]} streamed to {streamed_to}, expected {r_servers}')
|
||||
assert len(streamed_to) == 2
|
||||
|
||||
@@ -11,37 +11,14 @@ from test.pylib.minio_server import MinioServer
|
||||
from cassandra.protocol import ConfigurationException
|
||||
from test.pylib.manager_client import ManagerClient
|
||||
from test.cluster.util import reconnect_driver
|
||||
from test.cluster.object_store.conftest import format_tuples
|
||||
from test.cluster.object_store.conftest import format_tuples, keyspace_options
|
||||
from test.cqlpy.rest_api import scylla_inject_error
|
||||
from test.cluster.test_config import wait_for_config
|
||||
from test.cluster.util import new_test_keyspace
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def create_ks_and_cf(cql, object_storage):
|
||||
ks = 'test_ks'
|
||||
cf = 'test_cf'
|
||||
|
||||
replication_opts = format_tuples({'class': 'NetworkTopologyStrategy',
|
||||
'replication_factor': '1'})
|
||||
storage_opts = format_tuples(type=f'{object_storage.type}',
|
||||
endpoint=object_storage.address,
|
||||
bucket=object_storage.bucket_name)
|
||||
|
||||
cql.execute((f"CREATE KEYSPACE {ks} WITH"
|
||||
f" REPLICATION = {replication_opts} AND STORAGE = {storage_opts};"))
|
||||
cql.execute(f"CREATE TABLE {ks}.{cf} ( name text primary key, value text );")
|
||||
|
||||
rows = [('0', 'zero'),
|
||||
('1', 'one'),
|
||||
('2', 'two')]
|
||||
for row in rows:
|
||||
cql_fmt = "INSERT INTO {}.{} ( name, value ) VALUES ('{}', '{}');"
|
||||
cql.execute(cql_fmt.format(ks, cf, *row))
|
||||
|
||||
return ks, cf
|
||||
|
||||
|
||||
@pytest.mark.parametrize('mode', ['normal', 'encrypted'])
|
||||
@pytest.mark.asyncio
|
||||
async def test_basic(manager: ManagerClient, object_storage, tmp_path, mode):
|
||||
@@ -63,48 +40,50 @@ async def test_basic(manager: ManagerClient, object_storage, tmp_path, mode):
|
||||
cql = manager.get_cql()
|
||||
workdir = await manager.server_get_workdir(server.server_id)
|
||||
print(f'Create keyspace (storage server listening at {object_storage.address})')
|
||||
ks, cf = create_ks_and_cf(cql, object_storage)
|
||||
async with new_test_keyspace(manager, keyspace_options(object_storage)) as ks:
|
||||
await cql.run_async(f"CREATE TABLE {ks}.test (name text PRIMARY KEY, value int);")
|
||||
await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (name, value) VALUES ('{k}', {k});") for k in range(4)])
|
||||
|
||||
assert not os.path.exists(os.path.join(workdir, f'data/{ks}')), "object storage backed keyspace has local directory created"
|
||||
# Sanity check that the path is constructed correctly
|
||||
assert os.path.exists(os.path.join(workdir, 'data/system')), "Datadir is elsewhere"
|
||||
assert not os.path.exists(os.path.join(workdir, f'data/{ks}')), "object storage backed keyspace has local directory created"
|
||||
# Sanity check that the path is constructed correctly
|
||||
assert os.path.exists(os.path.join(workdir, 'data/system')), "Datadir is elsewhere"
|
||||
|
||||
desc = cql.execute(f"DESCRIBE KEYSPACE {ks}").one().create_statement
|
||||
# The storage_opts wraps options with '{ <options> }' while the DESCRIBE
|
||||
# does it like '{<options>}' so strip the corner branches and spaces for check
|
||||
assert f"{{'type': '{object_storage.type}', 'bucket': '{object_storage.bucket_name}', 'endpoint': '{object_storage.address}'}}" in desc, "DESCRIBE generates unexpected storage options"
|
||||
desc = cql.execute(f"DESCRIBE KEYSPACE {ks}").one().create_statement
|
||||
# The storage_opts wraps options with '{ <options> }' while the DESCRIBE
|
||||
# does it like '{<options>}', so compare against the exact braces-and-spaces form used by DESCRIBE
|
||||
assert f"{{'type': '{object_storage.type}', 'bucket': '{object_storage.bucket_name}', 'endpoint': '{object_storage.address}'}}" in desc, "DESCRIBE generates unexpected storage options"
|
||||
|
||||
res = cql.execute(f"SELECT * FROM {ks}.{cf};")
|
||||
rows = {x.name: x.value for x in res}
|
||||
assert len(rows) > 0, 'Test table is empty'
|
||||
res = cql.execute(f"SELECT * FROM {ks}.test;")
|
||||
rows = {x.name: x.value for x in res}
|
||||
assert len(rows) > 0, 'Test table is empty'
|
||||
|
||||
await manager.api.flush_keyspace(server.ip_addr, ks)
|
||||
await manager.api.flush_keyspace(server.ip_addr, ks)
|
||||
|
||||
# Check that the ownership table is populated properly
|
||||
res = cql.execute("SELECT * FROM system.sstables;")
|
||||
tid = cql.execute(f"SELECT id FROM system_schema.tables WHERE keyspace_name = '{ks}' AND table_name = '{cf}'").one()
|
||||
for row in res:
|
||||
assert row.owner == tid.id, \
|
||||
f'Unexpected entry owner in registry: {row.owner}'
|
||||
assert row.status == 'sealed', f'Unexpected entry status in registry: {row.status}'
|
||||
# Check that the ownership table is populated properly
|
||||
res = cql.execute("SELECT * FROM system.sstables;")
|
||||
tid = cql.execute(f"SELECT id FROM system_schema.tables WHERE keyspace_name = '{ks}' AND table_name = 'test'").one()
|
||||
for row in res:
|
||||
assert row.owner == tid.id, \
|
||||
f'Unexpected entry owner in registry: {row.owner}'
|
||||
assert row.status == 'sealed', f'Unexpected entry status in registry: {row.status}'
|
||||
|
||||
print('Restart scylla')
|
||||
await manager.server_restart(server.server_id)
|
||||
cql = await reconnect_driver(manager)
|
||||
print('Restart scylla')
|
||||
await manager.server_restart(server.server_id)
|
||||
cql = await reconnect_driver(manager)
|
||||
|
||||
# Shouldn't be recreated by populator code
|
||||
assert not os.path.exists(os.path.join(workdir, f'data/{ks}')), "object storage backed keyspace has local directory resurrected"
|
||||
# Shouldn't be recreated by populator code
|
||||
assert not os.path.exists(os.path.join(workdir, f'data/{ks}')), "object storage backed keyspace has local directory resurrected"
|
||||
|
||||
res = cql.execute(f"SELECT * FROM {ks}.{cf};")
|
||||
have_res = {x.name: x.value for x in res}
|
||||
assert have_res == rows, f'Unexpected table content: {have_res}'
|
||||
res = cql.execute(f"SELECT * FROM {ks}.test;")
|
||||
have_res = {x.name: x.value for x in res}
|
||||
assert have_res == rows, f'Unexpected table content: {have_res}'
|
||||
|
||||
print('Drop table')
|
||||
cql.execute(f"DROP TABLE {ks}.{cf};")
|
||||
# Check that the ownership table is de-populated
|
||||
res = cql.execute("SELECT * FROM system.sstables;")
|
||||
rows = "\n".join(f"{row.owner} {row.status}" for row in res)
|
||||
assert not rows, 'Unexpected entries in registry'
|
||||
print('Drop table')
|
||||
cql.execute(f"DROP TABLE {ks}.test;")
|
||||
# Check that the ownership table is de-populated
|
||||
res = cql.execute("SELECT * FROM system.sstables;")
|
||||
rows = "\n".join(f"{row.owner} {row.status}" for row in res)
|
||||
assert not rows, 'Unexpected entries in registry'
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_garbage_collect(manager: ManagerClient, object_storage):
|
||||
@@ -116,37 +95,40 @@ async def test_garbage_collect(manager: ManagerClient, object_storage):
|
||||
cfg = {'enable_user_defined_functions': False,
|
||||
'object_storage_endpoints': objconf,
|
||||
'experimental_features': ['keyspace-storage-options']}
|
||||
server = await manager.server_add(config=cfg)
|
||||
cmd = ['--logger-log-level', 's3=trace:http=debug:gcp_storage=trace']
|
||||
server = await manager.server_add(config=cfg, cmdline=cmd)
|
||||
|
||||
cql = manager.get_cql()
|
||||
|
||||
print(f'Create keyspace (storage server listening at {object_storage.address})')
|
||||
ks, cf = create_ks_and_cf(cql, object_storage)
|
||||
async with new_test_keyspace(manager, keyspace_options(object_storage)) as ks:
|
||||
await cql.run_async(f"CREATE TABLE {ks}.test (name text PRIMARY KEY, value int);")
|
||||
await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (name, value) VALUES ('{k}', {k});") for k in range(4)])
|
||||
|
||||
await manager.api.flush_keyspace(server.ip_addr, ks)
|
||||
# Mark the sstables as "removing" to simulate the problem
|
||||
res = cql.execute("SELECT * FROM system.sstables;")
|
||||
for row in res:
|
||||
sstable_entries.append((row.owner, row.generation))
|
||||
print(f'Found entries: {[ str(ent[1]) for ent in sstable_entries ]}')
|
||||
for owner, gen in sstable_entries:
|
||||
cql.execute("UPDATE system.sstables SET status = 'removing'"
|
||||
f" WHERE owner = {owner} AND generation = {gen};")
|
||||
await manager.api.flush_keyspace(server.ip_addr, ks)
|
||||
# Mark the sstables as "removing" to simulate the problem
|
||||
res = cql.execute("SELECT * FROM system.sstables;")
|
||||
for row in res:
|
||||
sstable_entries.append((row.owner, row.generation))
|
||||
print(f'Found entries: {[ str(ent[1]) for ent in sstable_entries ]}')
|
||||
for owner, gen in sstable_entries:
|
||||
cql.execute("UPDATE system.sstables SET status = 'removing'"
|
||||
f" WHERE owner = {owner} AND generation = {gen};")
|
||||
|
||||
print('Restart scylla')
|
||||
await manager.server_restart(server.server_id)
|
||||
cql = await reconnect_driver(manager)
|
||||
print('Restart scylla')
|
||||
await manager.server_restart(server.server_id)
|
||||
cql = await reconnect_driver(manager)
|
||||
|
||||
res = cql.execute(f"SELECT * FROM {ks}.{cf};")
|
||||
have_res = {x.name: x.value for x in res}
|
||||
# Must be empty as no sstables should have been picked up
|
||||
assert not have_res, f'Sstables not cleaned, got {have_res}'
|
||||
# Make sure objects also disappeared
|
||||
objects = object_storage.get_resource().Bucket(object_storage.bucket_name).objects.all()
|
||||
print(f'Found objects: {[ objects ]}')
|
||||
for o in objects:
|
||||
for ent in sstable_entries:
|
||||
assert not o.key.startswith(str(ent[1])), f'Sstable object not cleaned, found {o.key}'
|
||||
res = cql.execute(f"SELECT * FROM {ks}.test;")
|
||||
have_res = {x.name: x.value for x in res}
|
||||
# Must be empty as no sstables should have been picked up
|
||||
assert not have_res, f'Sstables not cleaned, got {have_res}'
|
||||
# Make sure objects also disappeared
|
||||
objects = object_storage.get_resource().Bucket(object_storage.bucket_name).objects.all()
|
||||
print(f'Found objects: {[ objects ]}')
|
||||
for o in objects:
|
||||
for ent in sstable_entries:
|
||||
assert not o.key.startswith(str(ent[1])), f'Sstable object not cleaned, found {o.key}'
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -162,28 +144,30 @@ async def test_populate_from_quarantine(manager: ManagerClient, object_storage):
|
||||
cql = manager.get_cql()
|
||||
|
||||
print(f'Create keyspace (storage server listening at {object_storage.address})')
|
||||
ks, cf = create_ks_and_cf(cql, object_storage)
|
||||
async with new_test_keyspace(manager, keyspace_options(object_storage)) as ks:
|
||||
await cql.run_async(f"CREATE TABLE {ks}.test (name text PRIMARY KEY, value int);")
|
||||
await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (name, value) VALUES ('{k}', {k});") for k in range(4)])
|
||||
|
||||
res = cql.execute(f"SELECT * FROM {ks}.{cf};")
|
||||
rows = {x.name: x.value for x in res}
|
||||
assert len(rows) > 0, 'Test table is empty'
|
||||
res = cql.execute(f"SELECT * FROM {ks}.test;")
|
||||
rows = {x.name: x.value for x in res}
|
||||
assert len(rows) > 0, 'Test table is empty'
|
||||
|
||||
await manager.api.flush_keyspace(server.ip_addr, ks)
|
||||
# Move the sstables into "quarantine"
|
||||
res = cql.execute("SELECT * FROM system.sstables;")
|
||||
assert len(list(res)) > 0, 'No entries in registry'
|
||||
for row in res:
|
||||
cql.execute("UPDATE system.sstables SET state = 'quarantine'"
|
||||
f" WHERE owner = {row.owner} AND generation = {row.generation};")
|
||||
await manager.api.flush_keyspace(server.ip_addr, ks)
|
||||
# Move the sstables into "quarantine"
|
||||
res = cql.execute("SELECT * FROM system.sstables;")
|
||||
assert len(list(res)) > 0, 'No entries in registry'
|
||||
for row in res:
|
||||
cql.execute("UPDATE system.sstables SET state = 'quarantine'"
|
||||
f" WHERE owner = {row.owner} AND generation = {row.generation};")
|
||||
|
||||
print('Restart scylla')
|
||||
await manager.server_restart(server.server_id)
|
||||
cql = await reconnect_driver(manager)
|
||||
print('Restart scylla')
|
||||
await manager.server_restart(server.server_id)
|
||||
cql = await reconnect_driver(manager)
|
||||
|
||||
res = cql.execute(f"SELECT * FROM {ks}.{cf};")
|
||||
have_res = {x.name: x.value for x in res}
|
||||
# Quarantine entries must have been processed normally
|
||||
assert have_res == rows, f'Unexpected table content: {have_res}'
|
||||
res = cql.execute(f"SELECT * FROM {ks}.test;")
|
||||
have_res = {x.name: x.value for x in res}
|
||||
# Quarantine entries must have been processed normally
|
||||
assert have_res == rows, f'Unexpected table content: {have_res}'
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -224,31 +208,34 @@ async def test_memtable_flush_retries(manager: ManagerClient, tmpdir, object_sto
|
||||
cql = manager.get_cql()
|
||||
print(f'Create keyspace (storage server listening at {object_storage.address})')
|
||||
|
||||
ks, cf = create_ks_and_cf(cql, object_storage)
|
||||
res = cql.execute(f"SELECT * FROM {ks}.{cf};")
|
||||
rows = {x.name: x.value for x in res}
|
||||
async with new_test_keyspace(manager, keyspace_options(object_storage)) as ks:
|
||||
await cql.run_async(f"CREATE TABLE {ks}.test (name text PRIMARY KEY, value int);")
|
||||
await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (name, value) VALUES ('{k}', {k});") for k in range(4)])
|
||||
|
||||
with scylla_inject_error(cql, "s3_client_fail_authorization"):
|
||||
print(f'Flush keyspace')
|
||||
flush = asyncio.create_task(manager.api.flush_keyspace(server.ip_addr, ks))
|
||||
print(f'Wait few seconds')
|
||||
await asyncio.sleep(8)
|
||||
res = cql.execute(f"SELECT * FROM {ks}.test;")
|
||||
rows = {x.name: x.value for x in res}
|
||||
|
||||
print(f'Wait for flush to finish')
|
||||
await flush
|
||||
with scylla_inject_error(cql, "s3_client_fail_authorization"):
|
||||
print(f'Flush keyspace')
|
||||
flush = asyncio.create_task(manager.api.flush_keyspace(server.ip_addr, ks))
|
||||
print(f'Wait few seconds')
|
||||
await asyncio.sleep(8)
|
||||
|
||||
print(f'Check the sstables table')
|
||||
res = cql.execute("SELECT * FROM system.sstables;")
|
||||
ssts = "\n".join(f"{row.owner} {row.generation} {row.status}" for row in res)
|
||||
print(f'sstables:\n{ssts}')
|
||||
print(f'Wait for flush to finish')
|
||||
await flush
|
||||
|
||||
print('Restart scylla')
|
||||
await manager.server_restart(server.server_id)
|
||||
cql = await reconnect_driver(manager)
|
||||
print(f'Check the sstables table')
|
||||
res = cql.execute("SELECT * FROM system.sstables;")
|
||||
ssts = "\n".join(f"{row.owner} {row.generation} {row.status}" for row in res)
|
||||
print(f'sstables:\n{ssts}')
|
||||
|
||||
res = cql.execute(f"SELECT * FROM {ks}.{cf};")
|
||||
have_res = { x.name: x.value for x in res }
|
||||
assert have_res == dict(rows), f'Unexpected table content: {have_res}'
|
||||
print('Restart scylla')
|
||||
await manager.server_restart(server.server_id)
|
||||
cql = await reconnect_driver(manager)
|
||||
|
||||
res = cql.execute(f"SELECT * FROM {ks}.test;")
|
||||
have_res = { x.name: x.value for x in res }
|
||||
assert have_res == dict(rows), f'Unexpected table content: {have_res}'
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize('config_with_full_url', [True, False])
|
||||
|
||||
@@ -15,8 +15,8 @@ from typing import Callable
|
||||
from contextlib import asynccontextmanager, contextmanager
|
||||
from dataclasses import dataclass
|
||||
|
||||
from test.cluster.conftest import PHASE_REPORT_KEY
|
||||
from test.pylib.manager_client import ManagerClient
|
||||
from test.cluster.conftest import *
|
||||
from test.pylib.util import gather_safely
|
||||
|
||||
|
||||
@@ -19,7 +19,7 @@ from test.cluster.util import get_topology_coordinator, find_server_by_host_id,
|
||||
from test.pylib.manager_client import ManagerClient
|
||||
from test.pylib.tablets import get_tablet_count
|
||||
from test.pylib.util import Host
|
||||
from test.storage.conftest import space_limited_servers
|
||||
from test.cluster.storage.conftest import space_limited_servers
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -47,6 +47,5 @@ run_in_dev:
|
||||
- dtest/commitlog_test
|
||||
- dtest/cfid_test
|
||||
- dtest/rebuild_test
|
||||
- dtest/guardrails_test
|
||||
run_in_debug:
|
||||
- random_failures/test_random_failures
|
||||
|
||||
@@ -3,14 +3,18 @@
|
||||
#
|
||||
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
#
|
||||
from time import time
|
||||
|
||||
from test.cluster.tasks.task_manager_types import TaskID, TaskStats, TaskStatus
|
||||
from test.cluster.tasks.task_manager_types import TaskID, TaskStats, TaskStatus, State
|
||||
from test.pylib.internal_types import IPAddress
|
||||
from test.pylib.rest_client import ScyllaRESTAPIClient
|
||||
|
||||
import asyncio
|
||||
from typing import Optional
|
||||
|
||||
from test.pylib.util import wait_for
|
||||
|
||||
|
||||
class TaskManagerClient():
|
||||
"""Async Task Manager client"""
|
||||
|
||||
@@ -36,6 +40,27 @@ class TaskManagerClient():
|
||||
assert(type(stats_list) == list)
|
||||
return [TaskStats(**stats_dict) for stats_dict in stats_list]
|
||||
|
||||
async def wait_task_appears(self, node_ip: IPAddress, module_name: str,
|
||||
task_type: Optional[str] = None,
|
||||
entity: Optional[str] = None,
|
||||
deadline: Optional[float] = None) -> TaskStats:
|
||||
"""
|
||||
Waits for a task to appear in "running" state based on the specified task filter.
|
||||
A task matches the filter if all of its fields match the specified attributes.
|
||||
Throws an exception if no such task appears before the deadline.
|
||||
|
||||
:return: stats of the first task matching the filter.
|
||||
"""
|
||||
async def get_tasks():
|
||||
tasks = await self.list_tasks(node_ip, module_name)
|
||||
for stats in tasks:
|
||||
if stats.state == State.running and \
|
||||
(task_type is None or stats.type == task_type) and \
|
||||
(entity is None or stats.entity == entity):
|
||||
return stats
|
||||
return None
|
||||
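# Poll roughly every 0.1s (backing off up to 1s between attempts) until a
# matching running task appears or the deadline passes (default: 60 seconds from now).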
return await wait_for(get_tasks, deadline or (time() + 60), period=0.1, backoff_factor=1.2, max_period=1)
|
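# Illustrative usage sketch (an assumption, not part of this change): a test that
# kicks off a long-running operation could block until its task shows up, e.g.
#
#     stats = await tm.wait_task_appears(server.ip_addr, "tablets", task_type="user_repair")
#
# where `tm` is a TaskManagerClient and the module/type names are made up for the example.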
||||
|
||||
async def get_task_status(self, node_ip: IPAddress, task_id: TaskID) -> TaskStatus:
|
||||
"""Get status of one task."""
|
||||
status = await self.api.client.get_json(f"/task_manager/task_status/{task_id}", host=node_ip)
|
||||
|
||||
@@ -183,6 +183,72 @@ async def test_alternator_ttl_scheduling_group(manager: ManagerClient):
|
||||
|
||||
table.delete()
|
||||
|
||||
@pytest.mark.parametrize("with_down_node", [False, True], ids=["all_nodes_up", "one_node_down"])
|
||||
async def test_alternator_ttl_multinode_expiration(manager: ManagerClient, with_down_node):
|
||||
"""When the cluster has multiple nodes, different nodes are responsible
|
||||
for checking expiration in different token ranges - each node is
|
||||
responsible for its "primary ranges". Let's check that this expiration
|
||||
really does happen - for the entire token range - by writing many
|
||||
partitions that will span the entire token range, and seeing that they
|
||||
all expire. We don't check that nodes don't do more work than they
|
||||
should - an inefficient implementation where every node scans the
|
||||
entire data set will also pass this test.
|
||||
When the test is run a second time with with_down_node=True, we verify
|
||||
that TTL expiration works correctly even when one of the nodes is
|
||||
brought down. This node's TTL scanner is responsible for scanning part
|
||||
of the token range, so when this node is down, part of the data might
|
||||
not get expired. At that point - other node(s) should take over
|
||||
expiring data in that range - and this test verifies that this indeed
|
||||
happens. Reproduces issue #9787 and SCYLLADB-777.
|
||||
"""
|
||||
servers = await manager.servers_add(3, config=alternator_config, auto_rack_dc='dc1')
|
||||
alternator = get_alternator(servers[0].ip_addr)
|
||||
|
||||
if with_down_node:
|
||||
# Bring down one of nodes. Everything we do below, like creating a
|
||||
# table, reading and writing, should continue to work with one node
|
||||
# down.
|
||||
await manager.server_stop_gracefully(servers[2].server_id)
|
||||
|
||||
table = alternator.create_table(TableName=unique_table_name(),
|
||||
BillingMode='PAY_PER_REQUEST',
|
||||
KeySchema=[
|
||||
{'AttributeName': 'p', 'KeyType': 'HASH' },
|
||||
],
|
||||
AttributeDefinitions=[
|
||||
{'AttributeName': 'p', 'AttributeType': 'N' },
|
||||
])
|
||||
# Set the "expiration" column to mark item's expiration time
|
||||
table.meta.client.update_time_to_live(TableName=table.name, TimeToLiveSpecification={'AttributeName': 'expiration', 'Enabled': True})
|
||||
|
||||
# Insert 50 rows, in different partitions, so the murmur3 hash maps them
|
||||
# all over the token space so different nodes would be responsible for
|
||||
# expiring them. All items are marked to expire 10 seconds in the past,
|
||||
# so should all expire as soon as possible, during this test.
|
||||
expiration = int(time.time()) - 10
|
||||
with table.batch_writer() as batch:
|
||||
for p in range(50):
|
||||
batch.put_item({'p': p, 'expiration': expiration})
|
||||
# Expect that after a short delay, all items in the table will have
|
||||
# expired - so a scan should return no responses. This should happen
|
||||
# even though one of the nodes is down and not doing its usual
|
||||
# expiration-scanning work.
|
||||
timeout = time.time() + 60
|
||||
items = -1
|
||||
while items != 0 and time.time() < timeout:
|
||||
response = table.scan(ConsistentRead=True)
|
||||
items = len(response['Items'])
|
||||
# In theory (though probably not in practice in this test), a scan()
|
||||
# can return zero items but have more pages - so we need to be more
|
||||
# diligent and scan all pages to check it's completely empty.
|
||||
while items == 0 and 'LastEvaluatedKey' in response:
|
||||
response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'], ConsistentRead=True)
|
||||
items += len(response['Items'])
|
||||
if items == 0:
|
||||
break
|
||||
time.sleep(0.1)
|
||||
assert items == 0
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_localnodes_broadcast_rpc_address(manager: ManagerClient):
|
||||
"""Test that if the "broadcast_rpc_address" of a node is set, the
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
#
|
||||
from test.pylib.manager_client import ManagerClient
|
||||
from test.cluster.util import new_test_keyspace
|
||||
from test.cluster.util import new_test_keyspace, get_topology_version
|
||||
from cassandra import WriteFailure
|
||||
import pytest
|
||||
import logging
|
||||
@@ -132,10 +132,7 @@ async def test_cleanup_waits_for_stale_writes(manager: ManagerClient):
|
||||
logger.info("Trigger topology_coordinator/write_both_read_new/after_barrier")
|
||||
await manager.api.message_injection(servers[0].ip_addr, "topology_coordinator/write_both_read_new/after_barrier")
|
||||
await bootstrap_task
|
||||
rows = await cql.run_async(
|
||||
"select version from system.topology where key = 'topology'",
|
||||
host=hosts[0])
|
||||
version_after_node2_bootstrap = rows[0].version
|
||||
version_after_node2_bootstrap = await get_topology_version(cql, hosts[0])
|
||||
host1_id = await manager.get_host_id(servers[1].server_id)
|
||||
|
||||
# Have a cleanup started by decommission and failed on global barrier wait for the stale write
|
||||
|
||||
@@ -14,11 +14,5 @@ async def test_config_live_updates(manager):
|
||||
server = await manager.server_add(config=config)
|
||||
server_log = await manager.server_open_log(server.server_id)
|
||||
|
||||
await manager.server_update_config(server.server_id, "permissions_validity_in_ms", 20000)
|
||||
await server_log.wait_for("Updating loading cache; max_size: 1000, expiry: 20000ms, refresh: 100ms")
|
||||
|
||||
await manager.server_update_config(server.server_id, "permissions_update_interval_in_ms", 30000)
|
||||
await server_log.wait_for("Updating loading cache; max_size: 1000, expiry: 20000ms, refresh: 30000ms")
|
||||
|
||||
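# The default is 8 (per the expected log line below), so raising it to 16 should be applied live and logged.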
await manager.server_update_config(server.server_id, "uninitialized_connections_semaphore_cpu_concurrency", 16)
|
||||
await server_log.wait_for("Updating uninitialized_connections_semaphore_cpu_concurrency from 8 to 16 due to config update")
|
||||
|
||||

@@ -31,5 +31,14 @@ async def test_different_group0_ids(manager: ManagerClient):
await manager.server_stop(scylla_b.server_id)
await manager.server_start(scylla_b.server_id, seeds=[scylla_a.ip_addr])

# Since scylla_a and scylla_b have different group0 IDs and didn't join each other,
# they are separate clusters. We need to set audit keyspace RF=0 on scylla_b (the node
# being decommissioned) to prevent its audit replicas from interfering with the expected
# "zero replica after the removal" error from the repair service.
cql_b = await manager.get_cql_exclusive(scylla_b)
result_b = await cql_b.run_async("SELECT * FROM system_schema.keyspaces WHERE keyspace_name = 'audit'")
if result_b:
await cql_b.run_async("DROP KEYSPACE audit")

log_file_a = await manager.server_open_log(scylla_a.server_id)
await log_file_a.wait_for(f'Group0Id mismatch from {id_b}', timeout=30)

@@ -9,10 +9,9 @@ from test.pylib.util import unique_name, wait_for_cql_and_get_hosts, wait_for
from cassandra import WriteFailure, ConsistencyLevel
from test.pylib.internal_types import ServerInfo
from test.pylib.rest_client import ScyllaMetrics
from test.pylib.tablets import get_all_tablet_replicas
from cassandra.pool import Host # type: ignore # pylint: disable=no-name-in-module
from cassandra.query import SimpleStatement
from test.cluster.util import new_test_keyspace, reconnect_driver
from test.cluster.util import new_test_keyspace, get_topology_version
from test.pylib.scylla_cluster import ScyllaVersionDescription
import pytest
import logging

@@ -44,13 +43,6 @@ async def set_fence_version(manager: ManagerClient, host: Host, new_version: int
host=host)


async def get_version(manager: ManagerClient, host: Host):
rows = await manager.cql.run_async(
"select version from system.topology where key = 'topology'",
host=host)
return rows[0].version


def send_errors_metric(metrics: ScyllaMetrics):
return metrics.get('scylla_hints_manager_send_errors')
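The hunks above and below replace the per-file get_version() helper with a shared get_topology_version() imported from test.cluster.util. A minimal sketch of what that shared helper plausibly looks like, assuming it issues the same query the deleted helper did (the actual implementation in test/cluster/util.py may differ):

async def get_topology_version(cql, host):
    # Read the current topology version from the coordinator's state table,
    # the same query the removed get_version() helper issued.
    rows = await cql.run_async(
        "select version from system.topology where key = 'topology'",
        host=host)
    return rows[0].version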

@@ -107,7 +99,7 @@ async def test_fence_writes(request, manager: ManagerClient, tablets_enabled: bo
# causing topology_state_load on it to see the decremented version and report broken invariants.
await manager.api.cleanup_all(servers[2].ip_addr)

version = await get_version(manager, host2)
version = await get_topology_version(cql, host2)
logger.info(f"version on host2 {version}")

await set_version(manager, host2, version - 1)

@@ -165,7 +157,7 @@ async def test_fence_hints(request, manager: ManagerClient):
hosts = await wait_for_cql_and_get_hosts(cql, [s0, s2], time.time() + 60)

host2 = host_by_server(hosts, s2)
new_version = (await get_version(manager, host2)) + 1
new_version = (await get_topology_version(cql, host2)) + 1
logger.info(f"Set version and fence_version to {new_version} on node {host2}")
await set_version(manager, host2, new_version)
await set_fence_version(manager, host2, new_version)

@@ -362,113 +354,6 @@ async def test_fence_lwt_during_bootstap(manager: ManagerClient):
assert row.c == 2

@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
async def test_fenced_out_on_tablet_migration_while_handling_paxos_verb(manager: ManagerClient):
"""
This test verifies that the fencing token is checked on replicas
after the local Paxos state is updated. This ensures that if we failed
to drain an LWT request during topology changes, the replicas
where paxos verbs got stuck won't contribute to the target CLs.

Scenario:
1. Set up a three-node cluster:
- n1 (rack1) is the topology coordinator.
- The table has a single tablet with RF=2, and replicas on n2 (rack2) and n3 (rack1).
- n3 will act as an LWT coordinator that fails `barrier_and_drain`.
- A test tablet migration will proceed, incrementing both `version` and `fence_version`
on all nodes, including n2. This will cause accept on n2 to be fenced out when
it eventually gets unstuck.
2. Inject `paxos_accept_proposal_wait` on n2 — we need to suspend the accept on n2.
3. Run an LWT on n3 and wait until it hits the injection on n2.
4. Inject `raft_topology_barrier_and_drain_fail_before` on n3 to simulate
an intermittent network failure. This causes `barrier_and_drain` to fail,
but `global_token_metadata_barrier` still succeeds because
`raft_topology_cmd::command::barrier` delivers the new `fence_version`
to all replicas, including n2.
Note: `global_token_metadata_barrier` is called multiple times during tablet migration.
Since `stale_versions_in_use` on replicas waits for *all* previous versions of
`token_metadata` to be dropped, we must use `enable_injection(one_shot=False)` so that
all `barrier_and_drain` calls on n3 fail.
5. Migrate the tablet replica from n3 to n1. The migration must succeed
even with an unfinished LWT holding an old `erm` version, because the
LWT coordinator on n3 was fenced out.
6. Release the `paxos_accept_proposal_wait` injection. The LWT must fail
with a "stale topology exception" because the topology version from the
request is older than the current `fence_version`.
"""
cmdline = [
'--logger-log-level', 'paxos=trace',
'--smp', '1'
]

logger.info("Bootstrapping the cluster")
servers = await manager.servers_add(3,
cmdline=cmdline,
property_file=[
{'dc': 'my_dc', 'rack': 'rack1'},
{'dc': 'my_dc', 'rack': 'rack2'},
{'dc': 'my_dc', 'rack': 'rack1'}
])

(cql, hosts) = await manager.get_ready_cql(servers)
host_ids = await asyncio.gather(*[manager.get_host_id(s.server_id) for s in servers])

logger.info("Disable tablet balancing")
await manager.disable_tablet_balancing()

logger.info("Create a test keyspace")
async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 2} AND tablets = {'initial': 1}") as ks:
logger.info("Create test table")
await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);")

logger.info("Ensure that the tablet replicas are located on n2,n3")
[tablet] = await get_all_tablet_replicas(manager, servers[0], ks, 'test')
[r1, r2] = tablet.replicas
if host_ids[0] in {r1[0], r2[0]}:
# the only possibility is r1=n1 && r2=n2, because otherwise two
# replicas would be on the same rack
await manager.api.move_tablet(servers[0].ip_addr, ks, "test",
host_ids[0], 0,
host_ids[2], 0,
tablet.last_token)

logger.info(f"Injecting 'paxos_accept_proposal_wait' into {servers[1]}")
await manager.api.enable_injection(servers[1].ip_addr, 'paxos_accept_proposal_wait', one_shot=True)

logger.info(f"Start an LWT on {servers[2]}")
insert_lwt = cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES (1, 1) IF NOT EXISTS", host=hosts[2])

logger.info(f"Open log on {servers[1]}")
s2_log = await manager.server_open_log(servers[1].server_id)
logger.info("Wait for 'paxos_accept_proposal_wait: waiting for message'")
await s2_log.wait_for('paxos_accept_proposal_wait: waiting for message')

logger.info(f"Injecting 'raft_topology_barrier_and_drain_fail_before' into {servers[2]}")
await manager.api.enable_injection(servers[2].ip_addr,
'raft_topology_barrier_and_drain_fail_before',
one_shot=False)

logger.info(f"Migrate the tablet replica from {servers[2]} to {servers[1]}")
|
||||
await manager.api.move_tablet(servers[0].ip_addr, ks, "test", host_ids[2], 0,
|
||||
host_ids[0], 0, tablet.last_token)
|
||||
|
||||
async def fenced_out_requests():
|
||||
metrics = await manager.metrics.query(servers[1].ip_addr)
|
||||
metric_name = 'scylla_storage_proxy_replica_fenced_out_requests'
|
||||
return metrics.get(metric_name) or 0
|
||||
|
||||
assert await fenced_out_requests() == 0
|
||||
|
||||
logger.info(f"Release 'paxos_accept_proposal_wait' on {servers[1]}")
|
||||
await manager.api.message_injection(servers[1].ip_addr, "paxos_accept_proposal_wait")
|
||||
|
||||
with pytest.raises(WriteFailure, match="stale topology exception"):
|
||||
await insert_lwt
|
||||
|
||||
assert await fenced_out_requests() == 1
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.skip_mode(mode='release', reason='dev mode is enough for this test')
|
||||
@pytest.mark.skip_mode(mode='debug', reason='dev mode is enough for this test')
|
||||
|
||||

99 test/cluster/test_guardrails.py Normal file
@@ -0,0 +1,99 @@
#
# Copyright (C) 2026-present ScyllaDB
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
#

import logging

import pytest
from cassandra.protocol import ConfigurationException, InvalidRequest

from test.pylib.async_cql import _wrap_future
from test.pylib.manager_client import ManagerClient
from test.pylib.util import unique_name

logger = logging.getLogger(__name__)


async def create_ks_and_assert_warning(cql, query, ks_name, key_warn_msg_words):
# We have to use `Session::execute_async` here to be able to obtain `warnings`.
ret = cql.execute_async(query)
await _wrap_future(ret)
found = False
if len(key_warn_msg_words) > 0:
assert len(ret.warnings) >= 1, "Expected RF guardrail warning"
for warning in ret.warnings:
found = found or all(word in warning.lower() for word in key_warn_msg_words)
assert found, "Didn't match all required keywords"
await cql.run_async(f"USE {ks_name}")


async def assert_creating_ks_fails(cql, query, ks_name):
with pytest.raises(ConfigurationException):
await cql.run_async(query)
with pytest.raises(InvalidRequest):
await cql.run_async(f"USE {ks_name}")


@pytest.mark.asyncio
async def test_default_rf(manager: ManagerClient):
"""
|
||||
As of now, the only RF guardrail enabled is a soft limit checking that RF >= 3. Not complying to this soft limit
|
||||
results in a CQL query being executed, but with a warning. Also, whatever the guardrails' values, RF = 0 is always OK.
|
||||
"""
|
||||
|
||||
# FIXME: This test verifies that guardrails work. However, if we set `rf_rack_valid_keyspaces` to true,
# we'll get a different error, so let's disable it for now. For more context, see issues:
# scylladb/scylladb#23071 and scylladb/scylla-dtest#5633.
cfg = {"rf_rack_valid_keyspaces": False}

await manager.server_add(config=cfg, property_file={"dc": "dc1", "rack": "r1"})
await manager.server_add(config=cfg, property_file={"dc": "dc2", "rack": "r1"})
await manager.server_add(config=cfg, property_file={"dc": "dc3", "rack": "r1"})

cql = manager.get_cql()
ks_name = unique_name()
rf = {"dc1": 2, "dc2": 3, "dc3": 0}
options = ", ".join([f"'{dc}':{rf_val}" for dc, rf_val in rf.items()])
query = f"CREATE KEYSPACE {ks_name} WITH REPLICATION={{'class':'NetworkTopologyStrategy', {options}}}"
await create_ks_and_assert_warning(cql, query, ks_name, ["warn", "min", "replication", "factor", "3", "dc1", "2"])

@pytest.mark.asyncio
async def test_all_rf_limits(manager: ManagerClient):
"""
There are 4 limits for RF: soft/hard min and soft/hard max limits. Breaking soft limits issues a warning,
while breaking the hard limits prevents the query from being executed.
"""
MIN_FAIL_THRESHOLD = 2
MIN_WARN_THRESHOLD = 3
MAX_WARN_THRESHOLD = 4
MAX_FAIL_THRESHOLD = 5

# FIXME: This test verifies that guardrails work. However, if we set `rf_rack_valid_keyspaces` to true,
# we'll get a different error, so let's disable it for now. For more context, see issues:
# scylladb/scylladb#23071 and scylladb/scylla-dtest#5633.
cfg = {
"rf_rack_valid_keyspaces": False,
"minimum_replication_factor_fail_threshold": MIN_FAIL_THRESHOLD,
"minimum_replication_factor_warn_threshold": MIN_WARN_THRESHOLD,
"maximum_replication_factor_warn_threshold": MAX_WARN_THRESHOLD,
"maximum_replication_factor_fail_threshold": MAX_FAIL_THRESHOLD,
}

dc = "dc1"
await manager.server_add(config=cfg, property_file={"dc": dc, "rack": "r1"})
cql = manager.get_cql()

for rf in range(MIN_FAIL_THRESHOLD - 1, MAX_FAIL_THRESHOLD + 1):
ks_name = unique_name()
query = f"CREATE KEYSPACE {ks_name} WITH REPLICATION = {{'class': 'NetworkTopologyStrategy', '{dc}': {rf}}}"
if rf < MIN_FAIL_THRESHOLD or rf > MAX_FAIL_THRESHOLD:
await assert_creating_ks_fails(cql, query, ks_name)
elif rf < MIN_WARN_THRESHOLD:
await create_ks_and_assert_warning(cql, query, ks_name, ["warn", "min", "replication", "factor", str(MIN_WARN_THRESHOLD), dc, str(rf)])
elif rf > MAX_WARN_THRESHOLD:
await create_ks_and_assert_warning(cql, query, ks_name, ["warn", "max", "replication", "factor", str(MAX_WARN_THRESHOLD), dc, str(rf)])
else:
await create_ks_and_assert_warning(cql, query, ks_name, [])
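As a reading aid for the four thresholds exercised above, here is a small illustrative helper (not part of the patch; the default values mirror the constants configured in the test) that classifies a replication factor the same way the loop does:

def classify_rf(rf, min_fail=2, min_warn=3, max_warn=4, max_fail=5):
    # Hard limits: the CREATE KEYSPACE statement is rejected outright.
    if rf < min_fail or rf > max_fail:
        return "fail"
    # Soft limits: the statement succeeds but a warning is attached.
    if rf < min_warn or rf > max_warn:
        return "warn"
    # Within [min_warn, max_warn]: no warning is expected.
    return "ok"

# For the thresholds above: classify_rf(1) == "fail", classify_rf(2) == "warn",
# classify_rf(3) == "ok", classify_rf(4) == "ok", classify_rf(6) == "fail".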

@@ -90,59 +90,6 @@ def get_keys_from_sst(sst_file, scylla_path):
logging.error(f"An unexpected error occurred: {e}")
return []

def local_process_id(cql):
ip = socket.gethostbyname(cql.cluster.contact_points[0])
port = cql.cluster.port
ip2hex = lambda ip: ''.join([f'{int(x):02X}' for x in reversed(ip.split('.'))])
port2hex = lambda port: f'{int(port):04X}'
addr1 = ip2hex(ip) + ':' + port2hex(port)
addr2 = ip2hex('0.0.0.0') + ':' + port2hex(port)
LISTEN = '0A'
with open('/proc/net/tcp', 'r') as f:
for line in f:
cols = line.split()
if cols[3] == LISTEN and (cols[1] == addr1 or cols[1] == addr2):
inode = cols[9]
break
else:
# Didn't find a process listening on the given address
return None
target = f'socket:[{inode}]'
for proc in os.listdir('/proc'):
if not proc.isnumeric():
continue
dir = f'/proc/{proc}/fd/'
try:
for fd in os.listdir(dir):
if os.readlink(dir + fd) == target:
# Found the process!
return proc
except:
# Ignore errors. We can't check processes we don't own.
pass
return None

def get_scylla_path(cql):
max_retries = 5
for attempt in range(1, max_retries + 1):
pid = local_process_id(cql)
if not pid:
logger.warning(f"Attempt {attempt}/{max_retries}: Could not get local process ID for CQL. Retrying...")
time.sleep(1)
continue

path = None
try:
path = os.readlink(f'/proc/{pid}/exe')
subprocess.check_output([path, '--list-tools'], stderr=subprocess.PIPE)
return path
except:
logger.warning(f"Attempt {attempt}/{max_retries}: Failed to determine or verify Scylla path. Retrying...")
time.sleep(1)
continue

assert False, f"Failed to find and verify Scylla executable path after {max_retries} attempts."

def get_metrics(server, metric_name):
num = 0
metrics = requests.get(f"http://{server.ip_addr}:9180/metrics").text

@@ -425,7 +372,7 @@ async def test_tablet_incremental_repair_and_minor(manager: ManagerClient):
await tm.drain_module_tasks(server.ip_addr, module_name)

# Verify repaired and unrepaired keys
scylla_path = get_scylla_path(cql)
scylla_path = await manager.server_get_exe(servers[0].server_id)

for server in servers:
await manager.server_stop_gracefully(server.server_id)

@@ -465,7 +412,7 @@ async def do_test_tablet_incremental_repair_with_split_and_merge(manager, do_spl
if do_merge:
await trigger_tablet_merge(manager, servers, logs)

scylla_path = get_scylla_path(cql)
scylla_path = await manager.server_get_exe(servers[0].server_id)

await asyncio.sleep(random.randint(1, 5))

@@ -508,7 +455,7 @@ async def test_tablet_incremental_repair_existing_and_repair_produced_sstable(ma

await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token, incremental_mode='incremental')

scylla_path = get_scylla_path(cql)
scylla_path = await manager.server_get_exe(servers[0].server_id)

for server in servers:
await manager.server_stop_gracefully(server.server_id)

@@ -536,7 +483,7 @@ async def test_tablet_incremental_repair_merge_higher_repaired_at_number(manager
await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token, incremental_mode='incremental') # sstables_repaired_at 2
await inject_error_off(manager, "repair_tablet_no_update_sstables_repair_at", servers)

scylla_path = get_scylla_path(cql)
scylla_path = await manager.server_get_exe(servers[0].server_id)

s1_mark = await logs[0].mark()
await trigger_tablet_merge(manager, servers, logs)

@@ -577,7 +524,7 @@ async def test_tablet_incremental_repair_merge_correct_repaired_at_number_after_
logging.info(f"Start repair for token={t}");
await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", t, incremental_mode='incremental') # sstables_repaired_at 3

scylla_path = get_scylla_path(cql)
scylla_path = await manager.server_get_exe(servers[0].server_id)

# Trigger merge
await trigger_tablet_merge(manager, servers, logs)

@@ -606,7 +553,7 @@ async def do_test_tablet_incremental_repair_merge_error(manager, error):
for server in servers:
await manager.api.flush_keyspace(server.ip_addr, ks)

scylla_path = get_scylla_path(cql)
scylla_path = await manager.server_get_exe(server.server_id)

# Trigger merge and error in merge
s1_mark = await logs[0].mark()

@@ -17,9 +17,9 @@ logger = logging.getLogger(__name__)
async def test_left_node_notification(manager: ManagerClient) -> None:
"""
Create a 3-node multi-DC cluster with 2 nodes in dc1 and 1 node in dc2.
Then decommission both dc1 nodes, ensuring the topology remains consistent
and the remaining node belongs to dc2 and there is only two 'left the cluster'
notifications were issued
Then decommission both dc1 nodes, ensuring the topology remains consistent,
and the remaining node belongs to dc2, and only two 'left the cluster'
notifications were issued.
"""
# Bootstrap 2 nodes in dc1
logger.info("Bootstrapping dc1 nodes")

@@ -31,6 +31,16 @@ async def test_left_node_notification(manager: ManagerClient) -> None:
dc2_node = await manager.server_add(cmdline=["--logger-log-level", "storage_service=debug"],
property_file={"dc": "dc2", "rack": "r1"})

# When table audit is enabled, Scylla creates the "audit" keyspace with
# NetworkTopologyStrategy and RF=3 in dc1 only. To avoid decommission failures due to
# "zero replica after the removal" or "can not find new node in local dc" errors when
# removing dc1 nodes, we alter the audit keyspace to have replicas only in dc2.
# Only alter if the audit keyspace exists (it might not exist if audit is disabled).
cql = manager.get_cql()
result = await cql.run_async("SELECT * FROM system_schema.keyspaces WHERE keyspace_name = 'audit'")
if result:
await cql.run_async("ALTER KEYSPACE audit WITH REPLICATION = {'class': 'NetworkTopologyStrategy', 'dc2': 1}")

# Ensure ring and group0 are consistent before operations
await check_token_ring_and_group0_consistency(manager)

@@ -72,6 +72,15 @@ async def test_raft_recovery_user_data(manager: ManagerClient, remove_dead_nodes
hosts = await wait_for_cql_and_get_hosts(cql, live_servers, time.time() + 60)
dead_hosts = await wait_for_cql_and_get_hosts(cql, dead_servers, time.time() + 60)

# When table audit is enabled, Scylla creates the "audit" keyspace with
# NetworkTopologyStrategy. During remove_node, streaming fails for the audit keyspace
# with "zero replica after the removal" when all nodes from dc2 are removed.
# By setting RF=3 only in dc1, we ensure the audit data stays on the surviving nodes.
# Only alter if the audit keyspace exists (it might not exist if audit is disabled).
result = await cql.run_async("SELECT * FROM system_schema.keyspaces WHERE keyspace_name = 'audit'")
if result:
await cql.run_async("ALTER KEYSPACE audit WITH REPLICATION = {'class': 'NetworkTopologyStrategy', 'dc1': 3}")

first_group0_id = (await cql.run_async(
"SELECT value FROM system.scylla_local WHERE key = 'raft_group0_id'"))[0].value

@@ -19,7 +19,7 @@ from collections import defaultdict
from test.pylib.minio_server import MinioServer
from test.pylib.manager_client import ManagerClient
from test.cluster.object_store.conftest import format_tuples
from test.cluster.object_store.test_backup import topo, create_cluster, take_snapshot, create_dataset, check_data_is_back, do_load_sstables, mark_all_logs, check_mutation_replicas
from test.cluster.object_store.test_backup import topo, create_cluster, take_snapshot, create_dataset, do_load_sstables, mark_all_logs, check_mutation_replicas, check_streaming_directions
from test.cluster.util import wait_for_cql_and_get_hosts
from test.pylib.rest_client import read_barrier
from test.pylib.util import unique_name

@@ -126,7 +126,8 @@ async def test_refresh_with_streaming_scopes(build_mode: str, manager: ManagerCl

await do_load_sstables(ks, cf, servers, topology, sstables, scope, manager, logger, primary_replica_only=pro, load_fn=do_refresh)

await check_data_is_back(manager, logger, cql, ks, cf, keys, servers, topology, host_ids, scope, primary_replica_only=pro, log_marks=log_marks)
await check_mutation_replicas(cql, manager, servers, keys, topology, logger, ks, cf, scope, primary_replica_only=pro)
await check_streaming_directions(logger, servers, topology, host_ids, scope, pro, log_marks)

shutil.rmtree(tmpbackup)

@@ -54,7 +54,7 @@ async def test_autoretrain_dict(manager: ManagerClient):
uncompressed_size = blob_size * n_blobs * rf

# Start with compressor without a dictionary
cfg = { "sstable_compression_user_table_options": "ZstdCompressor" }
cfg = { "sstable_compression_user_table_options": { 'sstable_compression': 'ZstdCompressor' } }

logger.info("Bootstrapping cluster")
servers = await manager.servers_add(2, cmdline=[

@@ -56,5 +56,8 @@ async def test_table_desc_read_barrier(manager: ManagerClient) -> None:
await read_barrier(manager.api, servers[0].ip_addr)

# verify that there is no schema difference after the read barrier
desc_schema = [await cql.run_async("DESC SCHEMA", host=h) for h in hosts]
# Sort results by (keyspace_name, type, name) to ensure consistent ordering
# since DESC SCHEMA may return results in different order on different nodes
desc_schema = [sorted(await cql.run_async("DESC SCHEMA", host=h),
key=lambda r: (r.keyspace_name, r.type, r.name)) for h in hosts]
assert desc_schema[0] == desc_schema[1]

@@ -12,7 +12,7 @@ from test.pylib.manager_client import ManagerClient
from test.pylib.rest_client import inject_error_one_shot, HTTPError, read_barrier
from test.pylib.util import wait_for_cql_and_get_hosts, unique_name, wait_for
from test.pylib.tablets import get_tablet_replica, get_all_tablet_replicas, get_tablet_count, TabletReplicas
from test.cluster.util import reconnect_driver, create_new_test_keyspace, new_test_keyspace
from test.cluster.util import reconnect_driver, create_new_test_keyspace, new_test_keyspace, get_topology_version
from test.cqlpy.cassandra_tests.validation.entities.secondary_index_test import dotestCreateAndDropIndex

import pytest

@@ -26,6 +26,7 @@ from collections import defaultdict
from collections.abc import Iterable
from contextlib import asynccontextmanager
import itertools
import re

logger = logging.getLogger(__name__)

@@ -1977,9 +1978,19 @@ async def test_crash_on_missing_table_from_load_stats(manager: ManagerClient):
s0_mark = await s0_log.mark()
await s0_log.wait_for('raft topology: Refreshed table load stats for all DC', from_mark=s0_mark)


@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
async def test_timed_out_reader_after_cleanup(manager: ManagerClient):
async def test_tablets_barrier_waits_for_replica_erms(manager: ManagerClient):
"""
The test verifies that tablet replicas hold ERMS while processing requests,
and that the tablet's global barrier waits for all replicas to acknowledge it.
To do this, the test starts a read request and makes it hang on the
`replica_query_wait` injection, then initiates tablet migration. Finally, it
checks that the tablet's global barrier waits for the replica handling that
request to complete.
"""

logger.info("Bootstrapping cluster")
cmdline = [
'--logger-log-level', 'storage_service=debug',

@@ -2008,7 +2019,6 @@ async def test_timed_out_reader_after_cleanup(manager: ManagerClient):

replica = await get_tablet_replica(manager, servers[0], ks, 'test', tablet_token)

s0_host_id = await manager.get_host_id(servers[0].server_id)
s1_host_id = await manager.get_host_id(servers[1].server_id)
dst_shard = 0

@@ -2020,13 +2030,19 @@ async def test_timed_out_reader_after_cleanup(manager: ManagerClient):
replica_query = cql.run_async(f"SELECT * from {ks}.test where pk={key} BYPASS CACHE", host=hosts[1])
await s0_log.wait_for('replica_query_wait: waiting', from_mark=s0_mark)

await manager.api.enable_injection(servers[0].ip_addr, "tablet_cleanup_completion_wait", one_shot=False)
version_before_move = await get_topology_version(cql, hosts[0])

s0_mark = await s0_log.mark()
migration_task = asyncio.create_task(
manager.api.move_tablet(servers[0].ip_addr, ks, "test", replica[0], replica[1], s1_host_id, dst_shard, tablet_token))

# migration should proceed once replica query times out on coordinator, causing it to be abandoned
await s0_log.wait_for('tablet_cleanup_completion_wait: waiting', from_mark=s0_mark)
new_version = version_before_move + 1
await s0_log.wait_for(re.escape(
f"Got raft_topology_cmd::barrier_and_drain, version {new_version}, "
f"current version {new_version}, "
f"stale versions (version: use_count): {{{version_before_move}: 1}}"),
from_mark=s0_mark)

await manager.api.message_injection(servers[0].ip_addr, "replica_query_wait")
await manager.api.disable_injection(servers[0].ip_addr, "replica_query_wait")

@@ -2037,7 +2053,6 @@ async def test_timed_out_reader_after_cleanup(manager: ManagerClient):
except:
pass

await manager.api.message_injection(servers[0].ip_addr, "tablet_cleanup_completion_wait")
logger.info("Waiting for migration to finish")
await migration_task
logger.info("Migration done")

Some files were not shown because too many files have changed in this diff.