Compare commits

..

1 Commits

Author SHA1 Message Date
Kamil Braun
33901e1681 Fix version numbers in upgrade page title 2022-11-02 15:36:51 +01:00
245 changed files with 3531 additions and 8591 deletions

View File

@@ -1,17 +0,0 @@
name: "Docs / Amplify enhanced"
on: issue_comment
jobs:
build:
runs-on: ubuntu-latest
if: ${{ github.event.issue.pull_request }}
steps:
- name: Checkout
uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Amplify enhanced
env:
TOKEN: ${{ secrets.GITHUB_TOKEN }}
uses: scylladb/sphinx-scylladb-theme/.github/actions/amplify-enhanced@master

3
.gitmodules vendored
View File

@@ -6,6 +6,9 @@
path = swagger-ui
url = ../scylla-swagger-ui
ignore = dirty
[submodule "libdeflate"]
path = libdeflate
url = ../libdeflate
[submodule "abseil"]
path = abseil
url = ../abseil-cpp

View File

@@ -34,7 +34,6 @@
#include "expressions.hh"
#include "conditions.hh"
#include "cql3/constants.hh"
#include "cql3/util.hh"
#include <optional>
#include "utils/overloaded_functor.hh"
#include <seastar/json/json_elements.hh>
@@ -928,10 +927,9 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
if (!range_key.empty() && range_key != view_hash_key && range_key != view_range_key) {
add_column(view_builder, range_key, attribute_definitions, column_kind::clustering_key);
}
sstring where_clause = format("{} IS NOT NULL", cql3::util::maybe_quote(view_hash_key));
sstring where_clause = "\"" + view_hash_key + "\" IS NOT NULL";
if (!view_range_key.empty()) {
where_clause = format("{} AND {} IS NOT NULL", where_clause,
cql3::util::maybe_quote(view_range_key));
where_clause = where_clause + " AND \"" + view_hash_key + "\" IS NOT NULL";
}
where_clauses.push_back(std::move(where_clause));
view_builders.emplace_back(std::move(view_builder));
@@ -986,10 +984,9 @@ static future<executor::request_return_type> create_table_on_shard0(tracing::tra
// Note above we don't need to add virtual columns, as all
// base columns were copied to view. TODO: reconsider the need
// for virtual columns when we support Projection.
sstring where_clause = format("{} IS NOT NULL", cql3::util::maybe_quote(view_hash_key));
sstring where_clause = "\"" + view_hash_key + "\" IS NOT NULL";
if (!view_range_key.empty()) {
where_clause = format("{} AND {} IS NOT NULL", where_clause,
cql3::util::maybe_quote(view_range_key));
where_clause = where_clause + " AND \"" + view_range_key + "\" IS NOT NULL";
}
where_clauses.push_back(std::move(where_clause));
view_builders.emplace_back(std::move(view_builder));
@@ -3645,7 +3642,7 @@ static future<executor::request_return_type> do_query(service::storage_proxy& pr
if (exclusive_start_key) {
partition_key pk = pk_from_json(*exclusive_start_key, schema);
auto pos = position_in_partition::for_partition_start();
auto pos = position_in_partition(position_in_partition::partition_start_tag_t());
if (schema->clustering_key_size() > 0) {
pos = pos_from_json(*exclusive_start_key, schema);
}

View File

@@ -279,7 +279,7 @@ position_in_partition pos_from_json(const rjson::value& item, schema_ptr schema)
return position_in_partition(region, weight, region == partition_region::clustered ? std::optional(std::move(ck)) : std::nullopt);
}
if (ck.is_empty()) {
return position_in_partition::for_partition_start();
return position_in_partition(position_in_partition::partition_start_tag_t());
}
return position_in_partition::for_key(std::move(ck));
}

View File

@@ -8,7 +8,6 @@
#include <chrono>
#include <cstdint>
#include <exception>
#include <optional>
#include <seastar/core/sstring.hh>
#include <seastar/core/coroutine.hh>
@@ -18,7 +17,6 @@
#include <seastar/coroutine/maybe_yield.hh>
#include <boost/multiprecision/cpp_int.hpp>
#include "exceptions/exceptions.hh"
#include "gms/gossiper.hh"
#include "gms/inet_address.hh"
#include "inet_address_vectors.hh"
@@ -550,26 +548,13 @@ static future<> scan_table_ranges(
co_return;
}
auto units = co_await get_units(page_sem, 1);
// We don't need to limit page size in number of rows because there is
// a builtin limit of the page's size in bytes. Setting this limit to
// 1 is useful for debugging the paging code with moderate-size data.
// We don't need to limit page size in number of rows because there is a
// builtin limit of the page's size in bytes. Setting this limit to 1
// is useful for debugging the paging code with moderate-size data.
uint32_t limit = std::numeric_limits<uint32_t>::max();
// Read a page, and if that times out, try again after a small sleep.
// If we didn't catch the timeout exception, it would cause the scan
// be aborted and only be restarted at the next scanning period.
std::unique_ptr<cql3::result_set> rs;
for (;;) {
try {
// FIXME: which timeout?
rs = co_await p->fetch_page(limit, gc_clock::now(), executor::default_timeout());
break;
} catch(exceptions::read_timeout_exception&) {
tlogger.warn("expiration scanner read timed out, will retry: {}",
std::current_exception());
}
// If we didn't break out of this loop, add a minimal sleep
co_await seastar::sleep(1s);
}
// FIXME: which timeout?
// FIXME: if read times out, need to retry it.
std::unique_ptr<cql3::result_set> rs = co_await p->fetch_page(limit, gc_clock::now(), executor::default_timeout());
auto rows = rs->rows();
auto meta = rs->get_metadata().get_names();
std::optional<unsigned> expiration_column;

View File

@@ -1,15 +0,0 @@
version: 1
applications:
- frontend:
phases:
build:
commands:
- make setupenv
- make dirhtml
artifacts:
baseDirectory: _build/dirhtml
files:
- '**/*'
cache:
paths: []
appRoot: docs

View File

@@ -49,14 +49,6 @@
extern logging::logger apilog;
namespace std {
// Stream-insertion operator so api::table_info can be used directly as a
// `{}` argument in formatted log messages (see the apilog calls below).
// Output shape: table{name=<name>, id=<id>}.
std::ostream& operator<<(std::ostream& os, const api::table_info& ti) {
return os << "table{name=" << ti.name << ", id=" << ti.id << "}";
}
} // namespace std
namespace api {
const locator::token_metadata& http_context::get_token_metadata() {
@@ -108,55 +100,6 @@ std::vector<sstring> parse_tables(const sstring& ks_name, http_context& ctx, con
return parse_tables(ks_name, ctx, it->second);
}
std::vector<table_info> parse_table_infos(const sstring& ks_name, http_context& ctx, sstring value) {
std::vector<table_info> res;
try {
if (value.empty()) {
const auto& cf_meta_data = ctx.db.local().find_keyspace(ks_name).metadata().get()->cf_meta_data();
res.reserve(cf_meta_data.size());
for (const auto& [name, schema] : cf_meta_data) {
res.emplace_back(table_info{name, schema->id()});
}
} else {
std::vector<sstring> names = split(value, ",");
res.reserve(names.size());
const auto& db = ctx.db.local();
for (const auto& table_name : names) {
res.emplace_back(table_info{table_name, db.find_uuid(ks_name, table_name)});
}
}
} catch (const replica::no_such_keyspace& e) {
throw bad_param_exception(e.what());
} catch (const replica::no_such_column_family& e) {
throw bad_param_exception(e.what());
}
return res;
}
std::vector<table_info> parse_table_infos(const sstring& ks_name, http_context& ctx, const std::unordered_map<sstring, sstring>& query_params, sstring param_name) {
auto it = query_params.find(param_name);
return parse_table_infos(ks_name, ctx, it != query_params.end() ? it->second : "");
}
// Run on all tables, skipping dropped tables
// Sequentially applies `func` to each table listed in `local_tables`.
// A table that disappeared since the list was built (no_such_column_family)
// is logged at warn level and skipped — the operation is best-effort across
// drops. Any other failure is captured and, because of the `if (ex)` check
// at the bottom of the loop body, terminates the loop immediately and
// rethrows through the coroutine's promise.
future<> run_on_existing_tables(sstring op, replica::database& db, std::string_view keyspace, const std::vector<table_info> local_tables, std::function<future<> (replica::table&)> func) {
std::exception_ptr ex;
for (const auto& ti : local_tables) {
apilog.debug("Starting {} on {}.{}", op, keyspace, ti);
try {
// Look the table up by id right before each use; the id stays stable
// across the co_await whereas a name could in principle be reused.
co_await func(db.find_column_family(ti.id));
} catch (const replica::no_such_column_family& e) {
apilog.warn("Skipping {} of {}.{}: {}", op, keyspace, ti, e.what());
} catch (...) {
ex = std::current_exception();
apilog.error("Failed {} of {}.{}: {}", op, keyspace, ti, ex);
}
if (ex) {
// Propagate the first non-skippable error without throwing from here.
co_await coroutine::return_exception_ptr(std::move(ex));
}
}
}
static ss::token_range token_range_endpoints_to_json(const dht::token_range_endpoints& d) {
ss::token_range r;
r.start_token = d._start_token;
@@ -175,13 +118,16 @@ static ss::token_range token_range_endpoints_to_json(const dht::token_range_endp
return r;
}
using ks_cf_func = std::function<future<json::json_return_type>(http_context&, std::unique_ptr<request>, sstring, std::vector<table_info>)>;
using ks_cf_func = std::function<future<json::json_return_type>(http_context&, std::unique_ptr<request>, sstring, std::vector<sstring>)>;
static auto wrap_ks_cf(http_context &ctx, ks_cf_func f) {
return [&ctx, f = std::move(f)](std::unique_ptr<request> req) {
auto keyspace = validate_keyspace(ctx, req->param);
auto table_infos = parse_table_infos(keyspace, ctx, req->query_parameters, "cf");
return f(ctx, std::move(req), std::move(keyspace), std::move(table_infos));
auto column_families = parse_tables(keyspace, ctx, req->query_parameters, "cf");
if (column_families.empty()) {
column_families = map_keys(ctx.db.local().find_keyspace(keyspace).metadata().get()->cf_meta_data());
}
return f(ctx, std::move(req), std::move(keyspace), std::move(column_families));
};
}
@@ -663,112 +609,93 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
});
});
ss::force_keyspace_compaction.set(r, [&ctx](std::unique_ptr<request> req) -> future<json::json_return_type> {
auto& db = ctx.db;
ss::force_keyspace_compaction.set(r, [&ctx](std::unique_ptr<request> req) {
auto keyspace = validate_keyspace(ctx, req->param);
auto table_infos = parse_table_infos(keyspace, ctx, req->query_parameters, "cf");
apilog.debug("force_keyspace_compaction: keyspace={} tables={}", keyspace, table_infos);
try {
co_await db.invoke_on_all([&] (replica::database& db) -> future<> {
auto local_tables = table_infos;
// major compact smaller tables first, to increase chances of success if low on space.
std::ranges::sort(local_tables, std::less<>(), [&] (const table_info& ti) {
try {
return db.find_column_family(ti.id).get_stats().live_disk_space_used;
} catch (const replica::no_such_column_family& e) {
return int64_t(-1);
}
});
co_await run_on_existing_tables("force_keyspace_compaction", db, keyspace, local_tables, [] (replica::table& t) {
return t.compact_all_sstables();
});
});
} catch (...) {
apilog.error("force_keyspace_compaction: keyspace={} tables={} failed: {}", keyspace, table_infos, std::current_exception());
throw;
auto column_families = parse_tables(keyspace, ctx, req->query_parameters, "cf");
if (column_families.empty()) {
column_families = map_keys(ctx.db.local().find_keyspace(keyspace).metadata().get()->cf_meta_data());
}
co_return json_void();
return ctx.db.invoke_on_all([keyspace, column_families] (replica::database& db) -> future<> {
auto table_ids = boost::copy_range<std::vector<table_id>>(column_families | boost::adaptors::transformed([&] (auto& cf_name) {
return db.find_uuid(keyspace, cf_name);
}));
// major compact smaller tables first, to increase chances of success if low on space.
std::ranges::sort(table_ids, std::less<>(), [&] (const table_id& id) {
return db.find_column_family(id).get_stats().live_disk_space_used;
});
// as a table can be dropped during loop below, let's find it before issuing major compaction request.
for (auto& id : table_ids) {
co_await db.find_column_family(id).compact_all_sstables();
}
co_return;
}).then([]{
return make_ready_future<json::json_return_type>(json_void());
});
});
ss::force_keyspace_cleanup.set(r, [&ctx, &ss](std::unique_ptr<request> req) -> future<json::json_return_type> {
auto& db = ctx.db;
ss::force_keyspace_cleanup.set(r, [&ctx, &ss](std::unique_ptr<request> req) {
auto keyspace = validate_keyspace(ctx, req->param);
auto table_infos = parse_table_infos(keyspace, ctx, req->query_parameters, "cf");
apilog.info("force_keyspace_cleanup: keyspace={} tables={}", keyspace, table_infos);
if (!co_await ss.local().is_cleanup_allowed(keyspace)) {
auto msg = "Can not perform cleanup operation when topology changes";
apilog.warn("force_keyspace_cleanup: keyspace={} tables={}: {}", keyspace, table_infos, msg);
co_await coroutine::return_exception(std::runtime_error(msg));
auto column_families = parse_tables(keyspace, ctx, req->query_parameters, "cf");
if (column_families.empty()) {
column_families = map_keys(ctx.db.local().find_keyspace(keyspace).metadata().get()->cf_meta_data());
}
try {
co_await db.invoke_on_all([&] (replica::database& db) -> future<> {
auto local_tables = table_infos;
return ss.local().is_cleanup_allowed(keyspace).then([&ctx, keyspace,
column_families = std::move(column_families)] (bool is_cleanup_allowed) mutable {
if (!is_cleanup_allowed) {
return make_exception_future<json::json_return_type>(
std::runtime_error("Can not perform cleanup operation when topology changes"));
}
return ctx.db.invoke_on_all([keyspace, column_families] (replica::database& db) -> future<> {
auto table_ids = boost::copy_range<std::vector<table_id>>(column_families | boost::adaptors::transformed([&] (auto& table_name) {
return db.find_uuid(keyspace, table_name);
}));
// cleanup smaller tables first, to increase chances of success if low on space.
std::ranges::sort(local_tables, std::less<>(), [&] (const table_info& ti) {
try {
return db.find_column_family(ti.id).get_stats().live_disk_space_used;
} catch (const replica::no_such_column_family& e) {
return int64_t(-1);
}
std::ranges::sort(table_ids, std::less<>(), [&] (const table_id& id) {
return db.find_column_family(id).get_stats().live_disk_space_used;
});
auto& cm = db.get_compaction_manager();
auto owned_ranges_ptr = compaction::make_owned_ranges_ptr(db.get_keyspace_local_ranges(keyspace));
co_await run_on_existing_tables("force_keyspace_cleanup", db, keyspace, local_tables, [&] (replica::table& t) {
return t.perform_cleanup_compaction(owned_ranges_ptr);
});
// as a table can be dropped during loop below, let's find it before issuing the cleanup request.
for (auto& id : table_ids) {
replica::table& t = db.find_column_family(id);
co_await cm.perform_cleanup(owned_ranges_ptr, t.as_table_state());
}
co_return;
}).then([]{
return make_ready_future<json::json_return_type>(0);
});
} catch (...) {
apilog.error("force_keyspace_cleanup: keyspace={} tables={} failed: {}", keyspace, table_infos, std::current_exception());
throw;
}
co_return json::json_return_type(0);
});
});
ss::perform_keyspace_offstrategy_compaction.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<request> req, sstring keyspace, std::vector<table_info> table_infos) -> future<json::json_return_type> {
apilog.info("perform_keyspace_offstrategy_compaction: keyspace={} tables={}", keyspace, table_infos);
bool res = false;
try {
res = co_await ctx.db.map_reduce0([&] (replica::database& db) -> future<bool> {
bool needed = false;
co_await run_on_existing_tables("perform_keyspace_offstrategy_compaction", db, keyspace, table_infos, [&needed] (replica::table& t) -> future<> {
needed |= co_await t.perform_offstrategy_compaction();
});
co_return needed;
}, false, std::plus<bool>());
} catch (...) {
apilog.error("perform_keyspace_offstrategy_compaction: keyspace={} tables={} failed: {}", keyspace, table_infos, std::current_exception());
throw;
}
co_return json::json_return_type(res);
ss::perform_keyspace_offstrategy_compaction.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> tables) -> future<json::json_return_type> {
co_return co_await ctx.db.map_reduce0([&keyspace, &tables] (replica::database& db) -> future<bool> {
bool needed = false;
for (const auto& table : tables) {
auto& t = db.find_column_family(keyspace, table);
needed |= co_await t.perform_offstrategy_compaction();
}
co_return needed;
}, false, std::plus<bool>());
}));
ss::upgrade_sstables.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<request> req, sstring keyspace, std::vector<table_info> table_infos) -> future<json::json_return_type> {
auto& db = ctx.db;
ss::upgrade_sstables.set(r, wrap_ks_cf(ctx, [] (http_context& ctx, std::unique_ptr<request> req, sstring keyspace, std::vector<sstring> column_families) {
bool exclude_current_version = req_param<bool>(*req, "exclude_current_version", false);
apilog.info("upgrade_sstables: keyspace={} tables={} exclude_current_version={}", keyspace, table_infos, exclude_current_version);
try {
co_await db.invoke_on_all([&] (replica::database& db) -> future<> {
auto owned_ranges_ptr = compaction::make_owned_ranges_ptr(db.get_keyspace_local_ranges(keyspace));
co_await run_on_existing_tables("upgrade_sstables", db, keyspace, table_infos, [&] (replica::table& t) {
return t.get_compaction_manager().perform_sstable_upgrade(owned_ranges_ptr, t.as_table_state(), exclude_current_version);
});
return ctx.db.invoke_on_all([=] (replica::database& db) {
auto owned_ranges_ptr = compaction::make_owned_ranges_ptr(db.get_keyspace_local_ranges(keyspace));
return do_for_each(column_families, [=, &db](sstring cfname) {
auto& cm = db.get_compaction_manager();
auto& cf = db.find_column_family(keyspace, cfname);
return cm.perform_sstable_upgrade(owned_ranges_ptr, cf.as_table_state(), exclude_current_version);
});
} catch (...) {
apilog.error("upgrade_sstables: keyspace={} tables={} failed: {}", keyspace, table_infos, std::current_exception());
throw;
}
co_return json::json_return_type(0);
}).then([]{
return make_ready_future<json::json_return_type>(0);
});
}));
ss::force_keyspace_flush.set(r, [&ctx](std::unique_ptr<request> req) -> future<json::json_return_type> {
auto keyspace = validate_keyspace(ctx, req->param);
auto column_families = parse_tables(keyspace, ctx, req->query_parameters, "cf");
apilog.info("perform_keyspace_flush: keyspace={} tables={}", keyspace, column_families);
auto& db = ctx.db;
if (column_families.empty()) {
co_await replica::database::flush_keyspace_on_all_shards(db, keyspace);
@@ -780,7 +707,6 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::decommission.set(r, [&ss](std::unique_ptr<request> req) {
apilog.info("decommission");
return ss.local().decommission().then([] {
return make_ready_future<json::json_return_type>(json_void());
});
@@ -796,7 +722,6 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::remove_node.set(r, [&ss](std::unique_ptr<request> req) {
auto host_id = validate_host_id(req->get_query_param("host_id"));
std::vector<sstring> ignore_nodes_strs= split(req->get_query_param("ignore_nodes"), ",");
apilog.info("remove_node: host_id={} ignore_nodes={}", host_id, ignore_nodes_strs);
auto ignore_nodes = std::list<locator::host_id_or_endpoint>();
for (std::string n : ignore_nodes_strs) {
try {
@@ -872,7 +797,6 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
});
ss::drain.set(r, [&ss](std::unique_ptr<request> req) {
apilog.info("drain");
return ss.local().drain().then([] {
return make_ready_future<json::json_return_type>(json_void());
});
@@ -896,14 +820,12 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
});
ss::stop_gossiping.set(r, [&ss](std::unique_ptr<request> req) {
apilog.info("stop_gossiping");
return ss.local().stop_gossiping().then([] {
return make_ready_future<json::json_return_type>(json_void());
});
});
ss::start_gossiping.set(r, [&ss](std::unique_ptr<request> req) {
apilog.info("start_gossiping");
return ss.local().start_gossiping().then([] {
return make_ready_future<json::json_return_type>(json_void());
});
@@ -1006,7 +928,6 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::rebuild.set(r, [&ss](std::unique_ptr<request> req) {
auto source_dc = req->get_query_param("source_dc");
apilog.info("rebuild: source_dc={}", source_dc);
return ss.local().rebuild(std::move(source_dc)).then([] {
return make_ready_future<json::json_return_type>(json_void());
});
@@ -1043,7 +964,6 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
// FIXME: We should truncate schema tables if more than one node in the cluster.
auto& sp = service::get_storage_proxy();
auto& fs = sp.local().features();
apilog.info("reset_local_schema");
return db::schema_tables::recalculate_schema_version(sys_ks, sp, fs).then([] {
return make_ready_future<json::json_return_type>(json_void());
});
@@ -1051,7 +971,6 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ss::set_trace_probability.set(r, [](std::unique_ptr<request> req) {
auto probability = req->get_query_param("probability");
apilog.info("set_trace_probability: probability={}", probability);
return futurize_invoke([probability] {
double real_prob = std::stod(probability.c_str());
return tracing::tracing::tracing_instance().invoke_on_all([real_prob] (auto& local_tracing) {
@@ -1089,7 +1008,6 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
auto ttl = req->get_query_param("ttl");
auto threshold = req->get_query_param("threshold");
auto fast = req->get_query_param("fast");
apilog.info("set_slow_query: enable={} ttl={} threshold={} fast={}", enable, ttl, threshold, fast);
try {
return tracing::tracing::tracing_instance().invoke_on_all([enable, ttl, threshold, fast] (auto& local_tracing) {
if (threshold != "") {
@@ -1116,7 +1034,6 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
auto keyspace = validate_keyspace(ctx, req->param);
auto tables = parse_tables(keyspace, ctx, req->query_parameters, "cf");
apilog.info("enable_auto_compaction: keyspace={} tables={}", keyspace, tables);
return set_tables_autocompaction(ctx, keyspace, tables, true);
});
@@ -1124,7 +1041,6 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
auto keyspace = validate_keyspace(ctx, req->param);
auto tables = parse_tables(keyspace, ctx, req->query_parameters, "cf");
apilog.info("disable_auto_compaction: keyspace={} tables={}", keyspace, tables);
return set_tables_autocompaction(ctx, keyspace, tables, false);
});
@@ -1450,8 +1366,7 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
});
});
ss::scrub.set(r, [&ctx, &snap_ctl] (std::unique_ptr<request> req) -> future<json::json_return_type> {
auto& db = ctx.db;
ss::scrub.set(r, [&ctx, &snap_ctl] (std::unique_ptr<request> req) {
auto rp = req_params({
{"keyspace", {mandatory::yes}},
{"cf", {""}},
@@ -1487,9 +1402,10 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
}
}
auto f = make_ready_future<>();
if (!req_param<bool>(*req, "disable_snapshot", false)) {
auto tag = format("pre-scrub-{:d}", db_clock::now().time_since_epoch().count());
co_await coroutine::parallel_for_each(column_families, [&snap_ctl, keyspace, tag](sstring cf) {
f = parallel_for_each(column_families, [&snap_ctl, keyspace, tag](sstring cf) {
// We always pass here db::snapshot_ctl::snap_views::no since:
// 1. When scrubbing particular tables, there's no need to auto-snapshot their views.
// 2. When scrubbing the whole keyspace, column_families will contain both base tables and views.
@@ -1518,25 +1434,28 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
return stats;
};
try {
auto opt_stats = co_await db.map_reduce0([&] (replica::database& db) {
return map_reduce(column_families, [&] (sstring cfname) {
return f.then([&ctx, keyspace, column_families, opts, &reduce_compaction_stats] {
return ctx.db.map_reduce0([=] (replica::database& db) {
return map_reduce(column_families, [=, &db] (sstring cfname) {
auto& cm = db.get_compaction_manager();
auto& cf = db.find_column_family(keyspace, cfname);
return cm.perform_sstable_scrub(cf.as_table_state(), opts);
}, std::make_optional(sstables::compaction_stats{}), reduce_compaction_stats);
}, std::make_optional(sstables::compaction_stats{}), reduce_compaction_stats);
if (opt_stats && opt_stats->validation_errors) {
co_return json::json_return_type(static_cast<int>(scrub_status::validation_errors));
}).then_wrapped([] (auto f) {
if (f.failed()) {
auto ex = f.get_exception();
if (try_catch<sstables::compaction_aborted_exception>(ex)) {
return make_ready_future<json::json_return_type>(static_cast<int>(scrub_status::aborted));
} else {
return make_exception_future<json::json_return_type>(std::move(ex));
}
} else if (f.get()->validation_errors) {
return make_ready_future<json::json_return_type>(static_cast<int>(scrub_status::validation_errors));
} else {
return make_ready_future<json::json_return_type>(static_cast<int>(scrub_status::successful));
}
} catch (const sstables::compaction_aborted_exception&) {
co_return json::json_return_type(static_cast<int>(scrub_status::aborted));
} catch (...) {
apilog.error("scrub keyspace={} tables={} failed: {}", keyspace, column_families, std::current_exception());
throw;
}
co_return json::json_return_type(static_cast<int>(scrub_status::successful));
});
});
}

View File

@@ -8,8 +8,6 @@
#pragma once
#include <iostream>
#include <seastar/core/sharded.hh>
#include "api.hh"
#include "db/data_listeners.hh"
@@ -43,22 +41,8 @@ sstring validate_keyspace(http_context& ctx, const parameters& param);
// splits a request parameter assumed to hold a comma-separated list of table names
// verify that the tables are found, otherwise a bad_param_exception exception is thrown
// containing the description of the respective no_such_column_family error.
// Returns an empty vector if no parameter was found.
// If the parameter is found and empty, returns a list of all table names in the keyspace.
std::vector<sstring> parse_tables(const sstring& ks_name, http_context& ctx, const std::unordered_map<sstring, sstring>& query_params, sstring param_name);
struct table_info {
sstring name;
table_id id;
};
// splits a request parameter assumed to hold a comma-separated list of table names
// verify that the tables are found, otherwise a bad_param_exception exception is thrown
// containing the description of the respective no_such_column_family error.
// Returns a vector of all table infos given by the parameter, or
// if the parameter is not found or is empty, returns a list of all table infos in the keyspace.
std::vector<table_info> parse_table_infos(const sstring& ks_name, http_context& ctx, const std::unordered_map<sstring, sstring>& query_params, sstring param_name);
void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_service>& ss, gms::gossiper& g, sharded<cdc::generation_service>& cdc_gs, sharded<db::system_keyspace>& sys_ls);
void set_sstables_loader(http_context& ctx, routes& r, sharded<sstables_loader>& sst_loader);
void unset_sstables_loader(http_context& ctx, routes& r);
@@ -74,10 +58,4 @@ void set_snapshot(http_context& ctx, routes& r, sharded<db::snapshot_ctl>& snap_
void unset_snapshot(http_context& ctx, routes& r);
seastar::future<json::json_return_type> run_toppartitions_query(db::toppartitions_query& q, http_context &ctx, bool legacy_request = false);
} // namespace api
namespace std {
std::ostream& operator<<(std::ostream& os, const api::table_info& ti);
} // namespace std
}

View File

@@ -99,7 +99,7 @@ void set_task_manager_test(http_context& ctx, routes& r, db::config& cfg) {
tmt::get_and_update_ttl.set(r, [&ctx, &cfg] (std::unique_ptr<request> req) -> future<json::json_return_type> {
uint32_t ttl = cfg.task_ttl_seconds();
co_await cfg.task_ttl_seconds.set_value_on_all_shards(req->query_parameters["ttl"], utils::config_file::config_source::API);
cfg.task_ttl_seconds.set(boost::lexical_cast<uint32_t>(req->query_parameters["ttl"]));
co_return json::json_return_type(ttl);
});
}

View File

@@ -28,7 +28,6 @@
#include <seastar/util/closeable.hh>
#include <seastar/core/shared_ptr.hh>
#include "dht/i_partitioner.hh"
#include "sstables/sstables.hh"
#include "sstables/sstable_writer.hh"
#include "sstables/progress_monitor.hh"
@@ -42,7 +41,6 @@
#include "mutation_compactor.hh"
#include "leveled_manifest.hh"
#include "dht/token.hh"
#include "dht/partition_filter.hh"
#include "mutation_writer/shard_based_splitting_writer.hh"
#include "mutation_writer/partition_based_splitting_writer.hh"
#include "mutation_source_metadata.hh"
@@ -222,13 +220,13 @@ public:
~compaction_write_monitor() {
if (_sst) {
_table_s.get_backlog_tracker().revert_charges(_sst);
_table_s.get_compaction_strategy().get_backlog_tracker().revert_charges(_sst);
}
}
virtual void on_write_started(const sstables::writer_offset_tracker& tracker) override {
_tracker = &tracker;
_table_s.get_backlog_tracker().register_partially_written_sstable(_sst, *this);
_table_s.get_compaction_strategy().get_backlog_tracker().register_partially_written_sstable(_sst, *this);
}
virtual void on_data_write_completed() override {
@@ -353,7 +351,7 @@ struct compaction_read_monitor_generator final : public read_monitor_generator {
public:
virtual void on_read_started(const sstables::reader_position_tracker& tracker) override {
_tracker = &tracker;
_table_s.get_backlog_tracker().register_compacting_sstable(_sst, *this);
_table_s.get_compaction_strategy().get_backlog_tracker().register_compacting_sstable(_sst, *this);
}
virtual void on_read_completed() override {
@@ -372,7 +370,7 @@ struct compaction_read_monitor_generator final : public read_monitor_generator {
void remove_sstable() {
if (_sst) {
_table_s.get_backlog_tracker().revert_charges(_sst);
_table_s.get_compaction_strategy().get_backlog_tracker().revert_charges(_sst);
}
_sst = {};
}
@@ -384,7 +382,7 @@ struct compaction_read_monitor_generator final : public read_monitor_generator {
// We failed to finish handling this SSTable, so we have to update the backlog_tracker
// about it.
if (_sst) {
_table_s.get_backlog_tracker().revert_charges(_sst);
_table_s.get_compaction_strategy().get_backlog_tracker().revert_charges(_sst);
}
}
@@ -950,7 +948,7 @@ void compacted_fragments_writer::consume_new_partition(const dht::decorated_key&
.dk = dk,
.tombstone = tombstone(),
.current_emitted_tombstone = tombstone(),
.last_pos = position_in_partition::for_partition_start(),
.last_pos = position_in_partition(position_in_partition::partition_start_tag_t()),
.is_splitting_partition = false
};
do_consume_new_partition(dk);
@@ -1175,8 +1173,30 @@ private:
};
class cleanup_compaction final : public regular_compaction {
// Stateful helper that classifies a monotonically increasing stream of
// tokens as owned / not-owned by this node. It keeps a cursor into the
// sorted owned-range vector and only ever advances it, so scanning all
// tokens of a compaction costs O(#tokens + #ranges) rather than a
// per-token binary search.
class incremental_owned_ranges_checker {
const dht::token_range_vector& _sorted_owned_ranges;
mutable dht::token_range_vector::const_iterator _it; // cursor into _sorted_owned_ranges; never moves backwards
public:
incremental_owned_ranges_checker(const dht::token_range_vector& sorted_owned_ranges)
: _sorted_owned_ranges(sorted_owned_ranges)
, _it(_sorted_owned_ranges.begin()) {
}
// Returns whether token `t` falls inside one of the owned ranges.
// Must be called with increasing token values.
bool belongs_to_current_node(const dht::token& t) const {
// While token T is after a range Rn, advance the iterator.
// iterator will be stopped at a range which either overlaps with T (if T belongs to node),
// or at a range which is after T (if T doesn't belong to this node).
while (_it != _sorted_owned_ranges.end() && _it->after(t, dht::token_comparator())) {
_it++;
}
return _it != _sorted_owned_ranges.end() && _it->contains(t, dht::token_comparator());
}
};
owned_ranges_ptr _owned_ranges;
mutable dht::incremental_owned_ranges_checker _owned_ranges_checker;
incremental_owned_ranges_checker _owned_ranges_checker;
private:
// Called in a seastar thread
dht::partition_range_vector
@@ -1189,8 +1209,21 @@ private:
return dht::partition_range::make({sst->get_first_decorated_key(), true},
{sst->get_last_decorated_key(), true});
}));
// optimize set of potentially overlapping ranges by deoverlapping them.
non_owned_ranges = dht::partition_range::deoverlap(std::move(non_owned_ranges), dht::ring_position_comparator(*_schema));
return dht::subtract_ranges(*_schema, non_owned_ranges, std::move(owned_ranges)).get();
// subtract *each* owned range from the partition range of *each* sstable*,
// such that we'll be left only with a set of non-owned ranges.
for (auto& owned_range : owned_ranges) {
dht::partition_range_vector new_non_owned_ranges;
for (auto& non_owned_range : non_owned_ranges) {
auto ret = non_owned_range.subtract(owned_range, dht::ring_position_comparator(*_schema));
new_non_owned_ranges.insert(new_non_owned_ranges.end(), ret.begin(), ret.end());
seastar::thread::maybe_yield();
}
non_owned_ranges = std::move(new_non_owned_ranges);
}
return non_owned_ranges;
}
protected:
virtual compaction_completion_desc

View File

@@ -80,10 +80,8 @@ struct compaction_data {
}
void stop(sstring reason) {
if (!abort.abort_requested()) {
stop_requested = std::move(reason);
abort.request_abort();
}
stop_requested = std::move(reason);
abort.request_abort();
}
};

View File

@@ -66,8 +66,7 @@ public:
};
compaction_backlog_tracker(std::unique_ptr<impl> impl) : _impl(std::move(impl)) {}
compaction_backlog_tracker(compaction_backlog_tracker&&);
compaction_backlog_tracker& operator=(compaction_backlog_tracker&&) noexcept;
compaction_backlog_tracker(compaction_backlog_tracker&&) = default;
compaction_backlog_tracker(const compaction_backlog_tracker&) = delete;
~compaction_backlog_tracker();
@@ -75,7 +74,7 @@ public:
void replace_sstables(const std::vector<sstables::shared_sstable>& old_ssts, const std::vector<sstables::shared_sstable>& new_ssts);
void register_partially_written_sstable(sstables::shared_sstable sst, backlog_write_progress_manager& wp);
void register_compacting_sstable(sstables::shared_sstable sst, backlog_read_progress_manager& rp);
void copy_ongoing_charges(compaction_backlog_tracker& new_bt, bool move_read_charges = true) const;
void transfer_ongoing_charges(compaction_backlog_tracker& new_bt, bool move_read_charges = true);
void revert_charges(sstables::shared_sstable sst);
void disable() {

View File

@@ -1097,12 +1097,7 @@ private:
compaction::table_state& t = *_compacting_table;
const auto& maintenance_sstables = t.maintenance_sstable_set();
// Filter out sstables that require view building, to avoid a race between off-strategy
// and view building. Refs: #11882
const auto old_sstables = boost::copy_range<std::vector<sstables::shared_sstable>>(*maintenance_sstables.all()
| boost::adaptors::filtered([] (const sstables::shared_sstable& sst) {
return !sst->requires_view_building();
}));
const auto old_sstables = boost::copy_range<std::vector<sstables::shared_sstable>>(*maintenance_sstables.all());
std::vector<sstables::shared_sstable> reshape_candidates = old_sstables;
std::vector<sstables::shared_sstable> sstables_to_remove;
std::unordered_set<sstables::shared_sstable> new_unused_sstables;
@@ -1475,8 +1470,10 @@ private:
bool needs_cleanup(const sstables::shared_sstable& sst,
const dht::token_range_vector& sorted_owned_ranges,
schema_ptr s) {
auto first_token = sst->get_first_decorated_key().token();
auto last_token = sst->get_last_decorated_key().token();
auto first = sst->get_first_partition_key();
auto last = sst->get_last_partition_key();
auto first_token = dht::get_token(*s, first);
auto last_token = dht::get_token(*s, last);
dht::token_range sst_token_range = dht::token_range::make(first_token, last_token);
auto r = std::lower_bound(sorted_owned_ranges.begin(), sorted_owned_ranges.end(), first_token,
@@ -1576,13 +1573,8 @@ future<compaction_manager::compaction_stats_opt> compaction_manager::perform_sst
}, can_purge_tombstones::no);
}
// Per-table compaction bookkeeping. The backlog tracker is obtained from the
// table's currently configured compaction strategy, so each table starts out
// with a tracker matching its strategy.
compaction_manager::compaction_state::compaction_state(table_state& t)
    : backlog_tracker(t.get_compaction_strategy().make_backlog_tracker())
{
}
void compaction_manager::add(compaction::table_state& t) {
auto [_, inserted] = _compaction_state.insert({&t, compaction_state(t)});
auto [_, inserted] = _compaction_state.insert({&t, compaction_state{}});
if (!inserted) {
auto s = t.schema();
on_internal_error(cmlog, format("compaction_state for table {}.{} [{}] already exists", s->ks_name(), s->cf_name(), fmt::ptr(&t)));
@@ -1590,21 +1582,22 @@ void compaction_manager::add(compaction::table_state& t) {
}
future<> compaction_manager::remove(compaction::table_state& t) noexcept {
auto& c_state = get_compaction_state(&t);
auto handle = _compaction_state.extract(&t);
// We need to guarantee that a task being stopped will not retry to compact
// a table being removed.
// The requirement above is provided by stop_ongoing_compactions().
_postponed.erase(&t);
if (!handle.empty()) {
auto& c_state = handle.mapped();
// Wait for all compaction tasks running under gate to terminate
// and prevent new tasks from entering the gate.
co_await seastar::when_all_succeed(stop_ongoing_compactions("table removal", &t), c_state.gate.close()).discard_result();
// We need to guarantee that a task being stopped will not retry to compact
// a table being removed.
// The requirement above is provided by stop_ongoing_compactions().
_postponed.erase(&t);
c_state.backlog_tracker.disable();
_compaction_state.erase(&t);
// Wait for the termination of an ongoing compaction on table T, if any.
co_await stop_ongoing_compactions("table removal", &t);
// Wait for all functions running under gate to terminate.
co_await c_state.gate.close();
}
#ifdef DEBUG
auto found = false;
sstring msg;
@@ -1763,7 +1756,7 @@ void compaction_backlog_tracker::register_compacting_sstable(sstables::shared_ss
}
}
void compaction_backlog_tracker::copy_ongoing_charges(compaction_backlog_tracker& new_bt, bool move_read_charges) const {
void compaction_backlog_tracker::transfer_ongoing_charges(compaction_backlog_tracker& new_bt, bool move_read_charges) {
for (auto&& w : _ongoing_writes) {
new_bt.register_partially_written_sstable(w.first, *w.second);
}
@@ -1773,6 +1766,8 @@ void compaction_backlog_tracker::copy_ongoing_charges(compaction_backlog_tracker
new_bt.register_compacting_sstable(w.first, *w.second);
}
}
_ongoing_writes = {};
_ongoing_compactions = {};
}
void compaction_backlog_tracker::revert_charges(sstables::shared_sstable sst) {
@@ -1780,26 +1775,6 @@ void compaction_backlog_tracker::revert_charges(sstables::shared_sstable sst) {
_ongoing_compactions.erase(sst);
}
// Move constructor: steals the strategy-specific impl and the per-sstable
// charge maps, and takes over `other`'s manager pointer. `other._manager` is
// nulled via std::exchange so other's destructor will not deregister.
// NOTE(review): the manager's own registration still records the old
// tracker's address unless updated elsewhere -- confirm against
// compaction_backlog_manager's bookkeeping.
compaction_backlog_tracker::compaction_backlog_tracker(compaction_backlog_tracker&& other)
    : _impl(std::move(other._impl))
    , _ongoing_writes(std::move(other._ongoing_writes))
    , _ongoing_compactions(std::move(other._ongoing_compactions))
    , _manager(std::exchange(other._manager, nullptr)) {
}
// Move assignment: deregisters *this from its current manager (if any), then
// adopts x's manager, impl and per-sstable charge maps. Self-assignment is a
// no-op.
// NOTE(review): x._manager is copied but not cleared here, and the adopted
// manager is not informed that the tracker moved from &x to this -- verify
// this matches compaction_backlog_manager's expectations.
compaction_backlog_tracker&
compaction_backlog_tracker::operator=(compaction_backlog_tracker&& x) noexcept {
    if (this != &x) {
        // Swap in x's manager while detaching from our previous one.
        if (auto manager = std::exchange(_manager, x._manager)) {
            manager->remove_backlog_tracker(this);
        }
        _impl = std::move(x._impl);
        _ongoing_writes = std::move(x._ongoing_writes);
        _ongoing_compactions = std::move(x._ongoing_compactions);
    }
    return *this;
}
compaction_backlog_tracker::~compaction_backlog_tracker() {
if (_manager) {
_manager->remove_backlog_tracker(this);
@@ -1837,14 +1812,3 @@ compaction_backlog_manager::~compaction_backlog_manager() {
tracker->_manager = nullptr;
}
}
// Install a new backlog tracker for table t, replacing whatever tracker the
// table's compaction state held before, and register the new one with the
// backlog manager.
void compaction_manager::register_backlog_tracker(compaction::table_state& t, compaction_backlog_tracker new_backlog_tracker) {
    auto& state = get_compaction_state(&t);
    state.backlog_tracker = std::move(new_backlog_tracker);
    register_backlog_tracker(state.backlog_tracker);
}
// Access the backlog tracker currently associated with table t.
compaction_backlog_tracker& compaction_manager::get_backlog_tracker(compaction::table_state& t) {
    return get_compaction_state(&t).backlog_tracker;
}

View File

@@ -83,9 +83,7 @@ private:
// Signaled whenever a compaction task completes.
condition_variable compaction_done;
compaction_backlog_tracker backlog_tracker;
explicit compaction_state(table_state& t);
compaction_state() = default;
compaction_state(compaction_state&&) = default;
~compaction_state();
@@ -526,9 +524,6 @@ public:
void register_backlog_tracker(compaction_backlog_tracker& backlog_tracker) {
_backlog_manager.register_backlog_tracker(backlog_tracker);
}
void register_backlog_tracker(compaction::table_state& t, compaction_backlog_tracker new_backlog_tracker);
compaction_backlog_tracker& get_backlog_tracker(compaction::table_state& t);
static sstables::compaction_data create_compaction_data();

View File

@@ -427,6 +427,14 @@ struct null_backlog_tracker final : public compaction_backlog_tracker::impl {
virtual void replace_sstables(std::vector<sstables::shared_sstable> old_ssts, std::vector<sstables::shared_sstable> new_ssts) override {}
};
// A single shared tracker for every table using NullStrategy, so that
// multiple such tables don't each create an object the backlog manager
// would have to iterate over for no reason.
// Kept thread_local because make_unique allocates at initialization time.
// This is expected to disappear soon.
static thread_local compaction_backlog_tracker null_backlog_tracker(std::make_unique<null_backlog_tracker>());

// Returns the shared no-op backlog tracker for this shard.
compaction_backlog_tracker& get_null_backlog_tracker() {
    return null_backlog_tracker;
}
//
// Null compaction strategy is the default compaction strategy.
// As the name implies, it does nothing.
@@ -445,8 +453,8 @@ public:
return compaction_strategy_type::null;
}
virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() override {
return std::make_unique<null_backlog_tracker>();
virtual compaction_backlog_tracker& get_backlog_tracker() override {
return get_null_backlog_tracker();
}
};
@@ -454,14 +462,11 @@ leveled_compaction_strategy::leveled_compaction_strategy(const std::map<sstring,
: compaction_strategy_impl(options)
, _max_sstable_size_in_mb(calculate_max_sstable_size_in_mb(compaction_strategy_impl::get_value(options, SSTABLE_SIZE_OPTION)))
, _stcs_options(options)
, _backlog_tracker(std::make_unique<leveled_compaction_backlog_tracker>(_max_sstable_size_in_mb, _stcs_options))
{
_compaction_counter.resize(leveled_manifest::MAX_LEVELS);
}
// Build a fresh backlog tracker tailored to leveled compaction, configured
// with this strategy's target sstable size and its STCS fallback options.
std::unique_ptr<compaction_backlog_tracker::impl> leveled_compaction_strategy::make_backlog_tracker() {
    auto tracker = std::make_unique<leveled_compaction_backlog_tracker>(_max_sstable_size_in_mb, _stcs_options);
    return tracker;
}
int32_t
leveled_compaction_strategy::calculate_max_sstable_size_in_mb(std::optional<sstring> option_value) const {
using namespace cql3::statements;
@@ -481,6 +486,7 @@ time_window_compaction_strategy::time_window_compaction_strategy(const std::map<
: compaction_strategy_impl(options)
, _options(options)
, _stcs_options(options)
, _backlog_tracker(std::make_unique<time_window_backlog_tracker>(_options, _stcs_options))
{
if (!options.contains(TOMBSTONE_COMPACTION_INTERVAL_OPTION) && !options.contains(TOMBSTONE_THRESHOLD_OPTION)) {
_disable_tombstone_compaction = true;
@@ -491,10 +497,6 @@ time_window_compaction_strategy::time_window_compaction_strategy(const std::map<
_use_clustering_key_filter = true;
}
// Build a fresh backlog tracker for time-window compaction, configured with
// this strategy's TWCS options and its STCS per-window options.
std::unique_ptr<compaction_backlog_tracker::impl> time_window_compaction_strategy::make_backlog_tracker() {
    auto tracker = std::make_unique<time_window_backlog_tracker>(_options, _stcs_options);
    return tracker;
}
} // namespace sstables
std::vector<sstables::shared_sstable>
@@ -638,6 +640,7 @@ namespace sstables {
date_tiered_compaction_strategy::date_tiered_compaction_strategy(const std::map<sstring, sstring>& options)
: compaction_strategy_impl(options)
, _manifest(options)
, _backlog_tracker(std::make_unique<unimplemented_backlog_tracker>())
{
clogger.warn("DateTieredCompactionStrategy is deprecated. Usually cases for which it is used are better handled by TimeWindowCompactionStrategy."
" Please change your compaction strategy to TWCS as DTCS will be retired in the near future");
@@ -682,23 +685,17 @@ compaction_descriptor date_tiered_compaction_strategy::get_sstables_for_compacti
return sstables::compaction_descriptor({ *it }, service::get_local_compaction_priority());
}
// DTCS has no backlog estimation of its own; hand back the
// unimplemented stub tracker.
std::unique_ptr<compaction_backlog_tracker::impl> date_tiered_compaction_strategy::make_backlog_tracker() {
    return std::make_unique<unimplemented_backlog_tracker>();
}
// Construct STCS from a raw option map (as parsed from the schema); the
// options are interpreted by size_tiered_compaction_strategy_options.
size_tiered_compaction_strategy::size_tiered_compaction_strategy(const std::map<sstring, sstring>& options)
    : compaction_strategy_impl(options)
    , _options(options)
    , _backlog_tracker(std::make_unique<size_tiered_backlog_tracker>(_options))
{}

// Construct STCS from already-parsed options. Note: unlike the map overload,
// this one does not forward to the compaction_strategy_impl(options) base
// constructor.
size_tiered_compaction_strategy::size_tiered_compaction_strategy(const size_tiered_compaction_strategy_options& options)
    : _options(options)
    , _backlog_tracker(std::make_unique<size_tiered_backlog_tracker>(_options))
{}
// Build a fresh backlog tracker for size-tiered compaction, configured with
// this strategy's STCS options.
std::unique_ptr<compaction_backlog_tracker::impl> size_tiered_compaction_strategy::make_backlog_tracker() {
    auto tracker = std::make_unique<size_tiered_backlog_tracker>(_options);
    return tracker;
}
compaction_strategy::compaction_strategy(::shared_ptr<compaction_strategy_impl> impl)
: _compaction_strategy_impl(std::move(impl)) {}
compaction_strategy::compaction_strategy() = default;
@@ -739,8 +736,8 @@ bool compaction_strategy::use_clustering_key_filter() const {
return _compaction_strategy_impl->use_clustering_key_filter();
}
compaction_backlog_tracker compaction_strategy::make_backlog_tracker() {
return compaction_backlog_tracker(_compaction_strategy_impl->make_backlog_tracker());
compaction_backlog_tracker& compaction_strategy::get_backlog_tracker() {
return _compaction_strategy_impl->get_backlog_tracker();
}
sstables::compaction_descriptor

View File

@@ -106,7 +106,7 @@ public:
sstable_set make_sstable_set(schema_ptr schema) const;
compaction_backlog_tracker make_backlog_tracker();
compaction_backlog_tracker& get_backlog_tracker();
uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate);

View File

@@ -22,6 +22,8 @@ class strategy_control;
namespace sstables {
compaction_backlog_tracker& get_unimplemented_backlog_tracker();
class sstable_set_impl;
class resharding_descriptor;
@@ -68,7 +70,7 @@ public:
// droppable tombstone histogram and gc_before.
bool worth_dropping_tombstones(const shared_sstable& sst, gc_clock::time_point compaction_time, const tombstone_gc_state& gc_state);
virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() = 0;
virtual compaction_backlog_tracker& get_backlog_tracker() = 0;
virtual uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate);

View File

@@ -259,6 +259,7 @@ namespace sstables {
class date_tiered_compaction_strategy : public compaction_strategy_impl {
date_tiered_manifest _manifest;
compaction_backlog_tracker _backlog_tracker;
public:
date_tiered_compaction_strategy(const std::map<sstring, sstring>& options);
virtual compaction_descriptor get_sstables_for_compaction(table_state& table_s, strategy_control& control, std::vector<sstables::shared_sstable> candidates) override;
@@ -271,7 +272,9 @@ public:
return compaction_strategy_type::date_tiered;
}
virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() override;
virtual compaction_backlog_tracker& get_backlog_tracker() override {
return _backlog_tracker;
}
};
}

View File

@@ -35,6 +35,7 @@ class leveled_compaction_strategy : public compaction_strategy_impl {
std::optional<std::vector<std::optional<dht::decorated_key>>> _last_compacted_keys;
std::vector<int> _compaction_counter;
size_tiered_compaction_strategy_options _stcs_options;
compaction_backlog_tracker _backlog_tracker;
int32_t calculate_max_sstable_size_in_mb(std::optional<sstring> option_value) const;
public:
static unsigned ideal_level_for_input(const std::vector<sstables::shared_sstable>& input, uint64_t max_sstable_size);
@@ -63,7 +64,9 @@ public:
}
virtual std::unique_ptr<sstable_set_impl> make_sstable_set(schema_ptr schema) const override;
virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() override;
virtual compaction_backlog_tracker& get_backlog_tracker() override {
return _backlog_tracker;
}
virtual compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, const ::io_priority_class& iop, reshape_mode mode) override;
};

View File

@@ -82,6 +82,7 @@ public:
class size_tiered_compaction_strategy : public compaction_strategy_impl {
size_tiered_compaction_strategy_options _options;
compaction_backlog_tracker _backlog_tracker;
// Return a list of pair of shared_sstable and its respective size.
static std::vector<std::pair<sstables::shared_sstable, uint64_t>> create_sstable_and_length_pairs(const std::vector<sstables::shared_sstable>& sstables);
@@ -127,7 +128,9 @@ public:
most_interesting_bucket(const std::vector<sstables::shared_sstable>& candidates, int min_threshold, int max_threshold,
size_tiered_compaction_strategy_options options = {});
virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() override;
virtual compaction_backlog_tracker& get_backlog_tracker() override {
return _backlog_tracker;
}
virtual compaction_descriptor get_reshaping_job(std::vector<shared_sstable> input, schema_ptr schema, const ::io_priority_class& iop, reshape_mode mode) override;

View File

@@ -15,7 +15,6 @@
#include "compaction_descriptor.hh"
class reader_permit;
class compaction_backlog_tracker;
namespace sstables {
class compaction_strategy;
@@ -44,7 +43,6 @@ public:
virtual future<> on_compaction_completion(sstables::compaction_completion_desc desc, sstables::offstrategy offstrategy) = 0;
virtual bool is_auto_compaction_disabled_by_user() const noexcept = 0;
virtual const tombstone_gc_state& get_tombstone_gc_state() const noexcept = 0;
virtual compaction_backlog_tracker& get_backlog_tracker() = 0;
};
}

View File

@@ -73,6 +73,7 @@ class time_window_compaction_strategy : public compaction_strategy_impl {
// Keep track of all recent active windows that still need to be compacted into a single SSTable
std::unordered_set<timestamp_type> _recent_active_windows;
size_tiered_compaction_strategy_options _stcs_options;
compaction_backlog_tracker _backlog_tracker;
public:
// The maximum amount of buckets we segregate data into when writing into sstables.
// To prevent an explosion in the number of sstables we cap it.
@@ -155,7 +156,9 @@ public:
virtual std::unique_ptr<sstable_set_impl> make_sstable_set(schema_ptr schema) const override;
virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() override;
virtual compaction_backlog_tracker& get_backlog_tracker() override {
return _backlog_tracker;
}
virtual uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate) override;

View File

@@ -289,8 +289,7 @@ modes = {
'cxxflags': '-DDEBUG -DSANITIZE -DDEBUG_LSA_SANITIZER -DSCYLLA_ENABLE_ERROR_INJECTION',
'cxx_ld_flags': '',
'stack-usage-threshold': 1024*40,
# -fasan -Og breaks some coroutines on aarch64, use -O0 instead
'optimization-level': ('0' if platform.machine() == 'aarch64' else 'g'),
'optimization-level': 'g',
'per_src_extra_cxxflags': {},
'cmake_build_type': 'Debug',
'can_have_debug_info': True,
@@ -910,7 +909,6 @@ scylla_core = (['message/messaging_service.cc',
'utils/config_file.cc',
'utils/multiprecision_int.cc',
'utils/gz/crc_combine.cc',
'utils/gz/crc_combine_table.cc',
'gms/version_generator.cc',
'gms/versioned_value.cc',
'gms/gossiper.cc',
@@ -945,7 +943,6 @@ scylla_core = (['message/messaging_service.cc',
'locator/ec2_snitch.cc',
'locator/ec2_multi_region_snitch.cc',
'locator/gce_snitch.cc',
'locator/topology.cc',
'service/client_state.cc',
'service/storage_service.cc',
'service/misc_services.cc',
@@ -1326,6 +1323,8 @@ deps['test/raft/discovery_test'] = ['test/raft/discovery_test.cc',
'test/lib/log.cc',
'service/raft/discovery.cc'] + scylla_raft_dependencies
deps['utils/gz/gen_crc_combine_table'] = ['utils/gz/gen_crc_combine_table.cc']
warnings = [
'-Wall',
@@ -1414,8 +1413,12 @@ if not has_wasmtime:
has_wasmtime = os.path.isfile('/usr/lib64/libwasmtime.a') and os.path.isdir('/usr/local/include/wasmtime')
if has_wasmtime:
for mode in modes:
modes[mode]['cxxflags'] += ' -DSCYLLA_ENABLE_WASMTIME'
if platform.machine() == 'aarch64':
print("wasmtime is temporarily not supported on aarch64. Ref: issue #9387")
has_wasmtime = False
else:
for mode in modes:
modes[mode]['cxxflags'] += ' -DSCYLLA_ENABLE_WASMTIME'
else:
print("wasmtime not found - WASM support will not be enabled in this build")
@@ -1601,6 +1604,8 @@ if args.target != '':
seastar_cflags += ' -march=' + args.target
seastar_ldflags = args.user_ldflags
libdeflate_cflags = seastar_cflags
# cmake likes to separate things with semicolons
def semicolon_separated(*flags):
# original flags may be space separated, so convert to string still
@@ -1734,7 +1739,6 @@ libs = ' '.join([maybe_static(args.staticyamlcpp, '-lyaml-cpp'), '-latomic', '-l
maybe_static(True, '-lzstd'),
maybe_static(args.staticboost, '-lboost_date_time -lboost_regex -licuuc -licui18n'),
'-lxxhash',
'-ldeflate',
])
if has_wasmtime:
print("Found wasmtime dependency, linking with libwasmtime")
@@ -1945,8 +1949,11 @@ with open(buildfile, 'w') as f:
f.write('build $builddir/{}/{}: ar.{} {}\n'.format(mode, binary, mode, str.join(' ', objs)))
else:
objs.extend(['$builddir/' + mode + '/' + artifact for artifact in [
'libdeflate/libdeflate.a',
] + [
'abseil/' + x for x in abseil_libs
]])
objs.append('$builddir/' + mode + '/gen/utils/gz/crc_combine_table.o')
if binary in tests:
local_libs = '$seastar_libs_{} $libs'.format(mode)
if binary in pure_boost_tests:
@@ -1995,6 +2002,12 @@ with open(buildfile, 'w') as f:
rust_libs[staticlib] = src
else:
raise Exception('No rule for ' + src)
compiles['$builddir/' + mode + '/gen/utils/gz/crc_combine_table.o'] = '$builddir/' + mode + '/gen/utils/gz/crc_combine_table.cc'
compiles['$builddir/' + mode + '/utils/gz/gen_crc_combine_table.o'] = 'utils/gz/gen_crc_combine_table.cc'
f.write('build {}: run {}\n'.format('$builddir/' + mode + '/gen/utils/gz/crc_combine_table.cc',
'$builddir/' + mode + '/utils/gz/gen_crc_combine_table'))
f.write('build {}: link_build.{} {}\n'.format('$builddir/' + mode + '/utils/gz/gen_crc_combine_table', mode,
'$builddir/' + mode + '/utils/gz/gen_crc_combine_table.o'))
f.write(' libs = $seastar_libs_{}\n'.format(mode))
f.write(
'build {mode}-objects: phony {objs}\n'.format(
@@ -2126,16 +2139,24 @@ with open(buildfile, 'w') as f:
f.write(f' mode = {mode}\n')
f.write(f'build $builddir/dist/{mode}/debian: debbuild $builddir/{mode}/dist/tar/{scylla_product}-unstripped-{scylla_version}-{scylla_release}.{arch}.tar.gz\n')
f.write(f' mode = {mode}\n')
f.write(f'build dist-server-{mode}: phony $builddir/dist/{mode}/redhat $builddir/dist/{mode}/debian\n')
f.write(f'build dist-server-{mode}: phony $builddir/dist/{mode}/redhat $builddir/dist/{mode}/debian dist-server-compat-{mode} dist-server-compat-arch-{mode}\n')
f.write(f'build dist-server-compat-{mode}: phony $builddir/{mode}/dist/tar/{scylla_product}-package.tar.gz\n')
f.write(f'build dist-server-compat-arch-{mode}: phony $builddir/{mode}/dist/tar/{scylla_product}-{arch}-package.tar.gz\n')
f.write(f'build dist-server-debuginfo-{mode}: phony $builddir/{mode}/dist/tar/{scylla_product}-debuginfo-{scylla_version}-{scylla_release}.{arch}.tar.gz\n')
f.write(f'build dist-jmx-{mode}: phony $builddir/{mode}/dist/tar/{scylla_product}-jmx-{scylla_version}-{scylla_release}.noarch.tar.gz dist-jmx-rpm dist-jmx-deb\n')
f.write(f'build dist-tools-{mode}: phony $builddir/{mode}/dist/tar/{scylla_product}-tools-{scylla_version}-{scylla_release}.noarch.tar.gz dist-tools-rpm dist-tools-deb\n')
f.write(f'build dist-python3-{mode}: phony dist-python3-tar dist-python3-rpm dist-python3-deb\n')
f.write(f'build dist-unified-{mode}: phony $builddir/{mode}/dist/tar/{scylla_product}-unified-{scylla_version}-{scylla_release}.{arch}.tar.gz\n')
f.write(f'build dist-jmx-{mode}: phony $builddir/{mode}/dist/tar/{scylla_product}-jmx-{scylla_version}-{scylla_release}.noarch.tar.gz dist-jmx-rpm dist-jmx-deb dist-jmx-compat\n')
f.write(f'build dist-tools-{mode}: phony $builddir/{mode}/dist/tar/{scylla_product}-tools-{scylla_version}-{scylla_release}.noarch.tar.gz dist-tools-rpm dist-tools-deb dist-tools-compat\n')
f.write(f'build dist-python3-{mode}: phony dist-python3-tar dist-python3-rpm dist-python3-deb dist-python3-compat dist-python3-compat-arch\n')
f.write(f'build dist-unified-{mode}: phony $builddir/{mode}/dist/tar/{scylla_product}-unified-{scylla_version}-{scylla_release}.{arch}.tar.gz dist-unified-compat-{mode} dist-unified-compat-arch-{mode}\n')
f.write(f'build dist-unified-compat-{mode}: phony $builddir/{mode}/dist/tar/{scylla_product}-unified-package-{scylla_version}-{scylla_release}.tar.gz\n')
f.write(f'build dist-unified-compat-arch-{mode}: phony $builddir/{mode}/dist/tar/{scylla_product}-unified-{arch}-package-{scylla_version}-{scylla_release}.tar.gz\n')
f.write(f'build $builddir/{mode}/dist/tar/{scylla_product}-unified-{scylla_version}-{scylla_release}.{arch}.tar.gz: unified $builddir/{mode}/dist/tar/{scylla_product}-{scylla_version}-{scylla_release}.{arch}.tar.gz $builddir/{mode}/dist/tar/{scylla_product}-python3-{scylla_version}-{scylla_release}.{arch}.tar.gz $builddir/{mode}/dist/tar/{scylla_product}-jmx-{scylla_version}-{scylla_release}.noarch.tar.gz $builddir/{mode}/dist/tar/{scylla_product}-tools-{scylla_version}-{scylla_release}.noarch.tar.gz | always\n')
f.write(f' mode = {mode}\n')
f.write(f'build $builddir/{mode}/dist/tar/{scylla_product}-unified-package-{scylla_version}-{scylla_release}.tar.gz: copy $builddir/{mode}/dist/tar/{scylla_product}-unified-{scylla_version}-{scylla_release}.{arch}.tar.gz\n')
f.write(f'build $builddir/{mode}/dist/tar/{scylla_product}-unified-{arch}-package-{scylla_version}-{scylla_release}.tar.gz: copy $builddir/{mode}/dist/tar/{scylla_product}-unified-{scylla_version}-{scylla_release}.{arch}.tar.gz\n')
f.write('rule libdeflate.{mode}\n'.format(**locals()))
f.write(' command = make -C libdeflate BUILD_DIR=../$builddir/{mode}/libdeflate/ CFLAGS="{libdeflate_cflags}" CC={args.cc} ../$builddir/{mode}/libdeflate//libdeflate.a\n'.format(**locals()))
f.write('build $builddir/{mode}/libdeflate/libdeflate.a: libdeflate.{mode}\n'.format(**locals()))
f.write(' pool = submodule_pool\n')
for lib in abseil_libs:
f.write('build $builddir/{mode}/abseil/{lib}: ninja $builddir/{mode}/abseil/build.ninja\n'.format(**locals()))
@@ -2158,13 +2179,17 @@ with open(buildfile, 'w') as f:
f.write(textwrap.dedent(f'''\
build dist-unified-tar: phony {' '.join([f'$builddir/{mode}/dist/tar/{scylla_product}-unified-{scylla_version}-{scylla_release}.{arch}.tar.gz' for mode in default_modes])}
build dist-unified: phony dist-unified-tar
build dist-unified-compat: phony {' '.join([f'$builddir/{mode}/dist/tar/{scylla_product}-unified-package-{scylla_version}-{scylla_release}.tar.gz' for mode in default_modes])}
build dist-unified-compat-arch: phony {' '.join([f'$builddir/{mode}/dist/tar/{scylla_product}-unified-{arch}-package-{scylla_version}-{scylla_release}.tar.gz' for mode in default_modes])}
build dist-unified: phony dist-unified-tar dist-unified-compat dist-unified-compat-arch
build dist-server-deb: phony {' '.join(['$builddir/dist/{mode}/debian'.format(mode=mode) for mode in build_modes])}
build dist-server-rpm: phony {' '.join(['$builddir/dist/{mode}/redhat'.format(mode=mode) for mode in build_modes])}
build dist-server-tar: phony {' '.join(['$builddir/{mode}/dist/tar/{scylla_product}-{scylla_version}-{scylla_release}.{arch}.tar.gz'.format(mode=mode, scylla_product=scylla_product, arch=arch, scylla_version=scylla_version, scylla_release=scylla_release) for mode in default_modes])}
build dist-server-debuginfo: phony {' '.join(['$builddir/{mode}/dist/tar/{scylla_product}-debuginfo-{scylla_version}-{scylla_release}.{arch}.tar.gz'.format(mode=mode, scylla_product=scylla_product, arch=arch, scylla_version=scylla_version, scylla_release=scylla_release) for mode in default_modes])}
build dist-server: phony dist-server-tar dist-server-debuginfo dist-server-rpm dist-server-deb
build dist-server-compat: phony {' '.join(['$builddir/{mode}/dist/tar/{scylla_product}-package.tar.gz'.format(mode=mode, scylla_product=scylla_product, arch=arch) for mode in default_modes])}
build dist-server-compat-arch: phony {' '.join(['$builddir/{mode}/dist/tar/{scylla_product}-{arch}-package.tar.gz'.format(mode=mode, scylla_product=scylla_product, arch=arch) for mode in default_modes])}
build dist-server: phony dist-server-tar dist-server-debuginfo dist-server-compat dist-server-compat-arch dist-server-rpm dist-server-deb
rule build-submodule-reloc
command = cd $reloc_dir && ./reloc/build_reloc.sh --version $$(<../../build/SCYLLA-PRODUCT-FILE)-$$(sed 's/-/~/' <../../build/SCYLLA-VERSION-FILE)-$$(<../../build/SCYLLA-RELEASE-FILE) --nodeps $args
@@ -2182,7 +2207,8 @@ with open(buildfile, 'w') as f:
dir = tools/jmx
artifact = $builddir/{scylla_product}-jmx-{scylla_version}-{scylla_release}.noarch.tar.gz
build dist-jmx-tar: phony {' '.join(['$builddir/{mode}/dist/tar/{scylla_product}-jmx-{scylla_version}-{scylla_release}.noarch.tar.gz'.format(mode=mode, scylla_product=scylla_product, scylla_version=scylla_version, scylla_release=scylla_release) for mode in default_modes])}
build dist-jmx: phony dist-jmx-tar dist-jmx-rpm dist-jmx-deb
build dist-jmx-compat: phony {' '.join(['$builddir/{mode}/dist/tar/{scylla_product}-jmx-package.tar.gz'.format(mode=mode, scylla_product=scylla_product, arch=arch) for mode in default_modes])}
build dist-jmx: phony dist-jmx-tar dist-jmx-compat dist-jmx-rpm dist-jmx-deb
build tools/java/build/{scylla_product}-tools-{scylla_version}-{scylla_release}.noarch.tar.gz: build-submodule-reloc | build/SCYLLA-PRODUCT-FILE build/SCYLLA-VERSION-FILE build/SCYLLA-RELEASE-FILE
reloc_dir = tools/java
@@ -2193,7 +2219,8 @@ with open(buildfile, 'w') as f:
dir = tools/java
artifact = $builddir/{scylla_product}-tools-{scylla_version}-{scylla_release}.noarch.tar.gz
build dist-tools-tar: phony {' '.join(['$builddir/{mode}/dist/tar/{scylla_product}-tools-{scylla_version}-{scylla_release}.noarch.tar.gz'.format(mode=mode, scylla_product=scylla_product, scylla_version=scylla_version, scylla_release=scylla_release) for mode in default_modes])}
build dist-tools: phony dist-tools-tar dist-tools-rpm dist-tools-deb
build dist-tools-compat: phony {' '.join(['$builddir/{mode}/dist/tar/{scylla_product}-tools-package.tar.gz'.format(mode=mode, scylla_product=scylla_product, arch=arch) for mode in default_modes])}
build dist-tools: phony dist-tools-tar dist-tools-compat dist-tools-rpm dist-tools-deb
build tools/python3/build/{scylla_product}-python3-{scylla_version}-{scylla_release}.{arch}.tar.gz: build-submodule-reloc | build/SCYLLA-PRODUCT-FILE build/SCYLLA-VERSION-FILE build/SCYLLA-RELEASE-FILE
reloc_dir = tools/python3
@@ -2205,10 +2232,14 @@ with open(buildfile, 'w') as f:
dir = tools/python3
artifact = $builddir/{scylla_product}-python3-{scylla_version}-{scylla_release}.{arch}.tar.gz
build dist-python3-tar: phony {' '.join(['$builddir/{mode}/dist/tar/{scylla_product}-python3-{scylla_version}-{scylla_release}.{arch}.tar.gz'.format(mode=mode, scylla_product=scylla_product, arch=arch, scylla_version=scylla_version, scylla_release=scylla_release) for mode in default_modes])}
build dist-python3: phony dist-python3-tar dist-python3-rpm dist-python3-deb
build dist-python3-compat: phony {' '.join(['$builddir/{mode}/dist/tar/{scylla_product}-python3-package.tar.gz'.format(mode=mode, scylla_product=scylla_product, arch=arch) for mode in default_modes])}
build dist-python3-compat-arch: phony {' '.join(['$builddir/{mode}/dist/tar/{scylla_product}-python3-{arch}-package.tar.gz'.format(mode=mode, scylla_product=scylla_product, arch=arch) for mode in default_modes])}
build dist-python3: phony dist-python3-tar dist-python3-compat dist-python3-compat-arch dist-python3-rpm dist-python3-deb
build dist-deb: phony dist-server-deb dist-python3-deb dist-jmx-deb dist-tools-deb
build dist-rpm: phony dist-server-rpm dist-python3-rpm dist-jmx-rpm dist-tools-rpm
build dist-tar: phony dist-unified-tar dist-server-tar dist-python3-tar dist-jmx-tar dist-tools-tar
build dist-compat: phony dist-unified-compat dist-server-compat dist-python3-compat
build dist-compat-arch: phony dist-unified-compat-arch dist-server-compat-arch dist-python3-compat-arch
build dist: phony dist-unified dist-server dist-python3 dist-jmx dist-tools
'''))

View File

@@ -1419,7 +1419,7 @@ serviceLevelOrRoleName returns [sstring name]
std::transform($name.begin(), $name.end(), $name.begin(), ::tolower); }
| t=STRING_LITERAL { $name = sstring($t.text); }
| t=QUOTED_NAME { $name = sstring($t.text); }
| k=unreserved_keyword { $name = k;
| k=unreserved_keyword { $name = sstring($t.text);
std::transform($name.begin(), $name.end(), $name.begin(), ::tolower);}
| QMARK {add_recognition_error("Bind variables cannot be used for service levels or role names");}
;

View File

@@ -216,95 +216,36 @@ get_value(const subscript& s, const evaluation_inputs& inputs) {
}
}
// This class represents a value that can be one of three things:
// false, true or null.
// It could be represented by std::optional<bool>, but optional
// can be implicitly casted to bool, which might cause mistakes.
// (bool)(std::make_optional<bool>(false)) will return true,
// despite the fact that the represented value is `false`.
// To avoid any such problems this class is introduced
// along with the is_true() method, which can be used
// to check if the value held is indeed `true`.
class bool_or_null {
std::optional<bool> value;
public:
bool_or_null(bool val) : value(val) {}
bool_or_null(null_value) : value(std::nullopt) {}
static bool_or_null null() {
return bool_or_null(null_value{});
}
bool has_value() const {
return value.has_value();
}
bool is_null() const {
return !has_value();
}
const bool& get_value() const {
return *value;
}
const bool is_true() const {
return has_value() && get_value();
}
};
/// True iff lhs's value equals rhs.
bool_or_null equal(const expression& lhs, const managed_bytes_opt& rhs_bytes, const evaluation_inputs& inputs) {
raw_value lhs_value = evaluate(lhs, inputs);
if (lhs_value.is_unset_value()) {
throw exceptions::invalid_request_exception("unset value found on left-hand side of an equality operator");
bool equal(const expression& lhs, const managed_bytes_opt& rhs, const evaluation_inputs& inputs) {
if (!rhs) {
return false;
}
if (lhs_value.is_null() || !rhs_bytes.has_value()) {
return bool_or_null::null();
const auto value = evaluate(lhs, inputs).to_managed_bytes_opt();
if (!value) {
return false;
}
managed_bytes lhs_bytes = std::move(lhs_value).to_managed_bytes();
return type_of(lhs)->equal(managed_bytes_view(lhs_bytes), managed_bytes_view(*rhs_bytes));
return type_of(lhs)->equal(managed_bytes_view(*value), managed_bytes_view(*rhs));
}
static std::optional<std::pair<managed_bytes, managed_bytes>> evaluate_binop_sides(const expression& lhs,
const expression& rhs,
const oper_t op,
const evaluation_inputs& inputs) {
raw_value lhs_value = evaluate(lhs, inputs);
raw_value rhs_value = evaluate(rhs, inputs);
/// Convenience overload for expression.
bool equal(const expression& lhs, const expression& rhs, const evaluation_inputs& inputs) {
return equal(lhs, evaluate(rhs, inputs).to_managed_bytes_opt(), inputs);
}
if (lhs_value.is_unset_value()) {
/// True iff columns' values equal t.
bool equal(const tuple_constructor& columns_tuple_lhs, const expression& t_rhs, const evaluation_inputs& inputs) {
const cql3::raw_value tup = evaluate(t_rhs, inputs);
const auto& rhs = get_tuple_elements(tup, *type_of(t_rhs));
if (rhs.size() != columns_tuple_lhs.elements.size()) {
throw exceptions::invalid_request_exception(
format("unset value found on left-hand side of a binary operator with operation {}", op));
format("tuple equality size mismatch: {} elements on left-hand side, {} on right",
columns_tuple_lhs.elements.size(), rhs.size()));
}
if (rhs_value.is_unset_value()) {
throw exceptions::invalid_request_exception(
format("unset value found on right-hand side of a binary operator with operation {}", op));
}
if (lhs_value.is_null() || rhs_value.is_null()) {
return std::nullopt;
}
managed_bytes lhs_bytes = std::move(lhs_value).to_managed_bytes();
managed_bytes rhs_bytes = std::move(rhs_value).to_managed_bytes();
return std::pair(std::move(lhs_bytes), std::move(rhs_bytes));
}
bool_or_null equal(const expression& lhs, const expression& rhs, const evaluation_inputs& inputs) {
std::optional<std::pair<managed_bytes, managed_bytes>> sides_bytes =
evaluate_binop_sides(lhs, rhs, oper_t::EQ, inputs);
if (!sides_bytes.has_value()) {
return bool_or_null::null();
}
auto [lhs_bytes, rhs_bytes] = std::move(*sides_bytes);
return type_of(lhs)->equal(managed_bytes_view(lhs_bytes), managed_bytes_view(rhs_bytes));
}
bool_or_null not_equal(const expression& lhs, const expression& rhs, const evaluation_inputs& inputs) {
std::optional<std::pair<managed_bytes, managed_bytes>> sides_bytes =
evaluate_binop_sides(lhs, rhs, oper_t::NEQ, inputs);
if (!sides_bytes.has_value()) {
return bool_or_null::null();
}
auto [lhs_bytes, rhs_bytes] = std::move(*sides_bytes);
return !type_of(lhs)->equal(managed_bytes_view(lhs_bytes), managed_bytes_view(rhs_bytes));
return boost::equal(columns_tuple_lhs.elements, rhs,
[&] (const expression& lhs, const managed_bytes_opt& b) {
return equal(lhs, b, inputs);
});
}
/// True iff lhs is limited by rhs in the manner prescribed by op.
@@ -329,77 +270,127 @@ bool limits(managed_bytes_view lhs, oper_t op, managed_bytes_view rhs, const abs
}
/// True iff the column value is limited by rhs in the manner prescribed by op.
bool_or_null limits(const expression& lhs, oper_t op, const expression& rhs, const evaluation_inputs& inputs) {
bool limits(const expression& col, oper_t op, const expression& rhs, const evaluation_inputs& inputs) {
if (!is_slice(op)) { // For EQ or NEQ, use equal().
throw std::logic_error("limits() called on non-slice op");
}
std::optional<std::pair<managed_bytes, managed_bytes>> sides_bytes =
evaluate_binop_sides(lhs, rhs, op, inputs);
if (!sides_bytes.has_value()) {
return bool_or_null::null();
auto lhs = evaluate(col, inputs).to_managed_bytes_opt();
if (!lhs) {
return false;
}
auto [lhs_bytes, rhs_bytes] = std::move(*sides_bytes);
const auto b = evaluate(rhs, inputs).to_managed_bytes_opt();
return b ? limits(*lhs, op, *b, type_of(col)->without_reversed()) : false;
}
return limits(lhs_bytes, op, rhs_bytes, type_of(lhs)->without_reversed());
/// True iff the column values are limited by t in the manner prescribed by op.
bool limits(const tuple_constructor& columns_tuple, const oper_t op, const expression& e,
const evaluation_inputs& inputs) {
if (!is_slice(op)) { // For EQ or NEQ, use equal().
throw std::logic_error("limits() called on non-slice op");
}
const cql3::raw_value tup = evaluate(e, inputs);
const auto& rhs = get_tuple_elements(tup, *type_of(e));
if (rhs.size() != columns_tuple.elements.size()) {
throw exceptions::invalid_request_exception(
format("tuple comparison size mismatch: {} elements on left-hand side, {} on right",
columns_tuple.elements.size(), rhs.size()));
}
for (size_t i = 0; i < rhs.size(); ++i) {
auto& cv = columns_tuple.elements[i];
auto lhs = evaluate(cv, inputs).to_managed_bytes_opt();
if (!lhs || !rhs[i]) {
// CQL dictates that columns_tuple.elements[i] is a clustering column and non-null, but
// let's not rely on grammar constraints that can be later relaxed.
//
// NULL = always fails comparison
return false;
}
const auto cmp = type_of(cv)->without_reversed().compare(
*lhs,
*rhs[i]);
// If the components aren't equal, then we just learned the LHS/RHS order.
if (cmp < 0) {
if (op == oper_t::LT || op == oper_t::LTE) {
return true;
} else if (op == oper_t::GT || op == oper_t::GTE) {
return false;
} else {
throw std::logic_error("Unknown slice operator");
}
} else if (cmp > 0) {
if (op == oper_t::LT || op == oper_t::LTE) {
return false;
} else if (op == oper_t::GT || op == oper_t::GTE) {
return true;
} else {
throw std::logic_error("Unknown slice operator");
}
}
// Otherwise, we don't know the LHS/RHS order, so check the next component.
}
// Getting here means LHS == RHS.
return op == oper_t::LTE || op == oper_t::GTE;
}
/// True iff collection (list, set, or map) contains value.
bool contains(const data_value& collection, const raw_value_view& value) {
if (!value) {
// CONTAINS NULL should evaluate to NULL/false
return false;
}
auto col_type = static_pointer_cast<const collection_type_impl>(collection.type());
auto&& element_type = col_type->is_set() ? col_type->name_comparator() : col_type->value_comparator();
return value.with_linearized([&] (bytes_view val) {
auto exists_in = [&](auto&& range) {
auto found = std::find_if(range.begin(), range.end(), [&] (auto&& element) {
return element_type->compare(element.serialize_nonnull(), val) == 0;
});
return found != range.end();
};
if (col_type->is_list()) {
return exists_in(value_cast<list_type_impl::native_type>(collection));
} else if (col_type->is_set()) {
return exists_in(value_cast<set_type_impl::native_type>(collection));
} else if (col_type->is_map()) {
auto data_map = value_cast<map_type_impl::native_type>(collection);
using entry = std::pair<data_value, data_value>;
return exists_in(data_map | transformed([] (const entry& e) { return e.second; }));
} else {
throw std::logic_error("unsupported collection type in a CONTAINS expression");
}
});
}
/// True iff a column is a collection containing value.
bool_or_null contains(const expression& lhs, const expression& rhs, const evaluation_inputs& inputs) {
std::optional<std::pair<managed_bytes, managed_bytes>> sides_bytes =
evaluate_binop_sides(lhs, rhs, oper_t::CONTAINS, inputs);
if (!sides_bytes.has_value()) {
return bool_or_null::null();
}
const abstract_type& lhs_type = type_of(lhs)->without_reversed();
data_value lhs_collection = lhs_type.deserialize(managed_bytes_view(sides_bytes->first));
const collection_type_impl* collection_type = dynamic_cast<const collection_type_impl*>(&lhs_type);
data_type element_type =
collection_type->is_set() ? collection_type->name_comparator() : collection_type->value_comparator();
auto exists_in = [&](auto&& range) {
auto found = std::find_if(range.begin(), range.end(), [&](auto&& element) {
return element_type->compare(managed_bytes_view(element.serialize_nonnull()), sides_bytes->second) == 0;
});
return found != range.end();
};
if (collection_type->is_list()) {
return exists_in(value_cast<list_type_impl::native_type>(lhs_collection));
} else if (collection_type->is_set()) {
return exists_in(value_cast<set_type_impl::native_type>(lhs_collection));
} else if (collection_type->is_map()) {
auto data_map = value_cast<map_type_impl::native_type>(lhs_collection);
using entry = std::pair<data_value, data_value>;
return exists_in(data_map | transformed([](const entry& e) { return e.second; }));
bool contains(const column_value& col, const raw_value_view& value, const evaluation_inputs& inputs) {
const auto collection = get_value(col, inputs);
if (collection) {
return contains(col.col->type->deserialize(managed_bytes_view(*collection)), value);
} else {
on_internal_error(expr_logger, "unsupported collection type in a CONTAINS expression");
return false;
}
}
/// True iff a column is a map containing \p key.
bool_or_null contains_key(const expression& lhs, const expression& rhs, const evaluation_inputs& inputs) {
std::optional<std::pair<managed_bytes, managed_bytes>> sides_bytes =
evaluate_binop_sides(lhs, rhs, oper_t::CONTAINS_KEY, inputs);
if (!sides_bytes.has_value()) {
return bool_or_null::null();
bool contains_key(const column_value& col, cql3::raw_value_view key, const evaluation_inputs& inputs) {
if (!key) {
// CONTAINS_KEY NULL should evaluate to NULL/false
return false;
}
auto [lhs_bytes, rhs_bytes] = std::move(*sides_bytes);
data_type lhs_type = type_of(lhs);
const map_type_impl::native_type data_map =
value_cast<map_type_impl::native_type>(lhs_type->deserialize(managed_bytes_view(lhs_bytes)));
data_type key_type = static_pointer_cast<const collection_type_impl>(lhs_type)->name_comparator();
for (const std::pair<data_value, data_value>& map_element : data_map) {
bytes serialized_element_key = map_element.first.serialize_nonnull();
if (key_type->compare(managed_bytes_view(rhs_bytes), managed_bytes_view(bytes_view(serialized_element_key))) ==
0) {
return true;
};
auto type = col.col->type;
const auto collection = get_value(col, inputs);
if (!collection) {
return false;
}
return false;
const auto data_map = value_cast<map_type_impl::native_type>(type->deserialize(managed_bytes_view(*collection)));
auto key_type = static_pointer_cast<const collection_type_impl>(type)->name_comparator();
auto found = key.with_linearized([&] (bytes_view k_bv) {
using entry = std::pair<data_value, data_value>;
return std::find_if(data_map.begin(), data_map.end(), [&] (const entry& element) {
return key_type->compare(element.first.serialize_nonnull(), k_bv) == 0;
});
});
return found != data_map.end();
}
/// Fetches the next cell value from iter and returns its (possibly null) value.
@@ -448,62 +439,44 @@ std::vector<managed_bytes_opt> get_non_pk_values(const selection& selection, con
namespace {
/// True iff cv matches the CQL LIKE pattern.
bool_or_null like(const expression& lhs, const expression& rhs, const evaluation_inputs& inputs) {
data_type lhs_type = type_of(lhs)->underlying_type();
if (!lhs_type->is_string()) {
expression::printer lhs_printer {
.expr_to_print = lhs,
.debug_mode = false
};
bool like(const column_value& cv, const raw_value_view& pattern, const evaluation_inputs& inputs) {
if (!cv.col->type->is_string()) {
throw exceptions::invalid_request_exception(
format("LIKE is allowed only on string types, which {} is not", lhs_printer));
format("LIKE is allowed only on string types, which {} is not", cv.col->name_as_text()));
}
std::optional<std::pair<managed_bytes, managed_bytes>> sides_bytes =
evaluate_binop_sides(lhs, rhs, oper_t::LIKE, inputs);
if (!sides_bytes.has_value()) {
return bool_or_null::null();
auto value = get_value(cv, inputs);
// TODO: reuse matchers.
if (pattern && value) {
return value->with_linearized([&pattern] (bytes_view linearized_value) {
return pattern.with_linearized([linearized_value] (bytes_view linearized_pattern) {
return like_matcher(linearized_pattern)(linearized_value);
});
});
} else {
return false;
}
auto [lhs_managed_bytes, rhs_managed_bytes] = std::move(*sides_bytes);
bytes lhs_bytes = to_bytes(lhs_managed_bytes);
bytes rhs_bytes = to_bytes(rhs_managed_bytes);
return like_matcher(bytes_view(rhs_bytes))(bytes_view(lhs_bytes));
}
/// True iff the column value is in the set defined by rhs.
bool_or_null is_one_of(const expression& lhs, const expression& rhs, const evaluation_inputs& inputs) {
std::optional<std::pair<managed_bytes, managed_bytes>> sides_bytes =
evaluate_binop_sides(lhs, rhs, oper_t::IN, inputs);
if (!sides_bytes.has_value()) {
return bool_or_null::null();
bool is_one_of(const expression& col, const expression& rhs, const evaluation_inputs& inputs) {
const cql3::raw_value in_list = evaluate(rhs, inputs);
if (in_list.is_null()) {
return false;
}
auto [lhs_bytes, rhs_bytes] = std::move(*sides_bytes);
expression lhs_constant = constant(raw_value::make_value(std::move(lhs_bytes)), type_of(lhs));
utils::chunked_vector<managed_bytes> list_elems = get_list_elements(raw_value::make_value(std::move(rhs_bytes)));
for (const managed_bytes& elem : list_elems) {
if (equal(lhs_constant, elem, evaluation_inputs{}).is_true()) {
return true;
}
}
return false;
return boost::algorithm::any_of(get_list_elements(in_list), [&] (const managed_bytes_opt& b) {
return equal(col, b, inputs);
});
}
bool is_not_null(const expression& lhs, const expression& rhs, const evaluation_inputs& inputs) {
cql3::raw_value lhs_val = evaluate(lhs, inputs);
if (lhs_val.is_unset_value()) {
throw exceptions::invalid_request_exception("unset value found on left hand side of IS NOT operator");
}
cql3::raw_value rhs_val = evaluate(rhs, inputs);
if (rhs_val.is_unset_value()) {
throw exceptions::invalid_request_exception("unset value found on right hand side of IS NOT operator");
}
if (!rhs_val.is_null()) {
throw exceptions::invalid_request_exception("IS NOT operator accepts only NULL as its right side");
}
return !lhs_val.is_null();
/// True iff the tuple of column values is in the set defined by rhs.
bool is_one_of(const tuple_constructor& tuple, const expression& rhs, const evaluation_inputs& inputs) {
cql3::raw_value in_list = evaluate(rhs, inputs);
return boost::algorithm::any_of(get_list_of_tuples_elements(in_list, *type_of(rhs)), [&] (const std::vector<managed_bytes_opt>& el) {
return boost::equal(tuple.elements, el, [&] (const expression& c, const managed_bytes_opt& b) {
return equal(c, b, inputs);
});
});
}
const value_set empty_value_set = value_list{};
@@ -538,30 +511,105 @@ value_set intersection(value_set a, value_set b, const abstract_type* type) {
}
bool is_satisfied_by(const binary_operator& opr, const evaluation_inputs& inputs) {
if (is<token>(opr.lhs)) {
// The RHS value was already used to ensure we fetch only rows in the specified
// token range. It is impossible for any fetched row not to match now.
// When token restrictions are present we forbid all other restrictions on partition key.
// This means that the partition range is defined solely by restrictions on token.
// When is_satisifed_by is used by filtering we can be sure that the token restrictions
// are fulfilled. In the future it will be possible to evaluate() a token,
// and we will be able to get rid of this risky if.
return true;
}
raw_value binop_eval_result = evaluate(opr, inputs);
if (binop_eval_result.is_null()) {
return false;
}
if (binop_eval_result.is_unset_value()) {
on_internal_error(expr_logger, format("is_satisfied_by: binary operator evaluated to unset value: {}", opr));
}
if (binop_eval_result.is_empty_value()) {
on_internal_error(expr_logger, format("is_satisfied_by: binary operator evaluated to EMPTY_VALUE: {}", opr));
}
return binop_eval_result.view().deserialize<bool>(*boolean_type);
return expr::visit(overloaded_functor{
[&] (const column_value& col) {
if (opr.op == oper_t::EQ) {
return equal(col, opr.rhs, inputs);
} else if (opr.op == oper_t::NEQ) {
return !equal(col, opr.rhs, inputs);
} else if (is_slice(opr.op)) {
return limits(col, opr.op, opr.rhs, inputs);
} else if (opr.op == oper_t::CONTAINS) {
cql3::raw_value val = evaluate(opr.rhs, inputs);
return contains(col, val.view(), inputs);
} else if (opr.op == oper_t::CONTAINS_KEY) {
cql3::raw_value val = evaluate(opr.rhs, inputs);
return contains_key(col, val.view(), inputs);
} else if (opr.op == oper_t::LIKE) {
cql3::raw_value val = evaluate(opr.rhs, inputs);
return like(col, val.view(), inputs);
} else if (opr.op == oper_t::IN) {
return is_one_of(col, opr.rhs, inputs);
} else {
throw exceptions::unsupported_operation_exception(format("Unhandled binary_operator: {}", opr));
}
},
[&] (const subscript& sub) {
if (opr.op == oper_t::EQ) {
return equal(sub, opr.rhs, inputs);
} else if (opr.op == oper_t::NEQ) {
return !equal(sub, opr.rhs, inputs);
} else if (is_slice(opr.op)) {
return limits(sub, opr.op, opr.rhs, inputs);
} else if (opr.op == oper_t::CONTAINS) {
throw exceptions::unsupported_operation_exception("CONTAINS lhs is subscripted");
} else if (opr.op == oper_t::CONTAINS_KEY) {
throw exceptions::unsupported_operation_exception("CONTAINS KEY lhs is subscripted");
} else if (opr.op == oper_t::LIKE) {
throw exceptions::unsupported_operation_exception("LIKE lhs is subscripted");
} else if (opr.op == oper_t::IN) {
return is_one_of(sub, opr.rhs, inputs);
} else {
throw exceptions::unsupported_operation_exception(format("Unhandled binary_operator: {}", opr));
}
},
[&] (const tuple_constructor& cvs) {
if (opr.op == oper_t::EQ) {
return equal(cvs, opr.rhs, inputs);
} else if (is_slice(opr.op)) {
return limits(cvs, opr.op, opr.rhs, inputs);
} else if (opr.op == oper_t::IN) {
return is_one_of(cvs, opr.rhs, inputs);
} else {
throw exceptions::unsupported_operation_exception(
format("Unhandled multi-column binary_operator: {}", opr));
}
},
[] (const token& tok) -> bool {
// The RHS value was already used to ensure we fetch only rows in the specified
// token range. It is impossible for any fetched row not to match now.
return true;
},
[] (const constant&) -> bool {
on_internal_error(expr_logger, "is_satisfied_by: A constant cannot serve as the LHS of a binary expression");
},
[] (const conjunction&) -> bool {
on_internal_error(expr_logger, "is_satisfied_by: a conjunction cannot serve as the LHS of a binary expression");
},
[] (const binary_operator&) -> bool {
on_internal_error(expr_logger, "is_satisfied_by: binary operators cannot be nested");
},
[] (const unresolved_identifier&) -> bool {
on_internal_error(expr_logger, "is_satisfied_by: an unresolved identifier cannot serve as the LHS of a binary expression");
},
[] (const column_mutation_attribute&) -> bool {
on_internal_error(expr_logger, "is_satisified_by: column_mutation_attribute cannot serve as the LHS of a binary expression");
},
[] (const function_call&) -> bool {
on_internal_error(expr_logger, "is_satisified_by: function_call cannot serve as the LHS of a binary expression");
},
[] (const cast&) -> bool {
on_internal_error(expr_logger, "is_satisified_by: cast cannot serve as the LHS of a binary expression");
},
[] (const field_selection&) -> bool {
on_internal_error(expr_logger, "is_satisified_by: field_selection cannot serve as the LHS of a binary expression");
},
[] (const null&) -> bool {
on_internal_error(expr_logger, "is_satisified_by: null cannot serve as the LHS of a binary expression");
},
[] (const bind_variable&) -> bool {
on_internal_error(expr_logger, "is_satisified_by: bind_variable cannot serve as the LHS of a binary expression");
},
[] (const untyped_constant&) -> bool {
on_internal_error(expr_logger, "is_satisified_by: untyped_constant cannot serve as the LHS of a binary expression");
},
[] (const collection_constructor&) -> bool {
on_internal_error(expr_logger, "is_satisified_by: collection_constructor cannot serve as the LHS of a binary expression");
},
[] (const usertype_constructor&) -> bool {
on_internal_error(expr_logger, "is_satisified_by: usertype_constructor cannot serve as the LHS of a binary expression");
},
}, opr.lhs);
}
} // anonymous namespace
@@ -1675,54 +1723,10 @@ std::optional<bool> get_bool_value(const constant& constant_val) {
return constant_val.view().deserialize<bool>(*boolean_type);
}
cql3::raw_value evaluate(const binary_operator& binop, const evaluation_inputs& inputs) {
if (binop.order == comparison_order::clustering) {
throw exceptions::invalid_request_exception("Can't evaluate a binary operator with SCYLLA_CLUSTERING_BOUND");
}
bool_or_null binop_result(false);
switch (binop.op) {
case oper_t::EQ:
binop_result = equal(binop.lhs, binop.rhs, inputs);
break;
case oper_t::NEQ:
binop_result = not_equal(binop.lhs, binop.rhs, inputs);
break;
case oper_t::LT:
case oper_t::LTE:
case oper_t::GT:
case oper_t::GTE:
binop_result = limits(binop.lhs, binop.op, binop.rhs, inputs);
break;
case oper_t::CONTAINS:
binop_result = contains(binop.lhs, binop.rhs, inputs);
break;
case oper_t::CONTAINS_KEY:
binop_result = contains_key(binop.lhs, binop.rhs, inputs);
break;
case oper_t::LIKE:
binop_result = like(binop.lhs, binop.rhs, inputs);
break;
case oper_t::IN:
binop_result = is_one_of(binop.lhs, binop.rhs, inputs);
break;
case oper_t::IS_NOT:
binop_result = is_not_null(binop.lhs, binop.rhs, inputs);
break;
};
if (binop_result.is_null()) {
return raw_value::make_null();
}
return raw_value::make_value(boolean_type->decompose(binop_result.get_value()));
}
cql3::raw_value evaluate(const expression& e, const evaluation_inputs& inputs) {
return expr::visit(overloaded_functor {
[&](const binary_operator& binop) -> cql3::raw_value {
return evaluate(binop, inputs);
[](const binary_operator&) -> cql3::raw_value {
on_internal_error(expr_logger, "Can't evaluate a binary_operator");
},
[](const conjunction&) -> cql3::raw_value {
on_internal_error(expr_logger, "Can't evaluate a conjunction");

View File

@@ -246,21 +246,6 @@ map_prepare_expression(const collection_constructor& c, data_dictionary::databas
auto key_spec = maps::key_spec_of(*receiver);
auto value_spec = maps::value_spec_of(*receiver);
const map_type_impl* map_type = dynamic_cast<const map_type_impl*>(&receiver->type->without_reversed());
if (map_type == nullptr) {
on_internal_error(expr_logger,
format("map_prepare_expression bad non-map receiver type: {}", receiver->type->name()));
}
data_type map_element_tuple_type = tuple_type_impl::get_instance({map_type->get_keys_type(), map_type->get_values_type()});
// In Cassandra, an empty (unfrozen) map/set/list is equivalent to the column being null. In
// other words a non-frozen collection only exists if it has elements. Return nullptr right
// away to simplify predicate evaluation. See also
// https://issues.apache.org/jira/browse/CASSANDRA-5141
if (map_type->is_multi_cell() && c.elements.empty()) {
return constant::make_null(receiver->type);
}
std::vector<expression> values;
values.reserve(c.elements.size());
bool all_terminal = true;
@@ -279,7 +264,7 @@ map_prepare_expression(const collection_constructor& c, data_dictionary::databas
values.emplace_back(tuple_constructor {
.elements = {std::move(k), std::move(v)},
.type = map_element_tuple_type
.type = entry_tuple.type
});
}
@@ -702,13 +687,9 @@ bind_variable_test_assignment(const bind_variable& bv, data_dictionary::database
}
static
std::optional<bind_variable>
bind_variable
bind_variable_prepare_expression(const bind_variable& bv, data_dictionary::database db, const sstring& keyspace, lw_shared_ptr<column_specification> receiver)
{
if (!receiver) {
return std::nullopt;
}
return bind_variable {
.bind_index = bv.bind_index,
.receiver = receiver

View File

@@ -777,19 +777,6 @@ bool statement_restrictions::has_unrestricted_clustering_columns() const {
return clustering_columns_restrictions_size() < _schema->clustering_key_size();
}
const column_definition& statement_restrictions::unrestricted_column(column_kind kind) const {
const auto& restrictions = get_restrictions(kind);
const auto sorted_cols = expr::get_sorted_column_defs(restrictions);
for (size_t i = 0, count = _schema->columns_count(kind); i < count; ++i) {
if (i >= sorted_cols.size() || sorted_cols[i]->component_index() != i) {
return _schema->column_at(kind, i);
}
}
on_internal_error(rlogger, format(
"no missing columns with kind {} found in expression {}",
to_sstring(kind), restrictions));
};
bool statement_restrictions::clustering_columns_restrictions_have_supporting_index(
const secondary_index::secondary_index_manager& index_manager,
expr::allow_local_index allow_local) const {
@@ -1942,28 +1929,15 @@ sstring statement_restrictions::to_string() const {
return _where ? expr::to_string(*_where) : "";
}
static void validate_primary_key_restrictions(const query_options& options, const std::vector<expr::expression>& restrictions) {
for (const auto& r: restrictions) {
for_each_expression<binary_operator>(r, [&](const binary_operator& binop) {
if (binop.op != oper_t::EQ && binop.op != oper_t::IN) {
return;
}
const auto* c = as_if<column_value>(&binop.lhs);
if (!c) {
return;
}
if (evaluate(binop.rhs, options).is_null()) {
throw exceptions::invalid_request_exception(format("Invalid null value in condition for column {}",
c->col->name_as_text()));
}
});
}
static bool has_eq_null(const query_options& options, const expression& expr) {
return find_binop(expr, [&] (const binary_operator& binop) {
return binop.op == oper_t::EQ && evaluate(binop.rhs, options).is_null();
});
}
void statement_restrictions::validate_primary_key(const query_options& options) const {
validate_primary_key_restrictions(options, _partition_range_restrictions);
validate_primary_key_restrictions(options, _clustering_prefix_restrictions);
bool statement_restrictions::range_or_slice_eq_null(const query_options& options) const {
return boost::algorithm::any_of(_partition_range_restrictions, std::bind_front(has_eq_null, std::cref(options)))
|| boost::algorithm::any_of(_clustering_prefix_restrictions, std::bind_front(has_eq_null, std::cref(options)));
}
} // namespace restrictions
} // namespace cql3

View File

@@ -240,15 +240,6 @@ public:
* @return <code>true</code> if the clustering key has some unrestricted components, <code>false</code> otherwise.
*/
bool has_unrestricted_clustering_columns() const;
/**
* Returns the first unrestricted column for restrictions of the specified kind.
* It's an error to call this function if there are no such columns.
*
* @param kind supported values are column_kind::partition_key and column_kind::clustering_key;
* @return the <code>column_definition</code> for the unrestricted column.
*/
const column_definition& unrestricted_column(column_kind kind) const;
private:
void add_restriction(const expr::binary_operator& restr, schema_ptr schema, bool allow_filtering, bool for_view);
void add_is_not_restriction(const expr::binary_operator& restr, schema_ptr schema, bool for_view);
@@ -534,8 +525,8 @@ public:
sstring to_string() const;
/// Checks that the primary key restrictions don't contain null values, throws invalid_request_exception otherwise.
void validate_primary_key(const query_options& options) const;
/// True iff the partition range or slice is empty specifically due to a =NULL restriction.
bool range_or_slice_eq_null(const query_options& options) const;
};
}

View File

@@ -435,7 +435,7 @@ bool result_set_builder::restrictions_filter::do_filter(const selection& selecti
clustering_key_prefix ckey = clustering_key_prefix::from_exploded(clustering_key);
// FIXME: push to upper layer so it happens once per row
auto static_and_regular_columns = expr::get_non_pk_values(selection, static_row, row);
bool multi_col_clustering_satisfied = expr::is_satisfied_by(
return expr::is_satisfied_by(
clustering_columns_restrictions,
expr::evaluation_inputs{
.partition_key = &partition_key,
@@ -444,9 +444,6 @@ bool result_set_builder::restrictions_filter::do_filter(const selection& selecti
.selection = &selection,
.options = &_options,
});
if (!multi_col_clustering_satisfied) {
return false;
}
}
auto static_row_iterator = static_row.iterator();

View File

@@ -261,10 +261,6 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::do_
if (options.getSerialConsistency() == null)
throw new InvalidRequestException("Invalid empty serial consistency level");
#endif
for (size_t i = 0; i < _statements.size(); ++i) {
_statements[i].statement->restrictions().validate_primary_key(options.for_statement(i));
}
if (_has_conditions) {
++_stats.cas_batches;
_stats.statements_in_cas_batches += _statements.size();

View File

@@ -61,8 +61,8 @@ static std::map<sstring, sstring> prepare_options(
}
}
for (const auto& dc : tm.get_topology().get_datacenters()) {
options.emplace(dc, rf);
for (const auto& dc : tm.get_topology().get_datacenter_endpoints()) {
options.emplace(dc.first, rf);
}
}

View File

@@ -112,6 +112,9 @@ future<> modification_statement::check_access(query_processor& qp, const service
future<std::vector<mutation>>
modification_statement::get_mutations(query_processor& qp, const query_options& options, db::timeout_clock::time_point timeout, bool local, int64_t now, service::query_state& qs) const {
if (_restrictions->range_or_slice_eq_null(options)) { // See #7852 and #9290.
throw exceptions::invalid_request_exception("Invalid null value in condition for a key column");
}
auto cl = options.get_consistency();
auto json_cache = maybe_prepare_json_cache(options);
auto keys = build_partition_keys(options, json_cache);
@@ -260,8 +263,6 @@ modification_statement::do_execute(query_processor& qp, service::query_state& qs
inc_cql_stats(qs.get_client_state().is_internal());
_restrictions->validate_primary_key(options);
if (has_conditions()) {
return execute_with_condition(qp, qs, options);
}
@@ -417,23 +418,24 @@ modification_statement::process_where_clause(data_dictionary::database db, expr:
// Those tables don't have clustering columns so we wouldn't reach this code, thus
// the check seems redundant.
if (require_full_clustering_key()) {
throw exceptions::invalid_request_exception(format("Missing mandatory PRIMARY KEY part {}",
_restrictions->unrestricted_column(column_kind::clustering_key).name_as_text()));
auto& col = s->column_at(column_kind::clustering_key, _restrictions->clustering_columns_restrictions_size());
throw exceptions::invalid_request_exception(format("Missing mandatory PRIMARY KEY part {}", col.name_as_text()));
}
// In general, we can't modify specific columns if not all clustering columns have been specified.
// However, if we modify only static columns, it's fine since we won't really use the prefix anyway.
if (!has_slice(ck_restrictions)) {
auto& col = s->column_at(column_kind::clustering_key, _restrictions->clustering_columns_restrictions_size());
for (auto&& op : _column_operations) {
if (!op->column.is_static()) {
throw exceptions::invalid_request_exception(format("Primary key column '{}' must be specified in order to modify column '{}'",
_restrictions->unrestricted_column(column_kind::clustering_key).name_as_text(), op->column.name_as_text()));
col.name_as_text(), op->column.name_as_text()));
}
}
}
}
if (_restrictions->has_partition_key_unrestricted_components()) {
throw exceptions::invalid_request_exception(format("Missing mandatory PRIMARY KEY part {}",
_restrictions->unrestricted_column(column_kind::partition_key).name_as_text()));
auto& col = s->column_at(column_kind::partition_key, _restrictions->partition_key_restrictions_size());
throw exceptions::invalid_request_exception(format("Missing mandatory PRIMARY KEY part {}", col.name_as_text()));
}
if (has_conditions()) {
validate_where_clause_for_conditions();

View File

@@ -655,58 +655,68 @@ indexed_table_select_statement::do_execute_base_query(
auto cmd = prepare_command_for_base_query(qp, options, state, now, bool(paging_state));
auto timeout = db::timeout_clock::now() + get_timeout(state.get_client_state(), options);
query::result_merger merger(cmd->get_row_limit(), query::max_partitions);
std::vector<primary_key> keys = std::move(primary_keys);
std::vector<primary_key>::iterator key_it(keys.begin());
size_t previous_result_size = 0;
size_t next_iteration_size = 0;
struct base_query_state {
query::result_merger merger;
std::vector<primary_key> primary_keys;
std::vector<primary_key>::iterator current_primary_key;
size_t previous_result_size = 0;
size_t next_iteration_size = 0;
base_query_state(uint64_t row_limit, std::vector<primary_key>&& keys)
: merger(row_limit, query::max_partitions)
, primary_keys(std::move(keys))
, current_primary_key(primary_keys.begin())
{}
base_query_state(base_query_state&&) = default;
base_query_state(const base_query_state&) = delete;
};
base_query_state query_state{cmd->get_row_limit(), std::move(primary_keys)};
const bool is_paged = bool(paging_state);
while (key_it != keys.end()) {
// Starting with 1 key, we check if the result was a short read, and if not,
// we continue exponentially, asking for 2x more key than before
auto already_done = std::distance(keys.begin(), key_it);
// If the previous result already provided 1MB worth of data,
// stop increasing the number of fetched partitions
if (previous_result_size < query::result_memory_limiter::maximum_result_size) {
next_iteration_size = already_done + 1;
}
next_iteration_size = std::min<size_t>({next_iteration_size, keys.size() - already_done, max_base_table_query_concurrency});
auto key_it_end = key_it + next_iteration_size;
auto command = ::make_lw_shared<query::read_command>(*cmd);
query::result_merger oneshot_merger(cmd->get_row_limit(), query::max_partitions);
coordinator_result<foreign_ptr<lw_shared_ptr<query::result>>> rresult = co_await utils::result_map_reduce(key_it, key_it_end, coroutine::lambda([&] (auto& key)
-> future<coordinator_result<foreign_ptr<lw_shared_ptr<query::result>>>> {
return do_with(std::move(query_state), [this, is_paged, &qp, &state, &options, cmd, timeout] (auto&& query_state) {
auto &merger = query_state.merger;
auto &keys = query_state.primary_keys;
auto &key_it = query_state.current_primary_key;
auto &previous_result_size = query_state.previous_result_size;
auto &next_iteration_size = query_state.next_iteration_size;
return utils::result_repeat([this, is_paged, &previous_result_size, &next_iteration_size, &keys, &key_it, &merger, &qp, &state, &options, cmd, timeout]() {
// Starting with 1 key, we check if the result was a short read, and if not,
// we continue exponentially, asking for 2x more key than before
auto already_done = std::distance(keys.begin(), key_it);
// If the previous result already provided 1MB worth of data,
// stop increasing the number of fetched partitions
if (previous_result_size < query::result_memory_limiter::maximum_result_size) {
next_iteration_size = already_done + 1;
}
next_iteration_size = std::min<size_t>({next_iteration_size, keys.size() - already_done, max_base_table_query_concurrency});
auto key_it_end = key_it + next_iteration_size;
auto command = ::make_lw_shared<query::read_command>(*cmd);
// for each partition, read just one clustering row (TODO: can
// get all needed rows of one partition at once.)
command->slice._row_ranges.clear();
if (key.clustering) {
command->slice._row_ranges.push_back(query::clustering_range::make_singular(key.clustering));
}
coordinator_result<service::storage_proxy::coordinator_query_result> rqr
= co_await qp.proxy().query_result(_schema, command, {dht::partition_range::make_singular(key.partition)}, options.get_consistency(), {timeout, state.get_permit(), state.get_client_state(), state.get_trace_state()});
if (!rqr.has_value()) {
co_return std::move(rqr).as_failure();
}
co_return std::move(rqr.value().query_result);
}), std::move(oneshot_merger));
if (!rresult.has_value()) {
co_return std::move(rresult).as_failure();
}
auto& result = rresult.value();
auto is_short_read = result->is_short_read();
// Results larger than 1MB should be shipped to the client immediately
const bool page_limit_reached = is_paged && result->buf().size() >= query::result_memory_limiter::maximum_result_size;
previous_result_size = result->buf().size();
merger(std::move(result));
key_it = key_it_end;
if (is_short_read || page_limit_reached) {
break;
}
}
co_return value_type(merger.get(), std::move(cmd));
query::result_merger oneshot_merger(cmd->get_row_limit(), query::max_partitions);
return utils::result_map_reduce(key_it, key_it_end, [this, &qp, &state, &options, cmd, timeout] (auto& key) {
auto command = ::make_lw_shared<query::read_command>(*cmd);
// for each partition, read just one clustering row (TODO: can
// get all needed rows of one partition at once.)
command->slice._row_ranges.clear();
if (key.clustering) {
command->slice._row_ranges.push_back(query::clustering_range::make_singular(key.clustering));
}
return qp.proxy().query_result(_schema, command, {dht::partition_range::make_singular(key.partition)}, options.get_consistency(), {timeout, state.get_permit(), state.get_client_state(), state.get_trace_state()})
.then(utils::result_wrap([] (service::storage_proxy::coordinator_query_result qr) -> coordinator_result<foreign_ptr<lw_shared_ptr<query::result>>> {
return std::move(qr.query_result);
}));
}, std::move(oneshot_merger)).then(utils::result_wrap([is_paged, &previous_result_size, &key_it, key_it_end = std::move(key_it_end), &keys, &merger] (foreign_ptr<lw_shared_ptr<query::result>> result) -> coordinator_result<stop_iteration> {
auto is_short_read = result->is_short_read();
// Results larger than 1MB should be shipped to the client immediately
const bool page_limit_reached = is_paged && result->buf().size() >= query::result_memory_limiter::maximum_result_size;
previous_result_size = result->buf().size();
merger(std::move(result));
key_it = key_it_end;
return stop_iteration(is_short_read || key_it == keys.end() || page_limit_reached);
}));
}).then(utils::result_wrap([&merger, cmd] () mutable {
return make_ready_future<coordinator_result<value_type>>(value_type(merger.get(), std::move(cmd)));
}));
});
}
future<shared_ptr<cql_transport::messages::result_message>>

View File

@@ -824,7 +824,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
, view_building(this, "view_building", value_status::Used, true, "Enable view building; should only be set to false when the node is experience issues due to view building")
, enable_sstables_mc_format(this, "enable_sstables_mc_format", value_status::Unused, true, "Enable SSTables 'mc' format to be used as the default file format. Deprecated, please use \"sstable_format\" instead.")
, enable_sstables_md_format(this, "enable_sstables_md_format", value_status::Unused, true, "Enable SSTables 'md' format to be used as the default file format. Deprecated, please use \"sstable_format\" instead.")
, sstable_format(this, "sstable_format", value_status::Used, "me", "Default sstable file format", {"md", "me"})
, sstable_format(this, "sstable_format", value_status::Used, "me", "Default sstable file format", {"mc", "md", "me"})
, enable_dangerous_direct_import_of_cassandra_counters(this, "enable_dangerous_direct_import_of_cassandra_counters", value_status::Used, false, "Only turn this option on if you want to import tables from Cassandra containing counters, and you are SURE that no counters in that table were created in a version earlier than Cassandra 2.1."
" It is not enough to have ever since upgraded to newer versions of Cassandra. If you EVER used a version earlier than 2.1 in the cluster where these SSTables come from, DO NOT TURN ON THIS OPTION! You will corrupt your data. You have been warned.")
, enable_shard_aware_drivers(this, "enable_shard_aware_drivers", value_status::Used, true, "Enable native transport drivers to use connection-per-shard for better performance")
@@ -907,7 +907,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
, force_schema_commit_log(this, "force_schema_commit_log", value_status::Used, false,
"Use separate schema commit log unconditionally rater than after restart following discovery of cluster-wide support for it.")
, task_ttl_seconds(this, "task_ttl_in_seconds", liveness::LiveUpdate, value_status::Used, 10, "Time for which information about finished task stays in memory.")
, cache_index_pages(this, "cache_index_pages", liveness::LiveUpdate, value_status::Used, false,
, cache_index_pages(this, "cache_index_pages", liveness::LiveUpdate, value_status::Used, true,
"Keep SSTable index pages in the global cache after a SSTable read. Expected to improve performance for workloads with big partitions, but may degrade performance for workloads with small partitions.")
, default_log_level(this, "default_log_level", value_status::Used)
, logger_log_level(this, "logger_log_level", value_status::Used)
@@ -1065,7 +1065,7 @@ std::map<sstring, db::experimental_features_t::feature> db::experimental_feature
{"udf", feature::UDF},
{"cdc", feature::UNUSED},
{"alternator-streams", feature::ALTERNATOR_STREAMS},
{"alternator-ttl", feature::UNUSED },
{"alternator-ttl", feature::ALTERNATOR_TTL},
{"raft", feature::RAFT},
{"broadcast-tables", feature::BROADCAST_TABLES},
{"keyspace-storage-options", feature::KEYSPACE_STORAGE_OPTIONS},

View File

@@ -84,7 +84,7 @@ struct experimental_features_t {
// NOTE: RAFT and BROADCAST_TABLES features are not enabled via `experimental` umbrella flag.
// These options should be enabled explicitly.
// RAFT feature has to be enabled if BROADCAST_TABLES is enabled.
enum class feature { UNUSED, UDF, ALTERNATOR_STREAMS, RAFT,
enum class feature { UNUSED, UDF, ALTERNATOR_STREAMS, ALTERNATOR_TTL, RAFT,
BROADCAST_TABLES, KEYSPACE_STORAGE_OPTIONS };
static std::map<sstring, feature> map(); // See enum_option.
static std::vector<enum_option<experimental_features_t>> all();

View File

@@ -33,7 +33,7 @@ bool host_filter::can_hint_for(const locator::topology& topo, gms::inet_address
case enabled_kind::enabled_for_all:
return true;
case enabled_kind::enabled_selectively:
return topo.has_endpoint(ep, locator::topology::pending::yes) && _dcs.contains(topo.get_datacenter(ep));
return _dcs.contains(topo.get_datacenter(ep));
case enabled_kind::disabled_for_all:
return false;
}

View File

@@ -96,7 +96,7 @@ void manager::register_metrics(const sstring& group_name) {
future<> manager::start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr<gms::gossiper> gossiper_ptr) {
_proxy_anchor = std::move(proxy_ptr);
_gossiper_anchor = std::move(gossiper_ptr);
return lister::scan_dir(_hints_dir, lister::dir_entry_types::of<directory_entry_type::directory>(), [this] (fs::path datadir, directory_entry de) {
return lister::scan_dir(_hints_dir, { directory_entry_type::directory }, [this] (fs::path datadir, directory_entry de) {
ep_key_type ep = ep_key_type(de.name);
if (!check_dc_for(ep)) {
return make_ready_future<>();
@@ -558,7 +558,7 @@ bool manager::end_point_hints_manager::sender::can_send() noexcept {
return true;
} else {
if (!_state.contains(state::ep_state_left_the_ring)) {
_state.set_if<state::ep_state_left_the_ring>(!_shard_manager.local_db().get_token_metadata().is_normal_token_owner(end_point_key()));
_state.set_if<state::ep_state_left_the_ring>(!_shard_manager.local_db().get_token_metadata().is_member(end_point_key()));
}
// send the hints out if the destination Node is part of the ring - we will send to all new replicas in this case
return _state.contains(state::ep_state_left_the_ring);
@@ -656,7 +656,7 @@ future<> manager::change_host_filter(host_filter filter) {
// Iterate over existing hint directories and see if we can enable an endpoint manager
// for some of them
return lister::scan_dir(_hints_dir, lister::dir_entry_types::of<directory_entry_type::directory>(), [this] (fs::path datadir, directory_entry de) {
return lister::scan_dir(_hints_dir, { directory_entry_type::directory }, [this] (fs::path datadir, directory_entry de) {
const ep_key_type ep = ep_key_type(de.name);
if (_ep_managers.contains(ep) || !_host_filter.can_hint_for(_proxy_anchor->get_token_metadata_ptr()->get_topology(), ep)) {
return make_ready_future<>();
@@ -1168,7 +1168,7 @@ void manager::end_point_hints_manager::sender::send_hints_maybe() noexcept {
}
static future<> scan_for_hints_dirs(const sstring& hints_directory, std::function<future<> (fs::path dir, directory_entry de, unsigned shard_id)> f) {
return lister::scan_dir(hints_directory, lister::dir_entry_types::of<directory_entry_type::directory>(), [f = std::move(f)] (fs::path dir, directory_entry de) mutable {
return lister::scan_dir(hints_directory, { directory_entry_type::directory }, [f = std::move(f)] (fs::path dir, directory_entry de) mutable {
unsigned shard_id;
try {
shard_id = std::stoi(de.name.c_str());
@@ -1188,10 +1188,10 @@ manager::hints_segments_map manager::get_current_hints_segments(const sstring& h
scan_for_hints_dirs(hints_directory, [&current_hints_segments] (fs::path dir, directory_entry de, unsigned shard_id) {
manager_logger.trace("shard_id = {}", shard_id);
// IPs level
return lister::scan_dir(dir / de.name.c_str(), lister::dir_entry_types::of<directory_entry_type::directory>(), [&current_hints_segments, shard_id] (fs::path dir, directory_entry de) {
return lister::scan_dir(dir / de.name.c_str(), { directory_entry_type::directory }, [&current_hints_segments, shard_id] (fs::path dir, directory_entry de) {
manager_logger.trace("\tIP: {}", de.name);
// hints files
return lister::scan_dir(dir / de.name.c_str(), lister::dir_entry_types::of<directory_entry_type::regular>(), [&current_hints_segments, shard_id, ep_addr = de.name] (fs::path dir, directory_entry de) {
return lister::scan_dir(dir / de.name.c_str(), { directory_entry_type::regular }, [&current_hints_segments, shard_id, ep_addr = de.name] (fs::path dir, directory_entry de) {
manager_logger.trace("\t\tfile: {}", de.name);
current_hints_segments[ep_addr][shard_id].emplace_back(dir / de.name.c_str());
return make_ready_future<>();
@@ -1305,7 +1305,7 @@ void manager::remove_irrelevant_shards_directories(const sstring& hints_director
scan_for_hints_dirs(hints_directory, [] (fs::path dir, directory_entry de, unsigned shard_id) {
if (shard_id >= smp::count) {
// IPs level
return lister::scan_dir(dir / de.name.c_str(), lister::dir_entry_types::full(), lister::show_hidden::yes, [] (fs::path dir, directory_entry de) {
return lister::scan_dir(dir / de.name.c_str(), { directory_entry_type::directory, directory_entry_type::regular }, lister::show_hidden::yes, [] (fs::path dir, directory_entry de) {
return io_check(remove_file, (dir / de.name.c_str()).native());
}).then([shard_base_dir = dir, shard_entry = de] {
return io_check(remove_file, (shard_base_dir / shard_entry.name.c_str()).native());

View File

@@ -99,7 +99,7 @@ future<> space_watchdog::scan_one_ep_dir(fs::path path, manager& shard_manager,
if (!exists) {
return make_ready_future<>();
} else {
return lister::scan_dir(path, lister::dir_entry_types::of<directory_entry_type::regular>(), [this, ep_key, &shard_manager] (fs::path dir, directory_entry de) {
return lister::scan_dir(path, { directory_entry_type::regular }, [this, ep_key, &shard_manager] (fs::path dir, directory_entry de) {
// Put the current end point ID to state.eps_with_pending_hints when we see the second hints file in its directory
if (_files_count == 1) {
shard_manager.add_ep_with_pending_hints(ep_key);
@@ -138,7 +138,7 @@ void space_watchdog::on_timer() {
_total_size = 0;
for (manager& shard_manager : per_device_limits.managers) {
shard_manager.clear_eps_with_pending_hints();
lister::scan_dir(shard_manager.hints_dir(), lister::dir_entry_types::of<directory_entry_type::directory>(), [this, &shard_manager] (fs::path dir, directory_entry de) {
lister::scan_dir(shard_manager.hints_dir(), {directory_entry_type::directory}, [this, &shard_manager] (fs::path dir, directory_entry de) {
_files_count = 0;
// Let's scan per-end-point directories and enumerate hints files...
//

View File

@@ -355,7 +355,6 @@ schema_ptr system_keyspace::built_indexes() {
}
/*static*/ schema_ptr system_keyspace::peers() {
constexpr uint16_t schema_version_offset = 1; // raft_server_id
static thread_local auto peers = [] {
schema_builder builder(generate_legacy_id(NAME, PEERS), NAME, PEERS,
// partition key
@@ -373,7 +372,6 @@ schema_ptr system_keyspace::built_indexes() {
{"schema_version", uuid_type},
{"tokens", set_type_impl::get_instance(utf8_type, true)},
{"supported_features", utf8_type},
{"raft_server_id", uuid_type},
},
// static columns
{},
@@ -383,7 +381,7 @@ schema_ptr system_keyspace::built_indexes() {
"information about known peers in the cluster"
);
builder.set_gc_grace_seconds(0);
builder.with_version(generate_schema_version(builder.uuid(), schema_version_offset));
builder.with_version(generate_schema_version(builder.uuid()));
return builder.build(schema_builder::compact_storage::no);
}();
return peers;
@@ -1504,7 +1502,6 @@ future<> system_keyspace::update_tokens(gms::inet_address ep, const std::unorder
}
sstring req = format("INSERT INTO system.{} (peer, tokens) VALUES (?, ?)", PEERS);
slogger.debug("INSERT INTO system.{} (peer, tokens) VALUES ({}, {})", PEERS, ep, tokens);
auto set_type = set_type_impl::get_instance(utf8_type, true);
co_await execute_cql(req, ep.addr(), make_set_value(set_type, prepare_tokens(tokens))).discard_result();
co_await force_blocking_flush(PEERS);
@@ -1544,18 +1541,11 @@ future<std::unordered_map<gms::inet_address, locator::host_id>> system_keyspace:
}
future<std::vector<gms::inet_address>> system_keyspace::load_peers() {
auto res = co_await execute_cql(format("SELECT peer, tokens FROM system.{}", PEERS));
auto res = co_await execute_cql(format("SELECT peer FROM system.{}", PEERS));
assert(res);
std::vector<gms::inet_address> ret;
for (auto& row: *res) {
if (!row.has("tokens")) {
// Ignore rows that don't have tokens. Such rows may
// be introduced by code that persists parts of peer
// information (such as RAFT_ID) which may potentially
// race with deleting a peer (during node removal).
continue;
}
ret.emplace_back(row.get_as<net::inet_address>("peer"));
}
co_return ret;
@@ -1604,7 +1594,6 @@ future<> system_keyspace::update_peer_info(gms::inet_address ep, sstring column_
co_await update_cached_values(ep, column_name, value);
sstring req = format("INSERT INTO system.{} (peer, {}) VALUES (?, ?)", PEERS, column_name);
slogger.debug("INSERT INTO system.{} (peer, {}) VALUES ({}, {})", PEERS, column_name, ep, value);
co_await execute_cql(req, ep.addr(), value).discard_result();
}
// sets are not needed, since tokens are updated by another method
@@ -1656,7 +1645,6 @@ future<> system_keyspace::update_schema_version(table_schema_version version) {
*/
future<> system_keyspace::remove_endpoint(gms::inet_address ep) {
sstring req = format("DELETE FROM system.{} WHERE peer = ?", PEERS);
slogger.debug("DELETE FROM system.{} WHERE peer = {}", PEERS, ep);
co_await execute_cql(req, ep.addr()).discard_result();
co_await force_blocking_flush(PEERS);
}
@@ -1881,7 +1869,7 @@ public:
set_cell(cr, "host_id", hostid->uuid());
}
if (tm.is_normal_token_owner(endpoint)) {
if (tm.is_member(endpoint)) {
sstring dc = tm.get_topology().get_location(endpoint).dc;
set_cell(cr, "dc", dc);
}
@@ -2479,25 +2467,23 @@ class db_config_table final : public streaming_virtual_table {
return make_exception_future<>(virtual_table_update_exception("option source is not updateable"));
}
return smp::submit_to(0, [&cfg = _cfg, name = std::move(*name), value = std::move(*value)] () mutable -> future<> {
return smp::submit_to(0, [&cfg = _cfg, name = std::move(*name), value = std::move(*value)] () mutable {
for (auto& c_ref : cfg.values()) {
auto& c = c_ref.get();
if (c.name() == name) {
std::exception_ptr ex;
try {
if (co_await c.set_value_on_all_shards(value, utils::config_file::config_source::CQL)) {
co_return;
if (c.set_value(value, utils::config_file::config_source::CQL)) {
return cfg.broadcast_to_all_shards();
} else {
ex = std::make_exception_ptr(virtual_table_update_exception("option is not live-updateable"));
return make_exception_future<>(virtual_table_update_exception("option is not live-updateable"));
}
} catch (boost::bad_lexical_cast&) {
ex = std::make_exception_ptr(virtual_table_update_exception("cannot parse option value"));
return make_exception_future<>(virtual_table_update_exception("cannot parse option value"));
}
co_await coroutine::return_exception_ptr(std::move(ex));
}
}
co_await coroutine::return_exception(virtual_table_update_exception("no such option"));
return make_exception_future<>(virtual_table_update_exception("no such option"));
});
}
@@ -2895,7 +2881,7 @@ future<> system_keyspace::get_repair_history(::table_id table_id, repair_history
sstring req = format("SELECT * from system.{} WHERE table_uuid = {}", REPAIR_HISTORY, table_id);
co_await _qp.local().query_internal(req, [&f] (const cql3::untyped_result_set::row& row) mutable -> future<stop_iteration> {
repair_history_entry ent;
ent.id = tasks::task_id(row.get_as<utils::UUID>("repair_uuid"));
ent.id = row.get_as<tasks::task_id>("repair_uuid");
ent.table_uuid = ::table_id(row.get_as<utils::UUID>("table_uuid"));
ent.range_start = row.get_as<int64_t>("range_start");
ent.range_end = row.get_as<int64_t>("range_end");

View File

@@ -128,9 +128,6 @@ const column_definition* view_info::view_column(const column_definition& base_de
void view_info::set_base_info(db::view::base_info_ptr base_info) {
_base_info = std::move(base_info);
// Forget the cached objects which may refer to the base schema.
_select_statement = nullptr;
_partition_slice = std::nullopt;
}
// A constructor for a base info that can facilitate reads and writes from the materialized view.
@@ -1394,9 +1391,9 @@ static std::optional<gms::inet_address>
get_view_natural_endpoint(const sstring& keyspace_name,
const dht::token& base_token, const dht::token& view_token) {
auto &db = service::get_local_storage_proxy().local_db();
auto& topology = service::get_local_storage_proxy().get_token_metadata_ptr()->get_topology();
auto& ks = db.find_keyspace(keyspace_name);
auto erm = ks.get_effective_replication_map();
auto& topology = erm->get_token_metadata_ptr()->get_topology();
auto my_address = utils::fb_utilities::get_broadcast_address();
auto my_datacenter = topology.get_datacenter();
bool network_topology = dynamic_cast<const locator::network_topology_strategy*>(&ks.get_replication_strategy());

View File

@@ -15,7 +15,6 @@
#include "sstables/sstables.hh"
#include "sstables/progress_monitor.hh"
#include "readers/evictable.hh"
#include "dht/partition_filter.hh"
static logging::logger vug_logger("view_update_generator");
@@ -159,8 +158,7 @@ future<> view_update_generator::start() {
::mutation_reader::forwarding::no);
inject_failure("view_update_generator_consume_staging_sstable");
auto result = staging_sstable_reader.consume_in_thread(view_updating_consumer(s, std::move(permit), *t, sstables, _as, staging_sstable_reader_handle),
dht::incremental_owned_ranges_checker::make_partition_filter(_db.get_keyspace_local_ranges(s->ks_name())));
auto result = staging_sstable_reader.consume_in_thread(view_updating_consumer(s, std::move(permit), *t, sstables, _as, staging_sstable_reader_handle));
staging_sstable_reader.close().get();
if (result == stop_iteration::yes) {
break;

View File

@@ -9,9 +9,7 @@
#include "i_partitioner.hh"
#include "sharder.hh"
#include <seastar/core/seastar.hh>
#include <seastar/coroutine/maybe_yield.hh>
#include "dht/token-sharding.hh"
#include "dht/partition_filter.hh"
#include "utils/class_registrator.hh"
#include "types.hh"
#include "utils/murmur_hash.hh"
@@ -364,79 +362,4 @@ split_range_to_shards(dht::partition_range pr, const schema& s) {
return ret;
}
// Builds a partition filter accepting only partitions whose token belongs to
// one of `sorted_owned_ranges`. The filter wraps an incremental checker, so
// it must be fed partitions in increasing token order (see
// incremental_owned_ranges_checker::belongs_to_current_node).
// NOTE: the checker stores a *reference* to `sorted_owned_ranges`; the caller
// must keep that vector alive for as long as the returned filter is in use.
flat_mutation_reader_v2::filter incremental_owned_ranges_checker::make_partition_filter(const dht::token_range_vector& sorted_owned_ranges) {
    // `mutable` lets the captured checker advance its internal iterator.
    return [checker = incremental_owned_ranges_checker(sorted_owned_ranges)] (const dht::decorated_key& dk) mutable {
        return checker.belongs_to_current_node(dk.token());
    };
}
// Computes `source_ranges` minus `ranges_to_subtract`: returns the parts of
// the source ranges not covered by any range in `ranges_to_subtract`.
// `source_ranges` may overlap (they are deoverlapped here); the subtraction
// is a single linear merge over both lists, so `ranges_to_subtract` is
// expected to be sorted and deoverlapped (see the declaration's comment).
// Yields to the reactor between iterations, so large inputs are safe.
future<dht::partition_range_vector> subtract_ranges(const schema& schema, const dht::partition_range_vector& source_ranges, dht::partition_range_vector ranges_to_subtract) {
    auto cmp = dht::ring_position_comparator(schema);
    // optimize set of potentially overlapping ranges by deoverlapping them.
    auto ranges = dht::partition_range::deoverlap(source_ranges, cmp);
    dht::partition_range_vector res;
    // Each subtraction can split a range in two, hence the * 2 estimate.
    res.reserve(ranges.size() * 2);
    auto range = ranges.begin();
    auto range_end = ranges.end();
    auto range_to_subtract = ranges_to_subtract.begin();
    auto range_to_subtract_end = ranges_to_subtract.end();
    while (range != range_end) {
        if (range_to_subtract == range_to_subtract_end) {
            // We're done with ranges_to_subtract; keep remaining ranges as-is.
            res.emplace_back(std::move(*range));
            ++range;
            continue;
        }
        auto diff = range->subtract(*range_to_subtract, cmp);
        auto size = diff.size();
        switch (size) {
        case 0:
            // current range is fully covered by range_to_subtract, done with it
            // range_to_subtract.start <= range.start &&
            // range_to_subtract.end >= range.end
            ++range;
            break;
        case 1:
            // Possible cases:
            // a. range and range_to_subtract are disjoint (so diff == range)
            //    a.i  range_to_subtract.end < range.start
            //    a.ii range_to_subtract.start > range.end
            // b. range_to_subtract.start > range.start, so it removes the range suffix
            // c. range_to_subtract.start < range.start, so it removes the range prefix
            // Does range_to_subtract sort after range?
            if (range_to_subtract->start() && (!range->start() || cmp(range_to_subtract->start()->value(), range->start()->value()) > 0)) {
                // save range prefix in the result
                // (note that diff[0] == range in the disjoint case)
                res.emplace_back(std::move(diff[0]));
                // done with current range
                ++range;
            } else {
                // set the current range to the remaining suffix
                *range = std::move(diff[0]);
                // done with current range_to_subtract
                ++range_to_subtract;
            }
            break;
        case 2:
            // range contains range_to_subtract
            // save range prefix in the result
            res.emplace_back(std::move(diff[0]));
            // set the current range to the remaining suffix
            *range = std::move(diff[1]);
            // done with current range_to_subtract
            ++range_to_subtract;
            break;
        default:
            // subtract() of two ranges can produce at most two pieces.
            assert(size <= 2);
        }
        co_await coroutine::maybe_yield();
    }
    co_return res;
}
}

View File

@@ -648,11 +648,6 @@ future<utils::chunked_vector<partition_range>> split_range_to_single_shard(const
std::unique_ptr<dht::i_partitioner> make_partitioner(sstring name);
// Returns a sorted and deoverlapped list of ranges that are
// the result of subtracting all ranges from ranges_to_subtract.
// ranges_to_subtract must be sorted and deoverlapped.
future<dht::partition_range_vector> subtract_ranges(const schema& schema, const dht::partition_range_vector& ranges, dht::partition_range_vector ranges_to_subtract);
} // dht
namespace std {

View File

@@ -1,41 +0,0 @@
/*
* Modified by ScyllaDB
* Copyright (C) 2015-present ScyllaDB
*/
/*
* SPDX-License-Identifier: (AGPL-3.0-or-later and Apache-2.0)
*/
#pragma once
#include "dht/i_partitioner.hh"
#include "readers/flat_mutation_reader_v2.hh"
namespace dht {
// Stateful membership test of tokens against a sorted vector of token ranges
// owned by this node. Queries must arrive in increasing token order, which
// lets the checker advance a single cursor over the ranges instead of
// searching from scratch for every token.
// NOTE: only a reference to the range vector is held; the caller must keep
// it alive for the checker's lifetime.
class incremental_owned_ranges_checker {
    const dht::token_range_vector& _owned_ranges;            // sorted; not owned
    mutable dht::token_range_vector::const_iterator _cursor; // first range not yet known to precede the last queried token

public:
    incremental_owned_ranges_checker(const dht::token_range_vector& sorted_owned_ranges)
            : _owned_ranges(sorted_owned_ranges)
            , _cursor(_owned_ranges.begin()) {
    }

    // Must be called with increasing token values.
    bool belongs_to_current_node(const dht::token& t) {
        const dht::token_comparator cmp;
        // Skip every range lying entirely before t. Afterwards the cursor
        // rests either on a range containing t (token owned by this node) or
        // on one entirely after t (token not owned).
        for (; _cursor != _owned_ranges.end() && _cursor->after(t, cmp); ++_cursor) {
        }
        return _cursor != _owned_ranges.end() && _cursor->contains(t, cmp);
    }

    static flat_mutation_reader_v2::filter make_partition_filter(const dht::token_range_vector& sorted_owned_ranges);
};
} // dht

View File

@@ -7,8 +7,6 @@
*/
#pragma once
#include "utils/UUID.hh"
#include <seastar/core/sharded.hh>
using namespace seastar;
@@ -23,7 +21,7 @@ class pinger {
public:
// Opaque endpoint ID.
// A specific implementation of `pinger` maps those IDs to 'real' addresses.
using endpoint_id = utils::UUID;
using endpoint_id = unsigned;
// Send a message to `ep` and wait until it responds.
// The wait can be aborted using `as`.

21
dist/common/scripts/scylla_bootparam_setup vendored Executable file
View File

@@ -0,0 +1,21 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright 2018-present ScyllaDB
#
#
# SPDX-License-Identifier: AGPL-3.0-or-later

import os
import sys
import argparse

# Compatibility stub: boot-parameter tuning is no longer performed here, but
# the script (and its --ami flag) is kept so existing tooling that invokes it
# keeps working. It merely validates the command line and exits successfully.
if __name__ == '__main__':
    arg_parser = argparse.ArgumentParser(description='Optimize boot parameter settings for Scylla.')
    arg_parser.add_argument('--ami', action='store_true', default=False,
                            help='setup AMI instance')
    arg_parser.parse_args()
    sys.exit(0)

View File

@@ -139,8 +139,13 @@ if __name__ == '__main__':
print('Requires root permission.')
sys.exit(1)
cfg = sysconfig_parser(sysconfdir_p() / 'scylla-server')
ami = cfg.get('AMI')
mode = cfg.get('NETWORK_MODE')
if ami == 'yes' and os.path.exists('/etc/scylla/ami_disabled'):
os.remove('/etc/scylla/ami_disabled')
sys.exit(1)
if mode == 'virtio':
tap = cfg.get('TAP')
user = cfg.get('USER')

View File

@@ -214,7 +214,7 @@ if __name__ == '__main__':
help='skip raid setup')
parser.add_argument('--raid-level-5', action='store_true', default=False,
help='use RAID5 for RAID volume')
parser.add_argument('--online-discard', default=1, choices=[0, 1], type=int,
parser.add_argument('--online-discard', default=True,
help='Configure XFS to discard unused blocks as soon as files are deleted')
parser.add_argument('--nic',
help='specify NIC')
@@ -224,6 +224,8 @@ if __name__ == '__main__':
help='specify swapfile directory (ex: /)')
parser.add_argument('--swap-size', type=int,
help='specify swapfile size in GB')
parser.add_argument('--ami', action='store_true', default=False,
help='setup AMI instance')
parser.add_argument('--setup-nic-and-disks', action='store_true', default=False,
help='optimize NIC and disks')
parser.add_argument('--developer-mode', action='store_true', default=False,
@@ -240,6 +242,8 @@ if __name__ == '__main__':
if is_redhat_variant():
parser.add_argument('--no-selinux-setup', action='store_true', default=False,
help='skip selinux setup')
parser.add_argument('--no-bootparam-setup', action='store_true', default=False,
help='skip bootparam setup')
parser.add_argument('--no-ntp-setup', action='store_true',
default=default_no_ntp_setup,
help='skip ntp setup')
@@ -454,7 +458,7 @@ if __name__ == '__main__':
args.no_raid_setup = not raid_setup
if raid_setup:
level = '5' if raid_level_5 else '0'
run_setup_script('RAID', f'scylla_raid_setup --disks {disks} --enable-on-nextboot --raid-level={level} --online-discard={online_discard}')
run_setup_script('RAID', f'scylla_raid_setup --disks {disks} --enable-on-nextboot --raid-level={level} --online-discard={int(online_discard)}')
coredump_setup = interactive_ask_service('Do you want to enable coredumps?', 'Yes - sets up coredump to allow a post-mortem analysis of the Scylla state just prior to a crash. No - skips this step.', coredump_setup)
args.no_coredump_setup = not coredump_setup

View File

@@ -35,6 +35,7 @@ if __name__ == '__main__':
disable_writeback_cache = str2bool(cfg.get('DISABLE_WRITEBACK_CACHE'))
else:
disable_writeback_cache = 'no'
ami = str2bool(cfg.get('AMI'))
parser = argparse.ArgumentParser(description='Setting parameters on Scylla sysconfig file.')
parser.add_argument('--nic',
@@ -57,6 +58,8 @@ if __name__ == '__main__':
help='Set enforcing fastest available Linux clocksource')
parser.add_argument('--disable-writeback-cache', action='store_true', default=disable_writeback_cache,
help='Disable disk writeback cache')
parser.add_argument('--ami', action='store_true', default=ami,
help='AMI instance mode')
args = parser.parse_args()
if args.nic and not is_valid_nic(args.nic):
@@ -122,4 +125,6 @@ if __name__ == '__main__':
if cfg.has_option('DISABLE_WRITEBACK_CACHE') and str2bool(cfg.get('DISABLE_WRITEBACK_CACHE')) != args.disable_writeback_cache:
cfg.set('DISABLE_WRITEBACK_CACHE', bool2str(args.disable_writeback_cache))
if str2bool(cfg.get('AMI')) != args.ami:
cfg.set('AMI', bool2str(args.ami))
cfg.commit()

View File

@@ -43,5 +43,8 @@ SCYLLA_ARGS="--log-to-syslog 1 --log-to-stdout 0 --default-log-level info --netw
## scylla arguments (for dpdk mode)
#SCYLLA_ARGS="--log-to-syslog 1 --log-to-stdout 0 --default-log-level info --network-stack native --dpdk-pmd"
# setup as AMI instance
AMI=no
# Disable disk writeback cache
DISABLE_WRITEBACK_CACHE=no

View File

@@ -144,25 +144,6 @@ This monitoring stack is different from DynamoDB's offering - but Scylla's
is significantly more powerful and gives the user better insights on
the internals of the database and its performance.
## Time To Live (TTL)
Like in DynamoDB, Alternator items which are set to expire at a certain
time will not disappear exactly at that time, but only after some delay.
DynamoDB guarantees that the expiration delay will be less than 48 hours
(though for small tables the delay is often much shorter).
In Alternator, the expiration delay is configurable - it can be set
with the `--alternator-ttl-period-in-seconds` configuration option.
The default is 24 hours.
One thing the implementation is missing is that expiration
events appear in the Streams API as normal deletions - without the
distinctive marker on deletions which are really expirations.
See <https://github.com/scylladb/scylla/issues/5060>.
---
## Experimental API features
Some DynamoDB API features are supported by Alternator, but considered
@@ -173,11 +154,28 @@ feature's implementation is still subject to change and upgrades may not be
possible if such a feature is used. For these reasons, experimental features
are not recommended for mission-critical uses, and they need to be
individually enabled with the "--experimental-features" configuration option.
See [Enabling Experimental Features](/operating-scylla/admin#enabling-experimental-features) for details.
In this release, the following DynamoDB API features are considered
experimental:
* DynamoDB's TTL (item expiration) feature is supported, but in this release
still considered experimental and needs to be enabled explicitly with the
`--experimental-features=alternator-ttl` configuration option.
The experimental implementation is mostly complete, but not thoroughly
tested or optimized.
Like in DynamoDB, Alternator items which are set to expire at a certain
time will not disappear exactly at that time, but only after some delay.
DynamoDB guarantees that the expiration delay will be less than 48 hours
(though for small tables the delay is often much shorter). In Alternator,
the expiration delay is configurable - it defaults to 24 hours but can
be set with the `--alternator-ttl-period-in-seconds` configuration option.
One thing that this implementation is still missing is that expiration
events appear in the Streams API as normal deletions - without the
distinctive marker on deletions which are really expirations.
<https://github.com/scylladb/scylla/issues/5060>
* The DynamoDB Streams API for capturing change is supported, but still
considered experimental so needs to be enabled explicitly with the
`--experimental-features=alternator-streams` configuration option.

View File

@@ -4,231 +4,95 @@ Raft Consensus Algorithm in ScyllaDB
Introduction
--------------
ScyllaDB was originally designed, following Apache Cassandra, to use gossip for topology and schema updates and the Paxos consensus algorithm for
strong data consistency (:doc:`LWT </using-scylla/lwt>`). To achieve stronger consistency without performance penalty, ScyllaDB 5.x has turned to Raft - a consensus algorithm designed as an alternative to both gossip and Paxos.
ScyllaDB was originally designed, following Apache Cassandra, to use gossip for topology and schema updates and the Paxos consensus algorithm for
strong data consistency (:doc:`LWT </using-scylla/lwt>`). To achieve stronger consistency without performance penalty, ScyllaDB 5.0 is turning to Raft - a consensus algorithm designed as an alternative to both gossip and Paxos.
Raft is a consensus algorithm that implements a distributed, consistent, replicated log across members (nodes). Raft implements consensus by first electing a distinguished leader, then giving the leader complete responsibility for managing the replicated log. The leader accepts log entries from clients, replicates them on other servers, and tells servers when it is safe to apply log entries to their state machines.
Raft uses a heartbeat mechanism to trigger a leader election. All servers start as followers and remain in the follower state as long as they receive valid RPCs (heartbeat) from a leader or candidate. A leader sends periodic heartbeats to all followers to maintain his authority (leadership). Suppose a follower receives no communication over a period called the election timeout. In that case, it assumes no viable leader and begins an election to choose a new leader.
Leader selection is described in detail in the `Raft paper <https://raft.github.io/raft.pdf>`_.
Leader selection is described in detail in the `raft paper <https://raft.github.io/raft.pdf>`_.
ScyllaDB 5.x may use Raft to maintain schema updates in every node (see below). Any schema update, like ALTER, CREATE or DROP TABLE, is first committed as an entry in the replicated Raft log, and, once stored on most replicas, applied to all nodes **in the same order**, even in the face of a node or network failures.
Scylla 5.0 uses Raft to maintain schema updates in every node (see below). Any schema update, like ALTER, CREATE or DROP TABLE, is first committed as an entry in the replicated Raft log, and, once stored on most replicas, applied to all nodes **in the same order**, even in the face of a node or network failures.
Following ScyllaDB 5.x releases will use Raft to guarantee consistent topology updates similarly.
Following Scylla 5.x releases will use Raft to guarantee consistent topology updates similarly.
.. _raft-quorum-requirement:
Quorum Requirement
-------------------
Raft requires at least a quorum of nodes in a cluster to be available. If multiple nodes fail
and the quorum is lost, the cluster is unavailable for schema updates. See :ref:`Handling Failures <raft-handling-failures>`
Raft requires at least a quorum of nodes in a cluster to be available. If multiple nodes fail
and the quorum is lost, the cluster is unavailable for schema updates. See :ref:`Handling Failures <raft-handliing-failures>`
for information on how to handle failures.
Upgrade Considerations for ScyllaDB 5.0 and Later
==================================================
Note that when you have a two-DC cluster with the same number of nodes in each DC, the cluster will lose the quorum if one
Note that when you have a two-DC cluster with the same number of nodes in each DC, the cluster will lose the quorum if one
of the DCs is down.
**We recommend configuring three DCs per cluster to ensure that the cluster remains available and operational when one DC is down.**
Enabling Raft
---------------
Enabling Raft in ScyllaDB 5.0 and 5.1
=====================================
Enabling Raft in ScyllaDB 5.0
===============================
.. warning::
In ScyllaDB 5.0 and 5.1, Raft is an experimental feature.
.. note::
In ScyllaDB 5.0:
It is not possible to enable Raft in an existing cluster in ScyllaDB 5.0 and 5.1.
In order to have a Raft-enabled cluster in these versions, you must create a new cluster with Raft enabled from the start.
* Raft is an experimental feature.
* Raft implementation only covers safe schema changes. See :ref:`Safe Schema Changes with Raft <raft-schema-changes>`.
.. warning::
**Do not** use Raft in production clusters in ScyllaDB 5.0 and 5.1. Such clusters won't be able to correctly upgrade to ScyllaDB 5.2.
Use Raft only for testing and experimentation in clusters which can be thrown away.
.. warning::
Once enabled, Raft cannot be disabled on your cluster. The cluster nodes will fail to restart if you remove the Raft feature.
When creating a new cluster, add ``raft`` to the list of experimental features in your ``scylla.yaml`` file:
If you are creating a new cluster, add ``raft`` to the list of experimental features in your ``scylla.yaml`` file:
.. code-block:: yaml
experimental_features:
- raft
.. _enabling-raft-existing-cluster:
If you upgrade to ScyllaDB 5.0 from an earlier version, perform a :doc:`rolling restart </operating-scylla/procedures/config-change/rolling-restart/>`
updating the ``scylla.yaml`` file for **each node** in the cluster to enable the experimental Raft feature:
Enabling Raft in ScyllaDB 5.2 and further
=========================================
.. code-block:: yaml
experimental_features:
- raft
.. TODO include enterprise versions in this documentation
.. note::
In ScyllaDB 5.2, Raft is Generally Available and can be safely used for consistent schema management.
In ScyllaDB 5.3 it will become enabled by default.
In further versions it will be mandatory.
ScyllaDB 5.2 and later comes equipped with a procedure that can setup Raft-based consistent cluster management in an existing cluster. We refer to this as the **internal Raft upgrade procedure** (do not confuse with the :doc:`ScyllaDB version upgrade procedure </upgrade/upgrade-opensource/upgrade-guide-from-5.1-to-5.2/upgrade-guide-from-5.1-to-5.2-generic>`).
When all the nodes in the cluster are updated and restarted, the cluster will begin to use Raft for schema changes.
.. warning::
Once enabled, Raft cannot be disabled on your cluster. The cluster nodes will fail to restart if you remove the Raft feature.
To enable Raft in an existing cluster in Scylla 5.2 and beyond:
* ensure that the schema is synchronized in the cluster by executing :doc:`nodetool describecluster </operating-scylla/nodetool-commands/describecluster>` on each node and ensuring that the schema version is the same on all nodes,
* then perform a :doc:`rolling restart </operating-scylla/procedures/config-change/rolling-restart/>`, updating the ``scylla.yaml`` file for **each node** in the cluster before restarting it to enable the ``consistent_cluster_management`` flag:
.. code-block:: yaml
consistent_cluster_management: true
When all the nodes in the cluster are updated and restarted, the cluster will start the **internal Raft upgrade procedure**.
**You must then verify** that the internal Raft upgrade procedure has finished successfully. Refer to the :ref:`next section <verify-raft-procedure>`.
You can also enable the ``consistent_cluster_management`` flag while performing :doc:`rolling upgrade from 5.1 to 5.2 </upgrade/upgrade-opensource/upgrade-guide-from-5.1-to-5.2/upgrade-guide-from-5.1-to-5.2-generic>`: update ``scylla.yaml`` before restarting each node. The internal Raft upgrade procedure will start as soon as the last node was upgraded and restarted. As above, this requires :ref:`verifying <verify-raft-procedure>` that this internal procedure successfully finishes.
Finally, you can enable the ``consistent_cluster_management`` flag when creating a new cluster. This does not use the internal Raft upgrade procedure; instead, Raft is functioning in the cluster and managing schema right from the start.
Until all nodes are restarted with ``consistent_cluster_management: true``, it is still possible to turn this option back off. Once enabled on every node, it must remain turned on (or the node will refuse to restart).
.. _verify-raft-procedure:
Verifying that the internal Raft upgrade procedure finished successfully
========================================================================
.. versionadded:: 5.2
The internal Raft upgrade procedure starts as soon as every node in the cluster restarts with ``consistent_cluster_management`` flag enabled in ``scylla.yaml``.
.. TODO: update the above sentence once 5.3 and later are released.
The procedure requires **full cluster availability** to correctly setup the Raft algorithm; after the setup finishes, Raft can proceed with only a majority of nodes, but this initial setup is an exception.
An unlucky event, such as a hardware failure, may cause one of your nodes to fail. If this happens before the internal Raft upgrade procedure finishes, the procedure will get stuck and your intervention will be required.
To verify that the procedure finishes, look at the log of every Scylla node (using ``journalctl _COMM=scylla``). Search for the following patterns:
* ``Starting internal upgrade-to-raft procedure`` denotes the start of the procedure,
* ``Raft upgrade finished`` denotes the end.
The following is an example of a log from a node which went through the procedure correctly. Some parts were truncated for brevity:
.. code-block:: console
features - Feature SUPPORTS_RAFT_CLUSTER_MANAGEMENT is enabled
raft_group0 - finish_setup_after_join: SUPPORTS_RAFT feature enabled. Starting internal upgrade-to-raft procedure.
raft_group0_upgrade - starting in `use_pre_raft_procedures` state.
raft_group0_upgrade - Waiting until everyone is ready to start upgrade...
raft_group0_upgrade - Joining group 0...
raft_group0 - server 624fa080-8c0e-4e3d-acf6-10af473639ca joined group 0 with group id 8f8a1870-5c4e-11ed-bb13-fe59693a23c9
raft_group0_upgrade - Waiting until every peer has joined Raft group 0...
raft_group0_upgrade - Every peer is a member of Raft group 0.
raft_group0_upgrade - Waiting for schema to synchronize across all nodes in group 0...
raft_group0_upgrade - synchronize_schema: my version: a37a3b1e-5251-3632-b6b4-a9468a279834
raft_group0_upgrade - synchronize_schema: schema mismatches: {}. 3 nodes had a matching version.
raft_group0_upgrade - synchronize_schema: finished.
raft_group0_upgrade - Entering synchronize state.
raft_group0_upgrade - Schema changes are disabled in synchronize state. If a failure makes us unable to proceed, manual recovery will be required.
raft_group0_upgrade - Waiting for all peers to enter synchronize state...
raft_group0_upgrade - All peers in synchronize state. Waiting for schema to synchronize...
raft_group0_upgrade - synchronize_schema: collecting schema versions from group 0 members...
raft_group0_upgrade - synchronize_schema: collected remote schema versions.
raft_group0_upgrade - synchronize_schema: my version: a37a3b1e-5251-3632-b6b4-a9468a279834
raft_group0_upgrade - synchronize_schema: schema mismatches: {}. 3 nodes had a matching version.
raft_group0_upgrade - synchronize_schema: finished.
raft_group0_upgrade - Schema synchronized.
raft_group0_upgrade - Raft upgrade finished.
In a functioning cluster with good network connectivity the procedure should take no more than a few seconds.
Network issues may cause the procedure to take longer, but if all nodes are alive and the network is eventually functional (each pair of nodes is eventually connected), the procedure will eventually finish.
Note the following message, which appears in the log presented above:
.. code-block:: console
Schema changes are disabled in synchronize state. If a failure makes us unable to proceed, manual recovery will be required.
During the procedure, there is a brief window while schema changes are disabled. This is when the schema change mechanism switches from the older unsafe algorithm to the safe Raft-based algorithm. If everything runs smoothly, this window will be unnoticeable; the procedure is designed to minimize that window's length. However, if the procedure gets stuck e.g. due to network connectivity problem, ScyllaDB will return the following error when trying to perform a schema change during this window:
.. code-block:: console
Cannot perform schema or topology changes during this time; the cluster is currently upgrading to use Raft for schema operations.
If this error keeps happening, check the logs of your nodes to learn the state of upgrade. The upgrade procedure may get stuck
if there was a node failure.
In the next example, one of the nodes had a power outage before the procedure could finish. The following shows a part of another node's logs:
.. code-block:: console
raft_group0_upgrade - Entering synchronize state.
raft_group0_upgrade - Schema changes are disabled in synchronize state. If a failure makes us unable to proceed, manual recovery will be required.
raft_group0_upgrade - Waiting for all peers to enter synchronize state...
raft_group0_upgrade - wait_for_peers_to_enter_synchronize_state: node 127.90.69.3 not in synchronize state yet...
raft_group0_upgrade - wait_for_peers_to_enter_synchronize_state: node 127.90.69.1 not in synchronize state yet...
raft_group0_upgrade - wait_for_peers_to_enter_synchronize_state: retrying in a while...
raft_group0_upgrade - wait_for_peers_to_enter_synchronize_state: node 127.90.69.1 not in synchronize state yet...
raft_group0_upgrade - wait_for_peers_to_enter_synchronize_state: retrying in a while...
...
raft_group0_upgrade - Raft upgrade procedure taking longer than expected. Please check if all nodes are live and the network is healthy. If the upgrade procedure does not progress even though the cluster is healthy, try performing a rolling restart of the cluster. If that doesn't help or some nodes are dead and irrecoverable, manual recovery may be required. Consult the relevant documentation.
raft_group0_upgrade - wait_for_peers_to_enter_synchronize_state: node 127.90.69.1 not in synchronize state yet...
raft_group0_upgrade - wait_for_peers_to_enter_synchronize_state: retrying in a while...
.. TODO: the 'Consult the relevant documentation' message must be updated to point to this doc.
Note the following message:
.. code-block:: console
raft_group0_upgrade - Raft upgrade procedure taking longer than expected. Please check if all nodes are live and the network is healthy. If the upgrade procedure does not progress even though the cluster is healthy, try performing a rolling restart of the cluster. If that doesn't help or some nodes are dead and irrecoverable, manual recovery may be required. Consult the relevant documentation.
If the Raft upgrade procedure is stuck, this message will appear periodically in each node's logs.
The message suggests the initial course of action:
* Check if all nodes are alive.
* If a node is down but can be restarted, restart it.
* If all nodes are alive, ensure that the network is healthy: that every node is reachable from every other node.
* If all nodes are alive and the network is healthy, perform a :doc:`rolling restart </operating-scylla/procedures/config-change/rolling-restart/>` of the cluster.
One of the reasons why the procedure may get stuck is a pre-existing problem in schema definitions which causes schema to be unable to synchronize in the cluster. The procedure cannot proceed unless it ensures that schema is synchronized.
If **all nodes are alive and the network is healthy**, you performed a rolling restart, but the issue still persists, contact `ScyllaDB support <https://www.scylladb.com/product/support/>`_ for assistance.
If some nodes are **dead and irrecoverable**, you'll need to perform a manual recovery procedure. Consult :ref:`the section about Raft recovery <recover-raft-procedure>`.
Verifying that Raft is enabled
Verifying that Raft Is Enabled
===============================
You can verify that Raft is enabled on your cluster in one of the following ways:
.. versionadded:: 5.2
* Retrieve the list of supported features by running:
You can verify that Raft is enabled on your cluster by performing the following query on each node:
.. code-block:: sql
.. code-block:: sql
cqlsh> SELECT * FROM system.scylla_local WHERE key = 'group0_upgrade_state';
The query should return:
cqlsh> SELECT supported_features FROM system.local;
With Raft enabled, the list of supported features in the output includes ``SUPPORTS_RAFT_CLUSTER_MANAGEMENT``. For example:
.. code-block:: console
:class: hide-copy-button
supported_features
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
CDC,CDC_GENERATIONS_V2,COMPUTED_COLUMNS,CORRECT_COUNTER_ORDER,CORRECT_IDX_TOKEN_IN_SECONDARY_INDEX,CORRECT_NON_COMPOUND_RANGE_TOMBSTONES,CORRECT_STATIC_COMPACT_IN_MC,COUNTERS,DIGEST_FOR_NULL_VALUES,DIGEST_INSENSITIVE_TO_EXPIRY,DIGEST_MULTIPARTITION_READ,HINTED_HANDOFF_SEPARATE_CONNECTION,INDEXES,LARGE_PARTITIONS,LA_SSTABLE_FORMAT,LWT,MATERIALIZED_VIEWS,MC_SSTABLE_FORMAT,MD_SSTABLE_FORMAT,ME_SSTABLE_FORMAT,NONFROZEN_UDTS,PARALLELIZED_AGGREGATION,PER_TABLE_CACHING,PER_TABLE_PARTITIONERS,RANGE_SCAN_DATA_VARIANT,RANGE_TOMBSTONES,ROLES,ROW_LEVEL_REPAIR,SCHEMA_TABLES_V3,SEPARATE_PAGE_SIZE_AND_SAFETY_LIMIT,STREAM_WITH_RPC_STREAM,SUPPORTS_RAFT_CLUSTER_MANAGEMENT,TOMBSTONE_GC_OPTIONS,TRUNCATION_TABLE,UDA,UNBOUNDED_RANGE_TOMBSTONES,VIEW_VIRTUAL_COLUMNS,WRITE_FAILURE_REPLY,XXHASH
key | value
----------------------+--------------------------
group0_upgrade_state | use_post_raft_procedures
* Retrieve the list of experimental features by running:
(1 rows)
.. code-block:: sql
on every node.
If the query returns 0 rows, or ``value`` is ``synchronize`` or ``use_pre_raft_procedures``, it means that the cluster is in the middle of the internal Raft upgrade procedure; consult the :ref:`relevant section <verify-raft-procedure>`.
If ``value`` is ``recovery``, it means that the cluster is in the middle of the manual recovery procedure. The procedure must be finished. Consult :ref:`the section about Raft recovery <recover-raft-procedure>`.
If ``value`` is anything else, it might mean data corruption or a mistake when performing the manual recovery procedure. The value will be treated as if it was equal to ``recovery`` when the node is restarted.
cqlsh> SELECT value FROM system.config WHERE name = 'experimental_features'
With Raft enabled, the list of experimental features in the output includes ``raft``.
.. _raft-schema-changes:
@@ -236,23 +100,23 @@ Safe Schema Changes with Raft
-------------------------------
In ScyllaDB, schema is based on :doc:`Data Definition Language (DDL) </cql/ddl>`. In earlier ScyllaDB versions, schema changes were tracked via the gossip protocol, which might lead to schema conflicts if the updates are happening concurrently.
Implementing Raft eliminates schema conflicts and allows full automation of DDL changes under any conditions, as long as a quorum
Implementing Raft eliminates schema conflicts and allows full automation of DDL changes under any conditions, as long as a quorum
of nodes in the cluster is available. The following examples illustrate how Raft provides the solution to problems with schema changes.
* A network partition may lead to a split-brain case, where each subset of nodes has a different version of the schema.
With Raft, after a network split, the majority of the cluster can continue performing schema changes, while the minority needs to wait until it can rejoin the majority. Data manipulation statements on the minority can continue unaffected, provided the :ref:`quorum requirement <raft-quorum-requirement>` is satisfied.
* Two or more conflicting schema updates are happening at the same time. For example, two different columns with the same definition are simultaneously added to the cluster. There is no effective way to resolve the conflict - the cluster will employ the schema with the most recent timestamp, but changes related to the shadowed table will be lost.
* Two or more conflicting schema updates are happening at the same time. For example, two different columns with the same definition are simultaneously added to the cluster. There is no effective way to resolve the conflict - the cluster will employ the schema with the most recent timestamp, but changes related to the shadowed table will be lost.
With Raft, concurrent schema changes are safe.
With Raft, concurrent schema changes are safe.
In summary, Raft makes schema changes safe, but it requires that a quorum of nodes in the cluster is available.
.. _raft-handling-failures:
.. _raft-handliing-failures:
Handling Failures
------------------
@@ -277,10 +141,10 @@ Examples
- Try restarting the node. If the node is dead, :doc:`replace it with a new node </operating-scylla/procedures/cluster-management/replace-dead-node/>`.
* - 2 nodes
- Cluster is not fully operational. The data is available for reads and writes, but schema changes are impossible.
- Restart at least 1 of the 2 nodes that are down to regain quorum. If you can't recover at least 1 of the 2 nodes, consult the :ref:`manual Raft recovery section <recover-raft-procedure>`.
- Restart at least 1 of the 2 nodes that are down to regain quorum. If you can't recover at least 1 of the 2 nodes, contact `ScyllaDB support <https://www.scylladb.com/product/support/>`_ for assistance.
* - 1 datacenter
- Cluster is not fully operational. The data is available for reads and writes, but schema changes are impossible.
- When the DC comes back online, restart the nodes. If the DC does not come back online and nodes are lost, consult the :ref:`manual Raft recovery section <recover-raft-procedure>`.
- When the DC comes back online, restart the nodes. If the DC does not come back online and nodes are lost, :doc:`restore the latest cluster backup into a new cluster </operating-scylla/procedures/backup-restore/restore/>`. You can contact `ScyllaDB support <https://www.scylladb.com/product/support/>`_ for assistance.
.. list-table:: Cluster B: 2 datacenters, 6 nodes (3 nodes per DC)
@@ -295,10 +159,10 @@ Examples
- Try restarting the node(s). If the node is dead, :doc:`replace it with a new node </operating-scylla/procedures/cluster-management/replace-dead-node/>`.
* - 3 nodes
- Cluster is not fully operational. The data is available for reads and writes, but schema changes are impossible.
- Restart 1 of the 3 nodes that are down to regain quorum. If you can't recover at least 1 of the 3 failed nodes, consult the :ref:`manual Raft recovery section <recover-raft-procedure>`.
- Restart 1 of the 3 nodes that are down to regain quorum. If you can't recover at least 1 of the 3 failed nodes, contact `ScyllaDB support <https://www.scylladb.com/product/support/>`_ for assistance.
* - 1DC
- Cluster is not fully operational. The data is available for reads and writes, but schema changes are impossible.
- When the DCs come back online, restart the nodes. If the DC fails to come back online and the nodes are lost, consult the :ref:`manual Raft recovery section <recover-raft-procedure>`.
- When the DCs come back online, restart the nodes. If the DC fails to come back online and the nodes are lost, :doc:`restore the latest cluster backup into a new cluster </operating-scylla/procedures/backup-restore/restore/>`. You can contact `ScyllaDB support <https://www.scylladb.com/product/support/>`_ for assistance.
.. list-table:: Cluster C: 3 datacenter, 9 nodes (3 nodes per DC)
@@ -311,78 +175,13 @@ Examples
* - 1-4 nodes
- Schema updates are possible and safe.
- Try restarting the nodes. If the nodes are dead, :doc:`replace them with new nodes </operating-scylla/procedures/cluster-management/replace-dead-node-or-more/>`.
* - 1 DC
* - 1 DC
- Schema updates are possible and safe.
- When the DC comes back online, try restarting the nodes in the cluster. If the nodes are dead, :doc:`add 3 new nodes in a new region </operating-scylla/procedures/cluster-management/add-dc-to-existing-dc/>`.
* - 2 DCs
- Cluster is not fully operational. The data is available for reads and writes, but schema changes are impossible.
- When the DCs come back online, restart the nodes. If at least one DC fails to come back online and the nodes are lost, consult the :ref:`manual Raft recovery section <recover-raft-procedure>`.
.. _recover-raft-procedure:
Raft manual recovery procedure
==============================
.. versionadded:: 5.2
The manual Raft recovery procedure applies to the following situations:
* :ref:`The internal Raft upgrade procedure <verify-raft-procedure>` got stuck because one of your nodes failed in the middle of the procedure and is irrecoverable,
* or the cluster was running Raft but a majority of nodes (e.g. 2 out of 3) failed and are irrecoverable. Raft cannot progress unless a majority of nodes is available.
.. warning::
Perform the manual recovery procedure **only** if you're dealing with **irrecoverable** nodes. If it is possible to restart your nodes, do that instead of manual recovery.
.. warning::
Before proceeding, make sure that the irrecoverable nodes are truly dead, and not, for example, temporarily partitioned away due to a network failure. If it is possible for the 'dead' nodes to come back to life, they might communicate and interfere with the recovery procedure and cause unpredictable problems.
If you have no means of ensuring that these irrecoverable nodes won't come back to life and communicate with the rest of the cluster, set up firewall rules or otherwise isolate your alive nodes to reject any communication attempts from these dead nodes.
During the manual recovery procedure you'll enter a special ``RECOVERY`` mode, remove all faulty nodes (using the standard :doc:`node removal procedure </operating-scylla/procedures/cluster-management/remove-node/>`), delete the internal Raft data, and restart the cluster. This will cause the cluster to perform the internal Raft upgrade procedure again, initializing the Raft algorithm from scratch. The manual recovery procedure is applicable both to clusters which were not running Raft in the past and then had Raft enabled, and to clusters which were bootstrapped using Raft.
.. warning::
Entering ``RECOVERY`` mode requires a node restart. Restarting an additional node while some nodes are already dead may lead to unavailability of data queries (assuming that you haven't already lost availability). For example, if you're using the standard RF=3, CL=QUORUM setup, and you're recovering from a stuck upgrade procedure because one of your nodes is dead, restarting another node will cause temporary data query unavailability (until the node finishes restarting). Prepare your service for downtime before proceeding.
#. Perform the following query on **every alive node** in the cluster, using e.g. ``cqlsh``:
.. code-block:: cql
cqlsh> UPDATE system.scylla_local SET value = 'recovery' WHERE key = 'group0_upgrade_state';
#. Perform a :doc:`rolling restart </operating-scylla/procedures/config-change/rolling-restart/>` of your alive nodes.
#. Verify that all the nodes have entered ``RECOVERY`` mode when restarting; look for one of the following messages in their logs:
.. code-block:: console
group0_client - RECOVERY mode.
raft_group0 - setup_group0: Raft RECOVERY mode, skipping group 0 setup.
raft_group0_upgrade - RECOVERY mode. Not attempting upgrade.
#. Remove all your dead nodes using the :doc:`node removal procedure </operating-scylla/procedures/cluster-management/remove-node/>`.
#. Remove existing Raft cluster data by performing the following queries on **every alive node** in the cluster, using e.g. ``cqlsh``:
.. code-block:: cql
cqlsh> TRUNCATE TABLE system.discovery;
cqlsh> TRUNCATE TABLE system.group0_history;
cqlsh> DELETE value FROM system.scylla_local WHERE key = 'raft_group0_id';
#. Make sure that schema is synchronized in the cluster by executing :doc:`nodetool describecluster </operating-scylla/nodetool-commands/describecluster>` on each node and verifying that the schema version is the same on all nodes.
#. We can now leave ``RECOVERY`` mode. On **every alive node**, perform the following query:
.. code-block:: cql
cqlsh> DELETE FROM system.scylla_local WHERE key = 'group0_upgrade_state';
#. Perform a :doc:`rolling restart </operating-scylla/procedures/config-change/rolling-restart/>` of your alive nodes.
#. The Raft upgrade procedure will start anew. :ref:`Verify <verify-raft-procedure>` that it finishes successfully.
- When the DCs come back online, restart the nodes. If at least one DC fails to come back online and the nodes are lost, :doc:`restore the latest cluster backup into a new cluster </operating-scylla/procedures/backup-restore/restore/>`. You can contact `ScyllaDB support <https://www.scylladb.com/product/support/>`_ for assistance.
.. _raft-learn-more:

View File

@@ -13,7 +13,7 @@ sys.path.insert(0, os.path.abspath(".."))
# Build documentation for the following tags and branches
TAGS = []
BRANCHES = ["master", "branch-5.1"]
BRANCHES = ["master"]
# Set the latest version.
LATEST_VERSION = "master"
# Set which versions are not released yet.

View File

@@ -255,9 +255,7 @@ The following options only apply to IncrementalCompactionStrategy:
``space_amplification_goal`` (default: null)
:label-tip:`ScyllaDB Enterprise`
.. versionadded:: 2020.1.6
.. versionadded:: 2020.1.6 Scylla Enterprise
This is a threshold of the ratio of the sum of the sizes of the two largest tiers to the size of the largest tier,
above which ICS will automatically compact the second largest and largest tiers together to eliminate stale data that may have been overwritten, expired, or deleted.

View File

@@ -860,18 +860,6 @@ Other considerations:
- Adding new columns (see ``ALTER TABLE`` below) is a constant time operation. There is thus no need to try to
anticipate future usage when creating a table.
.. _ddl-per-parition-rate-limit:
Limiting the rate of requests per partition
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
You can limit the read rates and writes rates into a partition by applying
a ScyllaDB CQL extension to the CREATE TABLE or ALTER TABLE statements.
See `Per-partition rate limit <https://docs.scylladb.com/stable/cql/cql-extensions.html#per-partition-rate-limit>`_
for details.
.. REMOVE IN FUTURE VERSIONS - Remove the URL above (temporary solution) and replace it with a relative link (once the solution is applied).
.. _alter-table-statement:
ALTER TABLE
@@ -930,7 +918,6 @@ The ``ALTER TABLE`` statement can:
The same note applies to the set of ``compression`` sub-options.
- Change or add any of the ``Encryption options`` above.
- Change or add any of the :ref:`CDC options <cdc-options>` above.
- Change or add per-partition rate limits. See :ref:`Limiting the rate of requests per partition <ddl-per-parition-rate-limit>`.
.. warning:: Dropping a column assumes that the timestamps used for the value of this column are "real" timestamp in
microseconds. Using "real" timestamps in microseconds is the default is and is **strongly** recommended, but as
@@ -940,6 +927,7 @@ The ``ALTER TABLE`` statement can:
.. warning:: Once a column is dropped, it is allowed to re-add a column with the same name as the dropped one
**unless** the type of the dropped column was a (non-frozen) column (due to an internal technical limitation).
.. _drop-table-statement:
DROP TABLE

View File

@@ -142,7 +142,7 @@ You can read more about the ``TIMESTAMP`` retrieved by ``WRITETIME`` in the :ref
- ``TTL`` retrieves the remaining time to live (in *seconds*) for the value of the column, if it set to expire, or ``null`` otherwise.
You can read more about TTL in the :doc:`documentation </cql/time-to-live>` and also in `this Scylla University lesson <https://university.scylladb.com/courses/data-modeling/lessons/advanced-data-modeling/topic/expiring-data-with-ttl-time-to-live/>`_.
You can read more about TTL in the :doc:`documentation </cql/time-to-live>` and also in `this Scylla University lesson <https://university.scylladb.com/courses/data-modeling/lessons/advanced-data-modeling/topic/expiring-data-with-ttl-time-to-live/>`.
.. _where-clause:
@@ -774,7 +774,7 @@ parameters:
the columns themselves. This means that any subsequent update of the column will also reset the TTL (to whatever TTL
is specified in that update). By default, values never expire. A TTL of 0 is equivalent to no TTL. If the table has a
default_time_to_live, a TTL of 0 will remove the TTL for the inserted or updated values. A TTL of ``null`` is equivalent
to inserting with a TTL of 0. You can read more about TTL in the :doc:`documentation </cql/time-to-live>` and also in `this Scylla University lesson <https://university.scylladb.com/courses/data-modeling/lessons/advanced-data-modeling/topic/expiring-data-with-ttl-time-to-live/>`_.
to inserting with a TTL of 0. You can read more about TTL in the :doc:`documentation </cql/time-to-live>` and also in `this Scylla University lesson <https://university.scylladb.com/courses/data-modeling/lessons/advanced-data-modeling/topic/expiring-data-with-ttl-time-to-live/>`.
- ``TIMEOUT``: specifies a timeout duration for the specific request.
Please refer to the :ref:`SELECT <using-timeout>` section for more information.

View File

@@ -21,6 +21,7 @@
.. _cql-functions:
.. Need some intro for UDF and native functions in general and point those to it.
.. _udfs:
.. _native-functions:
Functions
@@ -32,15 +33,13 @@ CQL supports two main categories of functions:
- The :ref:`aggregate functions <aggregate-functions>`, which are used to aggregate multiple rows of results from a
``SELECT`` statement.
In both cases, CQL provides a number of native "hard-coded" functions as well as the ability to create new user-defined
functions.
.. In both cases, CQL provides a number of native "hard-coded" functions as well as the ability to create new user-defined
.. functions.
.. note:: Although user-defined functions are sandboxed, protecting the system from a "rogue" function, user-defined functions are disabled by default for extra security.
See the ``enable_user_defined_functions`` in ``scylla.yaml`` to enable them.
Additionally, user-defined functions are still experimental and need to be explicitly enabled by adding ``udf`` to the list of
``experimental_features`` configuration options in ``scylla.yaml``, or turning on the ``experimental`` flag.
See :ref:`Enabling Experimental Features <yaml_enabling_experimental_features>` for details.
.. .. note:: By default, the use of user-defined functions is disabled by default for security concerns (even when
.. enabled, the execution of user-defined functions is sandboxed and a "rogue" function should not be allowed to do
.. evil, but no sandbox is perfect so using user-defined functions is opt-in). See the ``enable_user_defined_functions``
.. in ``scylla.yaml`` to enable them.
.. A function is identifier by its name:
@@ -61,11 +60,11 @@ Native functions
Cast
````
Supported starting from ScyllaDB version 2.1
Supported starting from Scylla version 2.1
The ``cast`` function can be used to convert one native datatype to another.
The following table describes the conversions supported by the ``cast`` function. ScyllaDB will silently ignore any cast converting a cast datatype into its own datatype.
The following table describes the conversions supported by the ``cast`` function. Scylla will silently ignore any cast converting a cast datatype into its own datatype.
=============== =======================================================================================================
From To
@@ -229,65 +228,6 @@ A number of functions are provided to “convert” the native types into binary
takes a 64-bit ``blob`` argument and converts it to a ``bigint`` value. For example, ``bigintAsBlob(3)`` is
``0x0000000000000003`` and ``blobAsBigint(0x0000000000000003)`` is ``3``.
.. _udfs:
User-defined functions :label-caution:`Experimental`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
User-defined functions (UDFs) execute user-provided code in ScyllaDB. Supported languages are currently Lua and WebAssembly.
UDFs are part of the ScyllaDB schema and are automatically propagated to all nodes in the cluster.
UDFs can be overloaded, so that multiple UDFs with different argument types can have the same function name, for example::
CREATE FUNCTION sample ( arg int ) ...;
CREATE FUNCTION sample ( arg text ) ...;
When calling a user-defined function, arguments can be literals or terms. Prepared statement placeholders can be used, too.
CREATE FUNCTION statement
`````````````````````````
Creating a new user-defined function uses the ``CREATE FUNCTION`` statement. For example::
CREATE OR REPLACE FUNCTION div(dividend double, divisor double)
RETURNS NULL ON NULL INPUT
RETURNS double
LANGUAGE LUA
AS 'return dividend/divisor;';
``CREATE FUNCTION`` with the optional ``OR REPLACE`` keywords creates either a function
or replaces an existing one with the same signature. A ``CREATE FUNCTION`` without ``OR REPLACE``
fails if a function with the same signature already exists. If the optional ``IF NOT EXISTS``
keywords are used, the function will be created only if another function with the same
signature does not exist. ``OR REPLACE`` and ``IF NOT EXISTS`` cannot be used together.
Behavior for null input values must be defined for each function:
* ``RETURNS NULL ON NULL INPUT`` declares that the function will always return null (without being executed) if any of the input arguments is null.
* ``CALLED ON NULL INPUT`` declares that the function will always be executed.
Function Signature
``````````````````
Signatures are used to distinguish individual functions. The signature consists of a fully-qualified function name of the form <keyspace>.<function_name> and a concatenated list of all the argument types.
Note that keyspace names, function names and argument types are subject to the default naming conventions and case-sensitivity rules.
Functions belong to a keyspace; if no keyspace is specified, the current keyspace is used. User-defined functions are not allowed in the system keyspaces.
DROP FUNCTION statement
```````````````````````
Dropping a function uses the ``DROP FUNCTION`` statement. For example::
DROP FUNCTION myfunction;
DROP FUNCTION mykeyspace.afunction;
DROP FUNCTION afunction ( int );
DROP FUNCTION afunction ( text );
You must specify the argument types of the function, the arguments_signature, in the drop command if there are multiple overloaded functions with the same name but different signatures.
``DROP FUNCTION`` with the optional ``IF EXISTS`` keywords drops a function if it exists, but does not throw an error if it doesnt.
.. _aggregate-functions:
Aggregate functions
@@ -350,59 +290,6 @@ instance::
.. _user-defined-aggregates-functions:
User-defined aggregates (UDAs) :label-caution:`Experimental`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
User-defined aggregates allow the creation of custom aggregate functions. User-defined aggregates can be used in SELECT statement.
Each aggregate requires an initial state of type ``STYPE`` defined with the ``INITCOND`` value (default value: ``null``). The first argument of the state function must have type STYPE. The remaining arguments of the state function must match the types of the user-defined aggregate arguments. The state function is called once for each row, and the value returned by the state function becomes the new state. After all rows are processed, the optional FINALFUNC is executed with the last state value as its argument.
The ``STYPE`` value is mandatory in order to distinguish possibly overloaded versions of the state and/or final function, since the overload can appear after creation of the aggregate.
A complete working example for user-defined aggregates (assuming that a keyspace has been selected using the ``USE`` statement)::
CREATE FUNCTION accumulate_len(acc tuple<bigint,bigint>, a text)
RETURNS NULL ON NULL INPUT
RETURNS tuple<bigint,bigint>
LANGUAGE lua as 'return {acc[1] + 1, acc[2] + #a}';
CREATE OR REPLACE FUNCTION present(res tuple<bigint,bigint>)
RETURNS NULL ON NULL INPUT
RETURNS text
LANGUAGE lua as
'return "The average string length is " .. res[2]/res[1] .. "!"';
CREATE OR REPLACE AGGREGATE avg_length(text)
SFUNC accumulate_len
STYPE tuple<bigint,bigint>
FINALFUNC present
INITCOND (0,0);
CREATE AGGREGATE statement
``````````````````````````
The ``CREATE AGGREGATE`` command with the optional ``OR REPLACE`` keywords creates either an aggregate or replaces an existing one with the same signature. A ``CREATE AGGREGATE`` without ``OR REPLACE`` fails if an aggregate with the same signature already exists. The ``CREATE AGGREGATE`` command with the optional ``IF NOT EXISTS`` keywords creates an aggregate if it does not already exist. The ``OR REPLACE`` and ``IF NOT EXISTS`` phrases cannot be used together.
The ``STYPE`` value defines the type of the state value and must be specified. The optional ``INITCOND`` defines the initial state value for the aggregate; the default value is null. A non-null ``INITCOND`` must be specified for state functions that are declared with ``RETURNS NULL ON NULL INPUT``.
The ``SFUNC`` value references an existing function to use as the state-modifying function. The first argument of the state function must have type ``STYPE``. The remaining arguments of the state function must match the types of the user-defined aggregate arguments. The state function is called once for each row, and the value returned by the state function becomes the new state. State is not updated for state functions declared with ``RETURNS NULL ON NULL INPUT`` and called with null. After all rows are processed, the optional ``FINALFUNC`` is executed with the last state value as its argument. It must take only one argument with type ``STYPE``, but the return type of the ``FINALFUNC`` may be a different type. A final function declared with ``RETURNS NULL ON NULL INPUT`` means that the aggregate's return value will be null if the last state is null.
If no ``FINALFUNC`` is defined, the overall return type of the aggregate function is ``STYPE``. If a ``FINALFUNC`` is defined, it is the return type of that function.
DROP AGGREGATE statement
````````````````````````
Dropping a user-defined aggregate function uses the DROP AGGREGATE statement. For example::
DROP AGGREGATE myAggregate;
DROP AGGREGATE myKeyspace.anAggregate;
DROP AGGREGATE someAggregate ( int );
DROP AGGREGATE someAggregate ( text );
The ``DROP AGGREGATE`` statement removes an aggregate created using ``CREATE AGGREGATE``. You must specify the argument types of the aggregate to drop if there are multiple overloaded aggregates with the same name but a different signature.
The ``DROP AGGREGATE`` command with the optional ``IF EXISTS`` keywords drops an aggregate if it exists, and does nothing if a function with the signature does not exist.
.. include:: /rst_include/apache-cql-return-index.rst
.. include:: /rst_include/apache-copyrights.rst
.. include:: /rst_include/apache-copyrights.rst

View File

@@ -6,10 +6,8 @@ System Requirements
Supported Platforms
===================
ScyllaDB runs on 64-bit Linux. The x86_64 and AArch64 architectures are supported (AArch64 support includes AWS EC2 Graviton).
See :doc:`OS Support by Platform and Version </getting-started/os-support>` for information about
supported operating systems, distros, and versions.
Scylla runs on 64-bit Linux. Here, you can find which :doc:`operating systems, distros, and versions </getting-started/os-support>` are supported.
.. _system-requirements-hardware:
@@ -18,44 +16,39 @@ Hardware Requirements
Its recommended to have a balanced setup. If there are only 4-8 :term:`Logical Cores <Logical Core (lcore)>`, large disks or 10Gbps networking may not be needed.
This works in the opposite direction as well.
ScyllaDB can be used in many types of installation environments.
Scylla can be used in many types of installation environments.
To see which system would best suit your workload requirements, use the `ScyllaDB Sizing Calculator <https://price-calc.gh.scylladb.com/>`_ to customize ScyllaDB for your usage.
To see which system would best suit your workload requirements, use the `Scylla Sizing Calculator <https://price-calc.gh.scylladb.com/>`_ to customize Scylla for your usage.
Core Requirements
-----------------
ScyllaDB tries to maximize the resource usage of all system components. The shard-per-core approach allows linear scale-up with the number of cores. As you have more cores, it makes sense to balance the other resources, from memory to network.
Scylla tries to maximize the resource usage of all system components. The shard-per-core approach allows linear scale-up with the number of cores. As you have more cores, it makes sense to balance the other resources, from memory to network.
CPU
^^^
Scylla requires modern Intel CPUs that support the SSE4.2 instruction set and will not boot without it.
ScyllaDB requires modern Intel/AMD CPUs that support the SSE4.2 instruction set and will not boot without it.
The following CPUs are supported by Scylla:
* Intel core: Westmere or later (2010)
* Intel atom: Goldmont or later (2016)
* AMD low power: Jaguar or later (2013)
* AMD standard: Bulldozer or later (2011)
ScyllaDB supports the following CPUs:
* Intel core: Westmere and later (2010)
* Intel atom: Goldmont and later (2016)
* AMD low power: Jaguar and later (2013)
* AMD standard: Bulldozer and later (2011)
* Apple M1 and M2
* Ampere Altra
* AWS Graviton, Graviton2, Graviton3
In terms of the number of cores, any number will work since ScyllaDB scales up with the number of cores.
In terms of the number of cores, any number will work since Scylla scales up with the number of cores.
A practical approach is to use a large number of cores as long as the hardware price remains reasonable.
Between 20-60 logical cores (including hyperthreading) is a recommended number. However, any number will fit.
When using virtual machines, containers, or the public cloud, remember that each virtual CPU is mapped to a single logical core, or thread.
Allow ScyllaDB to run independently without any additional CPU intensive tasks on the same server/cores as Scylla.
Allow Scylla to run independently without any additional CPU intensive tasks on the same server/cores as Scylla.
.. _system-requirements-memory:
Memory Requirements
-------------------
The more memory available, the better ScyllaDB performs, as ScyllaDB uses all of the available memory for caching. The wider the rows are in the schema, the more memory will be required. 64 GB-256 GB is the recommended range for a medium to high workload. Memory requirements are calculated based on the number of :abbr:`lcores (logical cores)` you are using in your system.
The more memory available, the better Scylla performs, as Scylla uses all of the available memory for caching. The wider the rows are in the schema, the more memory will be required. 64 GB-256 GB is the recommended range for a medium to high workload. Memory requirements are calculated based on the number of :abbr:`lcores (logical cores)` you are using in your system.
* Recommended size: 16 GB or 2GB per lcore (whichever is higher)
* Maximum: 1 TiB per lcore, up to 256 lcores
@@ -71,7 +64,7 @@ Disk Requirements
SSD
^^^
We highly recommend SSD and local disks. ScyllaDB is built for a large volume of data and large storage per node.
We highly recommend SSD and local disks. Scylla is built for a large volume of data and large storage per node.
You can use up to 100:1 Disk/RAM ratio, with 30:1 Disk/RAM ratio as a good rule of thumb; for example, 30 TB of storage requires 1 TB of RAM.
We recommend a RAID-0 setup and a replication factor of 3 within the local datacenter (RF=3) when there are multiple drives.
@@ -81,7 +74,7 @@ HDDs are supported but may become a bottleneck. Some workloads may work with HDD
Disk Space
^^^^^^^^^^
ScyllaDB is flushing memtables to SSTable data files for persistent storage. SSTables are periodically compacted to improve performance by merging and rewriting data and discarding the old one. Depending on compaction strategy, disk space utilization temporarily increases during compaction. For this reason, you should leave an adequate amount of free disk space available on a node.
Scylla is flushing memtables to SSTable data files for persistent storage. SSTables are periodically compacted to improve performance by merging and rewriting data and discarding the old one. Depending on compaction strategy, disk space utilization temporarily increases during compaction. For this reason, you should leave an adequate amount of free disk space available on a node.
Use the following table as a guidelines for the minimum disk space requirements based on the compaction strategy:
====================================== =========== ============
@@ -96,7 +89,7 @@ Time-window Compaction Strategy (TWCS) 50% 70%
Incremental Compaction Strategy (ICS) 70% 80%
====================================== =========== ============
Use the default ICS (ScyllaDB Enterprise) or STCS (ScyllaDB Open Source) unless you'll have a clear understanding that another strategy is better for your use case. More on :doc:`choosing a Compaction Strategy </architecture/compaction/compaction-strategies>`.
Use the default ICS (Scylla Enterprise) or STCS (Scylla Open Source) unless you'll have a clear understanding that another strategy is better for your use case. More on :doc:`choosing a Compaction Strategy </architecture/compaction/compaction-strategies>`.
In order to maintain a high level of service availability, keep 50% to 20% free disk space at all times!
.. _system-requirements-network:
@@ -104,7 +97,7 @@ In order to maintain a high level of service availability, keep 50% to 20% free
Network Requirements
====================
A network speed of 10 Gbps or more is recommended, especially for large nodes. To tune the interrupts and their queues, run the ScyllaDB setup scripts.
A network speed of 10 Gbps or more is recommended, especially for large nodes. To tune the interrupts and their queues, run the Scylla setup scripts.
Cloud Instance Recommendations
@@ -113,25 +106,20 @@ Cloud Instance Recommendations
Amazon Web Services (AWS)
--------------------------------
* The recommended instance types are :ref:`i3 <system-requirements-i3-instances>`, :ref:`i3en <system-requirements-i3en-instances>`, and :ref:`i4i <system-requirements-i4i-instances>`.
* We recommend using enhanced networking that exposes the physical network cards to the VM.
.. note::
Some of the ScyllaDB configuration features rely on querying instance metadata.
Disabling access to instance metadata will impact using Ec2 Snitches and tuning performance.
See `AWS - Configure the instance metadata options <https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/configuring-instance-metadata-options.html>`_ for more information.
.. _system-requirements-i3-instances:
i3 instances
^^^^^^^^^^^^
This family includes the High Storage Instances that provide very fast SSD-backed instance storage optimized for very high random I/O performance and provide high IOPS at a low cost. We recommend using enhanced networking that exposes the physical network cards to the VM.
We highly recommend EC2 **I3** instances—High I/O. This family includes the High Storage Instances that provide very fast SSD-backed instance storage optimized for very high random I/O performance and provide high IOPS at a low cost. We recommend using enhanced networking that exposes the physical network cards to the VM.
i3 instances are designed for I/O intensive workloads and equipped with super-efficient NVMe SSD storage. It can deliver up to 3.3 Million IOPS.
An i3 instance is great for low latency and high throughput. Compared to the i2 instances, the i3 instance provides storage that is less expensive and denser, along with the ability to deliver substantially more IOPS and more network bandwidth per CPU core.
i3 instances
^^^^^^^^^^^^
=========================== =========== ============ =====================
Model vCPU Mem (GB) Storage (NVMe SSD)
=========================== =========== ============ =====================
@@ -152,15 +140,13 @@ i3.metal New in version 2.3 72 :sup:`*` 512 8 x 1.9 NVMe SSD
Source: `Amazon EC2 I3 Instances <https://aws.amazon.com/ec2/instance-types/i3/>`_
More on using ScyllaDB with `i3.metal vs i3.16xlarge <https://www.scylladb.com/2018/06/21/impact-virtualization-database/>`_
.. _system-requirements-i3en-instances:
More on using Scylla with `i3.metal vs i3.16xlarge <https://www.scylladb.com/2018/06/21/impact-virtualization-database/>`_
i3en instances
^^^^^^^^^^^^^^
i3en instances have up to 4x the networking bandwidth of i3 instances, enabling up to 100 Gbps of sustained network bandwidth.
i3en support is available for ScyllaDB Enterprise 2019.1.1 and higher and ScyllaDB Open Source 3.1 and higher.
i3en support is available for Scylla Enterprise 2019.1.1 and higher and Scylla Open Source 3.1 and higher.
=========================== =========== ============ =====================
@@ -191,12 +177,12 @@ All i3en instances have the following specs:
See `Amazon EC2 I3en Instances <https://aws.amazon.com/ec2/instance-types/i3en/>`_ for details.
.. _system-requirements-i4i-instances:
i4i instances
^^^^^^^^^^^^^^
i4i support is available for ScyllaDB Open Source 5.0 and later and ScyllaDB Enterprise 2021.1.10 and later.
=========================== =========== ============ =====================
Model vCPU Mem (GB) Storage (NVMe SSD)
=========================== =========== ============ =====================
@@ -217,7 +203,7 @@ i4i.32xlarge 128 1,024 8 x 3,750 GB
i4i.metal 128 1,024 8 x 3,750 GB
=========================== =========== ============ =====================
All i4i instances have the following specs:
All i41 instances have the following specs:
* 3.5 GHz all-core turbo Intel® Xeon® Scalable (Ice Lake) processors
* 40 Gbps bandwidth to EBS in the largest size and up to 10 Gbps in the four smallest sizes (twice that of i3 instances. Up to 75 Gbps networking bandwidth (three times more than I3 instances).
@@ -230,15 +216,11 @@ See `ScyllaDB on the New AWS EC2 I4i Instances: Twice the Throughput & Lower Lat
learn more about using ScyllaDB with i4i instances.
Im4gn and Is4gen instances
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ScyllaDB supports Arm-based Im4gn and Is4gen instances. See `Amazon EC2 Im4gn and Is4gen instances <https://aws.amazon.com/ec2/instance-types/i4g/>`_ for specification details.
Google Compute Engine (GCE)
-----------------------------------
Pick a zone where Haswell CPUs are found. Local SSD performance offers, according to Google, less than 1 ms of latency and up to 680,000 read IOPS and 360,000 write IOPS.
Image with NVMe disk interface is recommended, CentOS 7 for ScyllaDB Enterprise 2020.1 and older, and Ubuntu 20 for 2021.1 and later.
Image with NVMe disk interface is recommended, CentOS 7 for Scylla Enterprise 2020.1 and older, and Ubuntu 20 for 2021.1 and later.
(`More info <https://cloud.google.com/compute/docs/disks/local-ssd>`_)
Recommended instances types are `n1-highmem <https://cloud.google.com/compute/docs/general-purpose-machines#n1_machines>`_ and `n2-highmem <https://cloud.google.com/compute/docs/general-purpose-machines#n2_machines>`_

View File

@@ -25,31 +25,29 @@
<div class="grid-x grid-margin-x hs">
.. topic-box::
:title: New to ScyllaDB? Start here!
:link: https://cloud.docs.scylladb.com/stable/scylladb-basics/
:class: large-4
:anchor: ScyllaDB Basics
Learn the essentials of ScyllaDB.
.. topic-box::
:title: Let us manage your DB
:link: https://cloud.docs.scylladb.com
:class: large-4
:anchor: ScyllaDB Cloud Documentation
:anchor: Get Started with Scylla Cloud
Simplify application development with ScyllaDB Cloud - a fully managed database-as-a-service.
Take advantage of Scylla Cloud, a fully-managed database-as-a-service.
.. topic-box::
:title: Manage your own DB
:link: getting-started
:class: large-4
:anchor: ScyllaDB Open Source and Enterprise Documentation
:anchor: Get Started with Scylla
Deploy and manage your database in your own environment.
Provision and manage a Scylla cluster in your environment.
.. topic-box::
:title: Connect your application to Scylla
:link: using-scylla/drivers
:class: large-4
:anchor: Choose a Driver
Use high performance Scylla drivers to connect your application to a Scylla cluster.
.. raw:: html
@@ -59,13 +57,14 @@
<div class="topics-grid topics-grid--products">
<h2 class="topics-grid__title">Our Products</h2>
<h2 class="topics-grid__title">Our Product List</h2>
<p class="topics-grid__text">To begin choose a product from the list below</p>
<div class="grid-container full">
<div class="grid-x grid-margin-x">
.. topic-box::
:title: ScyllaDB Enterprise
:title: Scylla Enterprise
:link: getting-started
:image: /_static/img/mascots/scylla-enterprise.svg
:class: topic-box--product,large-3,small-6
@@ -73,7 +72,7 @@
ScyllaDBs most stable high-performance enterprise-grade NoSQL database.
.. topic-box::
:title: ScyllaDB Open Source
:title: Scylla Open Source
:link: getting-started
:image: /_static/img/mascots/scylla-opensource.svg
:class: topic-box--product,large-3,small-6
@@ -81,15 +80,15 @@
A high-performance NoSQL database with a close-to-the-hardware, shared-nothing approach.
.. topic-box::
:title: ScyllaDB Cloud
:title: Scylla Cloud
:link: https://cloud.docs.scylladb.com
:image: /_static/img/mascots/scylla-cloud.svg
:class: topic-box--product,large-3,small-6
A fully managed NoSQL database as a service powered by ScyllaDB Enterprise.
A fully managed NoSQL database as a service powered by Scylla Enterprise.
.. topic-box::
:title: ScyllaDB Alternator
:title: Scylla Alternator
:link: https://docs.scylladb.com/stable/alternator/alternator.html
:image: /_static/img/mascots/scylla-alternator.svg
:class: topic-box--product,large-3,small-6
@@ -97,23 +96,23 @@
Open source Amazon DynamoDB-compatible API.
.. topic-box::
:title: ScyllaDB Monitoring Stack
:title: Scylla Monitoring Stack
:link: https://monitoring.docs.scylladb.com
:image: /_static/img/mascots/scylla-monitor.svg
:class: topic-box--product,large-3,small-6
Complete open source monitoring solution for your ScyllaDB clusters.
Complete open source monitoring solution for your Scylla clusters.
.. topic-box::
:title: ScyllaDB Manager
:title: Scylla Manager
:link: https://manager.docs.scylladb.com
:image: /_static/img/mascots/scylla-manager.svg
:class: topic-box--product,large-3,small-6
Hassle-free ScyllaDB NoSQL database management for scale-out clusters.
Hassle-free Scylla NoSQL database management for scale-out clusters.
.. topic-box::
:title: ScyllaDB Drivers
:title: Scylla Drivers
:link: https://docs.scylladb.com/stable/using-scylla/drivers/
:image: /_static/img/mascots/scylla-drivers.svg
:class: topic-box--product,large-3,small-6
@@ -121,12 +120,12 @@
Shard-aware drivers for superior performance.
.. topic-box::
:title: ScyllaDB Operator
:title: Scylla Operator
:link: https://operator.docs.scylladb.com
:image: /_static/img/mascots/scylla-enterprise.svg
:class: topic-box--product,large-3,small-6
Easily run and manage your ScyllaDB cluster on Kubernetes.
Easily run and manage your Scylla Cluster on Kubernetes.
.. raw:: html
@@ -136,19 +135,19 @@
<div class="topics-grid">
<h2 class="topics-grid__title">Learn More About ScyllaDB</h2>
<h2 class="topics-grid__title">Learn More About Scylla</h2>
<p class="topics-grid__text"></p>
<div class="grid-container full">
<div class="grid-x grid-margin-x">
.. topic-box::
:title: Attend ScyllaDB University
:title: Attend Scylla University
:link: https://university.scylladb.com/
:image: /_static/img/mascots/scylla-university.png
:class: large-6,small-12
:anchor: Find a Class
| Register to take a *free* class at ScyllaDB University.
| Register to take a *free* class at Scylla University.
| There are several learning paths to choose from.
.. topic-box::
@@ -179,9 +178,9 @@
architecture/index
troubleshooting/index
kb/index
ScyllaDB University <https://university.scylladb.com/>
Scylla University <https://university.scylladb.com/>
faq
Contribute to ScyllaDB <contribute>
Contribute to Scylla <contribute>
glossary
alternator/alternator

View File

@@ -29,7 +29,7 @@ There are two types of compactions:
* Major Compaction
A user triggers (using nodetool) a compaction over all SSTables, merging the individual tables according to the selected compaction strategy.
.. caution:: It is always best to allow Scylla to automatically run minor compactions. Major compactions can exhaust resources, increase operational costs, and take up valuable disk space. This requires you to have 50% more disk space than your data unless you are using :ref:`Incremental compaction strategy (ICS) <incremental-compaction-strategy-ics>`.
.. caution:: It is always best to allow Scylla to automatically run minor compactions. Major compactions can exhaust resources, increase operational costs, and take up valuable disk space. This requires you to have 50% more disk space than your data unless you are using `Incremental compaction strategy (ICS)`_.
View Compaction Statistics
--------------------------
@@ -43,7 +43,7 @@ A compaction strategy is what determines which of the SSTables will be compacted
* `Size-tiered compaction strategy (STCS)`_ - (default setting) triggered when the system has enough similarly sized SSTables.
* `Leveled compaction strategy (LCS)`_ - the system uses small, fixed-size (by default 160 MB) SSTables divided into different levels and lowers both Read and Space Amplification.
* :ref:`Incremental compaction strategy (ICS) <incremental-compaction-strategy-ics>` - :label-tip:`ScyllaDB Enterprise` Uses runs of sorted, fixed size (by default 1 GB) SSTables in a similar way that LCS does, organized into size-tiers, similar to STCS size-tiers. If you are an Enterprise customer ICS is an updated strategy meant to replace STCS. It has the same read and write amplification, but has lower space amplification due to the reduction of temporary space overhead is reduced to a constant manageable level.
* `Incremental compaction strategy (ICS)`_ - Available for Enterprise customers, uses runs of sorted, fixed size (by default 1 GB) SSTables in a similar way that LCS does, organized into size-tiers, similar to STCS size-tiers. If you are an Enterprise customer ICS is an updated strategy meant to replace STCS. It has the same read and write amplification, but has lower space amplification due to the reduction of temporary space overhead is reduced to a constant manageable level.
* `Time-window compaction strategy (TWCS)`_ - designed for time series data and puts data in time order. This strategy replaced Date-tiered compaction. TWCS uses STCS to prevent accumulating SSTables in a window not yet closed. When the window closes, TWCS works towards reducing the SSTables in a time window to one.
* `Date-tiered compaction strategy (DTCS)`_ - designed for time series data, but TWCS should be used instead.
@@ -116,10 +116,12 @@ Likewise, when :term:`bootstrapping<Bootstrap>` a new node, SSTables are streame
.. _incremental-compaction-strategy-ics:
Incremental Compaction Strategy (ICS) :label-tip:`ScyllaDB Enterprise`
------------------------------------------------------------------------
Incremental Compaction Strategy (ICS)
-------------------------------------
.. versionadded:: 2019.1.4
.. versionadded:: 2019.1.4 Scylla Enterprise
.. include:: /rst_include/enterprise-only-note.rst
One of the issues with Size-tiered compaction is that it needs temporary space because SSTables are not removed until they are fully compacted. ICS takes a different approach and splits each large SSTable into a run of sorted, fixed-size (by default 1 GB) SSTables (a.k.a. fragments) in the same way that LCS does, except it treats the entire run and not the individual SSTables as the sizing file for STCS. As the run-fragments are small, the SSTables compact quickly, allowing individual SSTables to be removed as soon as they are compacted. This approach uses low amounts of memory and temporary disk space.

View File

@@ -42,7 +42,7 @@ Steps:
.. code-block:: sh
nodetool compact <keyspace> <mytable>;
nodetool compact <keyspace>.<mytable>;
5. Alter the table and change the grace period back to the original ``gc_grace_seconds`` value.

View File

@@ -27,7 +27,8 @@ endpoint_snitch GossipingPropertyFileSnitch
**Important**
If the node has two physical network interfaces in a multi-datacenter installation, set ``listen_address`` to this node's private IP or hostname.
If the node has two physical network interfaces in a multi-datacenter installation.
Set ``listen_address`` to this node's private IP or hostname.
Set ``broadcast_address`` to the second IP or hostname (for communication between data centers).
Set ``listen_on_broadcast_address`` to true.
Open the storage_port or ssl_storage_port on the public IP firewall.

View File

@@ -102,4 +102,4 @@ Cluster Management Procedures
Procedures for handling failures and practical examples of different scenarios.
* :ref:`Handling Failures<raft-handling-failures>`
* :ref:`Handling Failures<raft-handliing-failures>`

View File

@@ -2,18 +2,12 @@
Scylla Auditing Guide
=====================
:label-tip:`ScyllaDB Enterprise`
.. include:: /rst_include/enterprise-only-note.rst
Auditing allows the administrator to monitor activities on a Scylla cluster, including queries and data changes.
The information is stored in a Syslog or a Scylla table.
Prerequisite
------------
Enable ScyllaDB :doc:`Authentication </operating-scylla/security/authentication>` and :doc:`Authorization </operating-scylla/security/enable-authorization>`.
Enabling Audit
---------------

View File

@@ -2,13 +2,11 @@
Encryption at Rest
==================
:label-tip:`ScyllaDB Enterprise`
.. versionadded:: 2019.1.1
.. versionchanged:: 2019.1.3
.. versionadded:: 2019.1.1 Scylla Enterprise
.. versionchanged:: 2019.1.3 Scylla Enterprise
Introduction
=============
.. include:: /rst_include/enterprise-only-note.rst
Scylla Enterprise protects your sensitive data with data-at-rest encryption.
It protects the privacy of your user's data, reduces the risk of data breaches, and helps meet regulatory requirements.

View File

@@ -7,9 +7,9 @@ LDAP Authentication
saslauthd
:label-tip:`ScyllaDB Enterprise`
.. include:: /rst_include/enterprise-only-note.rst
.. versionadded:: 2021.1.2
.. versionadded:: Scylla Enterprise 2021.1.2
Scylla supports user authentication via an LDAP server by leveraging the SaslauthdAuthenticator.
By configuring saslauthd correctly against your LDAP server, you enable Scylla to check the users credentials through it.

View File

@@ -2,14 +2,14 @@
LDAP Authorization (Role Management)
=====================================
:label-tip:`ScyllaDB Enterprise`
.. versionadded:: 2021.1.2
.. include:: /rst_include/enterprise-only-note.rst
Scylla Enterprise customers can manage and authorize users privileges via an :abbr:`LDAP (Lightweight Directory Access Protocol)` server.
LDAP is an open, vendor-neutral, industry-standard protocol for accessing and maintaining distributed user access control over a standard IP network.
If your users are already stored in an LDAP directory, you can now use the same LDAP server to regulate their roles in Scylla.
.. versionadded:: Scylla Enterprise 2021.1.2
Introduction
------------

View File

@@ -0,0 +1 @@
.. note:: This feature is only available with Scylla Enterprise. If you are using Scylla Open Source, this feature will not be available.

View File

@@ -1,6 +1,6 @@
=========================
Troubleshooting ScyllaDB
=========================
======================
Troubleshooting Scylla
======================
.. toctree::
:hidden:
@@ -8,7 +8,6 @@ Troubleshooting ScyllaDB
support/index
startup/index
upgrade/index
cluster/index
modeling/index
storage/index
@@ -25,14 +24,13 @@ Keep your versions up-to-date. The two latest versions are supported. Also, alwa
:id: "getting-started"
:class: my-panel
* :doc:`Errors and ScyllaDB Customer Support <support/index>`
* :doc:`ScyllaDB Startup <startup/index>`
* :doc:`ScyllaDB Cluster and Node <cluster/index>`
* :doc:`ScyllaDB Upgrade <upgrade/index>`
* :doc:`Errors and Scylla Customer Support <support/index>`
* :doc:`Scylla Startup <startup/index>`
* :doc:`Scylla Cluster and Node <cluster/index>`
* :doc:`Data Modeling <modeling/index>`
* :doc:`Data Storage and SSTables <storage/index>`
* :doc:`CQL errors <CQL/index>`
* :doc:`ScyllaDB Monitoring and Scylla Manager <monitor/index>`
* :doc:`Scylla Monitoring and Scylla Manager <monitor/index>`
Also check out the `Monitoring lesson <https://university.scylladb.com/courses/scylla-operations/lessons/scylla-monitoring/>`_ on Scylla University, which covers how to troubleshoot different issues when running a Scylla cluster.

View File

@@ -1,79 +0,0 @@
Inaccessible "/var/lib/scylla" and "/var/lib/systemd/coredump" after ScyllaDB upgrade
======================================================================================
Problem
^^^^^^^
When you reboot the machine after a ScyllaDB upgrade, you cannot access data directories under ``/var/lib/scylla``, and
coredump saves to ``rootfs``.
The problem may occur when you upgrade ScyllaDB Open Source 4.6 or later to a version of ScyllaDB Enterprise if
the ``/etc/systemd/system/var-lib-scylla.mount`` and ``/etc/systemd/system/var-lib-systemd-coredump.mount`` are
deleted by RPM.
To avoid losing the files, the upgrade procedure includes a step to backup the .mount files. The following
example shows the command to backup the files before the :doc:`upgrade from version 5.0 </upgrade/upgrade-to-enterprise/upgrade-guide-from-5.0-to-2022.1/upgrade-guide-from-5.0-to-2022.1-rpm/>`:
.. code-block:: console
for conf in $( rpm -qc $(rpm -qa | grep scylla) | grep -v contains ) /etc/systemd/system/{var-lib-scylla,var-lib-systemd-coredump}.mount; do sudo cp -v $conf $conf.backup-5.0; done
If you don't backup the .mount files before the upgrade, the files may be lost.
Solution
^^^^^^^^
If you didn't backup the .mount files before the upgrade and the files were deleted during the upgrade,
you need to restore them manually.
To restore ``/etc/systemd/system/var-lib-systemd-coredump.mount``, run the following:
.. code-block:: console
$ cat << EOS | sudo tee /etc/systemd/system/var-lib-systemd-coredump.mount
[Unit]
Description=Save coredump to scylla data directory
Conflicts=umount.target
Before=scylla-server.service
After=local-fs.target
DefaultDependencies=no
[Mount]
What=/var/lib/scylla/coredump
Where=/var/lib/systemd/coredump
Type=none
Options=bind
[Install]
WantedBy=multi-user.target
EOS
To restore ``/etc/systemd/system/var-lib-scylla.mount``, run the following (specifying your data disk):
.. code-block:: console
$ UUID=`blkid -s UUID -o value <specify your data disk, eg: /dev/md0>`
$ cat << EOS | sudo tee /etc/systemd/system/var-lib-scylla.mount
[Unit]
Description=Scylla data directory
Before=scylla-server.service
After=local-fs.target
DefaultDependencies=no
[Mount]
What=/dev/disk/by-uuid/$UUID
Where=/var/lib/scylla
Type=xfs
Options=noatime
[Install]
WantedBy=multi-user.target
EOS
After restoring .mount files, you need to enable them:
.. code-block:: console
$ sudo systemctl daemon-reload
$ sudo systemctl enable --now var-lib-scylla.mount
$ sudo systemctl enable --now var-lib-systemd-coredump.mount
.. include:: /troubleshooting/_common/ts-return.rst

View File

@@ -1,16 +0,0 @@
Upgrade
=================
.. toctree::
:hidden:
:maxdepth: 2
Inaccessible configuration files after ScyllaDB upgrade </troubleshooting/missing-dotmount-files>
.. panel-box::
:title: Upgrade Issues
:id: "getting-started"
:class: my-panel
* :doc:`Inaccessible "/var/lib/scylla" and "/var/lib/systemd/coredump" after ScyllaDB upgrade </troubleshooting/missing-dotmount-files>`

View File

@@ -5,8 +5,7 @@ Upgrade ScyllaDB Open Source
.. toctree::
:hidden:
ScyllaDB 5.1 to 5.2 <upgrade-guide-from-5.1-to-5.2/index>
ScyllaDB 5.0 to 5.1 <upgrade-guide-from-5.0-to-5.1/index>
ScyllaDB 5.1 to 5.1 <upgrade-guide-from-5.0-to-5.1/index>
ScyllaDB 5.x maintenance release <upgrade-guide-from-5.x.y-to-5.x.z/index>
ScyllaDB 4.6 to 5.0 <upgrade-guide-from-4.6-to-5.0/index>
ScyllaDb 4.5 to 4.6 <upgrade-guide-from-4.5-to-4.6/index>
@@ -37,7 +36,6 @@ Upgrade ScyllaDB Open Source
Procedures for upgrading to a newer version of ScyllaDB Open Source.
* :doc:`Upgrade Guide - ScyllaDB 5.1 to 5.2 <upgrade-guide-from-5.1-to-5.2/index>`
* :doc:`Upgrade Guide - ScyllaDB 5.0 to 5.1 <upgrade-guide-from-5.0-to-5.1/index>`
* :doc:`Upgrade Guide - ScyllaDB 5.x maintenance releases <upgrade-guide-from-5.x.y-to-5.x.z/index>`
* :doc:`Upgrade Guide - ScyllaDB 4.6 to 5.0 <upgrade-guide-from-4.6-to-5.0/index>`

View File

@@ -6,7 +6,10 @@ Upgrade Guide - ScyllaDB 5.0 to 5.1
:maxdepth: 2
:hidden:
ScyllaDB <upgrade-guide-from-5.0-to-5.1-generic>
ScyllaDB Image <upgrade-guide-from-5.0-to-5.1-image>
Red Hat Enterprise Linux and CentOS <upgrade-guide-from-5.0-to-5.1-rpm>
Ubuntu <upgrade-guide-from-5.0-to-5.1-ubuntu>
Debian <upgrade-guide-from-5.0-to-5.1-debian>
Metrics <metric-update-5.0-to-5.1>
.. panel-box::
@@ -17,5 +20,8 @@ Upgrade Guide - ScyllaDB 5.0 to 5.1
Upgrade guides are available for:
* :doc:`Upgrade ScyllaDB from 5.0.x to 5.1.y <upgrade-guide-from-5.0-to-5.1-generic>`
* :doc:`Upgrade ScyllaDB Image from 5.0.x to 5.1.y <upgrade-guide-from-5.0-to-5.1-image>`
* :doc:`Upgrade ScyllaDB from 5.0.x to 5.1.y on Red Hat Enterprise Linux and CentOS <upgrade-guide-from-5.0-to-5.1-rpm>`
* :doc:`Upgrade ScyllaDB from 5.0.x to 5.1.y on Ubuntu <upgrade-guide-from-5.0-to-5.1-ubuntu>`
* :doc:`Upgrade ScyllaDB from 5.0.x to 5.1.y on Debian <upgrade-guide-from-5.0-to-5.1-debian>`
* :doc:`ScyllaDB Metrics Update - Scylla 5.0 to 5.1 <metric-update-5.0-to-5.1>`

View File

@@ -0,0 +1,13 @@
.. |OS| replace:: Debian
.. |ROLLBACK| replace:: rollback
.. _ROLLBACK: ./#rollback-procedure
.. |SRC_VERSION| replace:: 5.0
.. |NEW_VERSION| replace:: 5.1
.. |SCYLLA_NAME| replace:: ScyllaDB
.. |PKG_NAME| replace:: scylla
.. |SCYLLA_REPO| replace:: ScyllaDB deb repo
.. _SCYLLA_REPO: https://www.scylladb.com/download/?platform=debian-10&version=scylla-5.1
.. |SCYLLA_METRICS| replace:: ScyllaDB Metrics Update - Scylla 5.0 to 5.1
.. _SCYLLA_METRICS: ../metric-update-5.0-to-5.1
.. |UPGRADE_NOTES| replace:: _
.. include:: /upgrade/_common/upgrade-guide-v5-ubuntu-and-debian.rst

View File

@@ -1,359 +0,0 @@
.. |SCYLLA_NAME| replace:: ScyllaDB
.. |SRC_VERSION| replace:: 5.0
.. |NEW_VERSION| replace:: 5.1
.. |DEBIAN_SRC_REPO| replace:: Debian
.. _DEBIAN_SRC_REPO: https://www.scylladb.com/download/?platform=debian-10&version=scylla-5.0
.. |UBUNTU_SRC_REPO| replace:: Ubuntu
.. _UBUNTU_SRC_REPO: https://www.scylladb.com/download/?platform=ubuntu-20.04&version=scylla-5.0
.. |SCYLLA_DEB_SRC_REPO| replace:: ScyllaDB deb repo (|DEBIAN_SRC_REPO|_, |UBUNTU_SRC_REPO|_)
.. |SCYLLA_RPM_SRC_REPO| replace:: ScyllaDB rpm repo
.. _SCYLLA_RPM_SRC_REPO: https://www.scylladb.com/download/?platform=centos&version=scylla-5.0
.. |DEBIAN_NEW_REPO| replace:: Debian
.. _DEBIAN_NEW_REPO: https://www.scylladb.com/download/?platform=debian-10&version=scylla-5.1
.. |UBUNTU_NEW_REPO| replace:: Ubuntu
.. _UBUNTU_NEW_REPO: https://www.scylladb.com/download/?platform=ubuntu-20.04&version=scylla-5.1
.. |SCYLLA_DEB_NEW_REPO| replace:: ScyllaDB deb repo (|DEBIAN_NEW_REPO|_, |UBUNTU_NEW_REPO|_)
.. |SCYLLA_RPM_NEW_REPO| replace:: ScyllaDB rpm repo
.. _SCYLLA_RPM_NEW_REPO: https://www.scylladb.com/download/?platform=centos&version=scylla-5.1
.. |ROLLBACK| replace:: rollback
.. _ROLLBACK: ./#rollback-procedure
.. |SCYLLA_METRICS| replace:: Scylla Metrics Update - Scylla 5.0 to 5.1
.. _SCYLLA_METRICS: ../metric-update-5.0-to-5.1
=============================================================================
Upgrade Guide - |SCYLLA_NAME| |SRC_VERSION| to |NEW_VERSION|
=============================================================================
This document is a step by step procedure for upgrading from |SCYLLA_NAME| |SRC_VERSION| to |SCYLLA_NAME| |NEW_VERSION|, and rollback to version |SRC_VERSION| if required.
This guide covers upgrading Scylla on Red Hat Enterprise Linux (RHEL) 7/8, CentOS 7/8, Debian 10 and Ubuntu 20.04. It also applies when using ScyllaDB official image on EC2, GCP, or Azure; the image is based on Ubuntu 20.04.
See :doc:`OS Support by Platform and Version </getting-started/os-support>` for information about supported versions.
Upgrade Procedure
=================
A ScyllaDB upgrade is a rolling procedure which does **not** require full cluster shutdown.
For each of the nodes in the cluster, serially (i.e. one node at a time), you will:
* Check that the cluster's schema is synchronized
* Drain the node and backup the data
* Backup the configuration file
* Stop ScyllaDB
* Download and install new ScyllaDB packages
* Start ScyllaDB
* Validate that the upgrade was successful
Apply the following procedure **serially** on each node. Do not move to the next node before validating that the node you upgraded is up and running the new version.
**During** the rolling upgrade, it is highly recommended:
* Not to use the new |NEW_VERSION| features
* Not to run administration functions, like repairs, refresh, rebuild or add or remove nodes. See `sctool <https://manager.docs.scylladb.com/stable/sctool/>`_ for suspending ScyllaDB Manager (only available for ScyllaDB Enterprise) scheduled or running repairs.
* Not to apply schema changes
.. note:: Before upgrading, make sure to use the latest `ScyllaDB Monitoring <https://monitoring.docs.scylladb.com/>`_ stack.
Upgrade Steps
=============
Check the cluster schema
-------------------------
Make sure that all nodes have the schema synchronized before upgrade. The upgrade procedure will fail if there is a schema disagreement between nodes.
.. code:: sh
nodetool describecluster
Drain the nodes and backup the data
-----------------------------------
Before any major procedure, like an upgrade, it is recommended to backup all the data to an external device. In Scylla, backup is done using the ``nodetool snapshot`` command. For **each** node in the cluster, run the following command:
.. code:: sh
nodetool drain
nodetool snapshot
Take note of the directory name that nodetool gives you, and copy all the directories having that name under ``/var/lib/scylla`` to a backup device.
When the upgrade is completed on all nodes, remove the snapshot with the ``nodetool clearsnapshot -t <snapshot>`` command to prevent running out of space.
Backup the configuration file
------------------------------
.. code:: sh
sudo cp -a /etc/scylla/scylla.yaml /etc/scylla/scylla.yaml.backup-src
Gracefully stop the node
------------------------
.. code:: sh
sudo service scylla-server stop
Download and install the new release
------------------------------------
.. tabs::
.. group-tab:: Debian/Ubuntu
Before upgrading, check what version you are running now using ``dpkg -s scylla-server``. You should use the same version as this version in case you want to |ROLLBACK|_ the upgrade. If you are not running a |SRC_VERSION|.x version, stop right here! This guide only covers |SRC_VERSION|.x to |NEW_VERSION|.y upgrades.
**To upgrade ScyllaDB:**
#. Update the |SCYLLA_DEB_NEW_REPO| to |NEW_VERSION|.
#. Install the new ScyllaDB version:
.. code-block:: console
sudo apt-get clean all
sudo apt-get update
sudo apt-get dist-upgrade scylla
Answer y to the first two questions.
.. group-tab:: RHEL/CentOS
Before upgrading, check what version you are running now using ``rpm -qa | grep scylla-server``. You should use the same version as this version in case you want to |ROLLBACK|_ the upgrade. If you are not running a |SRC_VERSION|.x version, stop right here! This guide only covers |SRC_VERSION|.x to |NEW_VERSION|.y upgrades.
**To upgrade ScyllaDB:**
#. Update the |SCYLLA_RPM_NEW_REPO|_ to |NEW_VERSION|.
#. Install the new ScyllaDB version:
.. code:: sh
sudo yum clean all
sudo yum update scylla\* -y
.. group-tab:: EC2/GCP/Azure Ubuntu Image
Before upgrading, check what version you are running now using ``dpkg -s scylla-server``. You should use the same version as this version in case you want to |ROLLBACK|_ the upgrade. If you are not running a |SRC_VERSION|.x version, stop right here! This guide only covers |SRC_VERSION|.x to |NEW_VERSION|.y upgrades.
There are two alternative upgrade procedures:
* :ref:`Upgrading ScyllaDB and simultaneously updating 3rd party and OS packages <upgrade-image-recommended-procedure>`. It is recommended if you are running a ScyllaDB official image (EC2 AMI, GCP, and Azure images), which is based on Ubuntu 20.04.
* :ref:`Upgrading ScyllaDB without updating any external packages <upgrade-image-upgrade-guide-regular-procedure>`.
.. _upgrade-image-recommended-procedure:
**To upgrade ScyllaDB and update 3rd party and OS packages (RECOMMENDED):**
Choosing this upgrade procedure allows you to upgrade your ScyllaDB version and update the 3rd party and OS packages using one command.
#. Update the |SCYLLA_DEB_NEW_REPO| to |NEW_VERSION|.
#. Load the new repo:
.. code:: sh
sudo apt-get update
#. Run the following command to update the manifest file:
.. code:: sh
cat scylla-packages-<version>-<arch>.txt | sudo xargs -n1 apt-get install -y
Where:
* ``<version>`` - The ScyllaDB version to which you are upgrading ( |NEW_VERSION| ).
* ``<arch>`` - Architecture type: ``x86_64`` or ``aarch64``.
The file is included in the ScyllaDB packages downloaded in the previous step. The file location is ``http://downloads.scylladb.com/downloads/scylla/aws/manifest/scylla-packages-<version>-<arch>.txt``
Example:
.. code:: sh
cat scylla-packages-5.1.2-x86_64.txt | sudo xargs -n1 apt-get install -y
.. note::
Alternatively, you can update the manifest file with the following command:
``sudo apt-get install $(awk '{print $1'} scylla-packages-<version>-<arch>.txt) -y``
.. _upgrade-image-upgrade-guide-regular-procedure:
**To upgrade ScyllaDB:**
#. Update the |SCYLLA_DEB_NEW_REPO| to |NEW_VERSION|.
#. Install the new ScyllaDB version:
.. code-block:: console
sudo apt-get clean all
sudo apt-get update
sudo apt-get dist-upgrade scylla
Answer y to the first two questions.
Start the node
--------------
.. code:: sh
sudo service scylla-server start
Validate
--------
#. Check cluster status with ``nodetool status`` and make sure **all** nodes, including the one you just upgraded, are in ``UN`` status.
#. Use ``curl -X GET "http://localhost:10000/storage_service/scylla_release_version"`` to check the ScyllaDB version. Validate that the version matches the one you upgraded to.
#. Check scylla-server log (by ``journalctl _COMM=scylla``) and ``/var/log/syslog`` to validate there are no new errors in the log.
#. Check again after two minutes, to validate no new issues are introduced.
Once you are sure the node upgrade was successful, move to the next node in the cluster.
See |SCYLLA_METRICS|_ for more information.
Rollback Procedure
==================
.. include:: /upgrade/_common/warning_rollback.rst
The following procedure describes a rollback from |SCYLLA_NAME| |NEW_VERSION|.x to |SRC_VERSION|.y. Apply this procedure if an upgrade from |SRC_VERSION| to |NEW_VERSION| failed before completing on all nodes. Use this procedure only for nodes you upgraded to |NEW_VERSION|.
ScyllaDB rollback is a rolling procedure which does **not** require full cluster shutdown.
For each of the nodes you rollback to |SRC_VERSION|, serially (i.e. one node at a time), you will:
* Drain the node and stop Scylla
* Retrieve the old ScyllaDB packages
* Restore the configuration file
* Restore system tables
* Reload systemd configuration
* Restart ScyllaDB
* Validate the rollback success
Apply the following procedure **serially** on each node. Do not move to the next node before validating that the rollback was successful and the node is up and running the old version.
Rollback Steps
==============
Drain and gracefully stop the node
----------------------------------
.. code:: sh
nodetool drain
sudo service scylla-server stop
Download and install the old release
------------------------------------
..
TODO: downgrade for 3rd party packages in EC2/GCP/Azure - like in the upgrade section?
.. tabs::
.. group-tab:: Debian/Ubuntu
#. Remove the old repo file.
.. code:: sh
sudo rm -rf /etc/apt/sources.list.d/scylla.list
#. Update the |SCYLLA_DEB_SRC_REPO| to |SRC_VERSION|.
#. Install:
.. code-block::
sudo apt-get update
sudo apt-get remove scylla\* -y
sudo apt-get install scylla
Answer y to the first two questions.
.. group-tab:: RHEL/CentOS
#. Remove the old repo file.
.. code:: sh
sudo rm -rf /etc/yum.repos.d/scylla.repo
#. Update the |SCYLLA_RPM_SRC_REPO|_ to |SRC_VERSION|.
#. Install:
.. code:: console
sudo yum clean all
sudo rm -rf /var/cache/yum
sudo yum remove scylla\\*tools-core
sudo yum downgrade scylla\\* -y
sudo yum install scylla
.. group-tab:: EC2/GCP/Azure Ubuntu Image
#. Remove the old repo file.
.. code:: sh
sudo rm -rf /etc/apt/sources.list.d/scylla.list
#. Update the |SCYLLA_DEB_SRC_REPO| to |SRC_VERSION|.
#. Install:
.. code-block::
sudo apt-get update
sudo apt-get remove scylla\* -y
sudo apt-get install scylla
Answer y to the first two questions.
Restore the configuration file
------------------------------
.. code:: sh
sudo rm -rf /etc/scylla/scylla.yaml
sudo cp -a /etc/scylla/scylla.yaml.backup-src /etc/scylla/scylla.yaml
Restore system tables
---------------------
Restore all tables of **system** and **system_schema** from the previous snapshot because |NEW_VERSION| uses a different set of system tables. See :doc:`Restore from a Backup and Incremental Backup </operating-scylla/procedures/backup-restore/restore/>` for reference.
.. code:: sh
cd /var/lib/scylla/data/keyspace_name/table_name-UUID/snapshots/<snapshot_name>/
sudo cp -r * /var/lib/scylla/data/keyspace_name/table_name-UUID/
sudo chown -R scylla:scylla /var/lib/scylla/data/keyspace_name/table_name-UUID/
Reload systemd configuration
----------------------------
You must reload the unit file if the systemd unit file is changed.
.. code:: sh
sudo systemctl daemon-reload
Start the node
--------------
.. code:: sh
sudo service scylla-server start
Validate
--------
Check the upgrade instructions above for validation. Once you are sure the node rollback is successful, move to the next node in the cluster.

View File

@@ -0,0 +1,16 @@
.. |OS| replace:: EC2, GCP, and Azure
.. |ROLLBACK| replace:: rollback
.. _ROLLBACK: ./#rollback-procedure
.. |SRC_VERSION| replace:: 5.0
.. |NEW_VERSION| replace:: 5.1
.. |SCYLLA_NAME| replace:: ScyllaDB Image
.. |PKG_NAME| replace:: scylla
.. |APT| replace:: ScyllaDB deb repo
.. _APT: https://www.scylladb.com/download/?platform=ubuntu-20.04&version=scylla-5.1
.. |SCYLLA_REPO| replace:: ScyllaDB deb repo
.. _SCYLLA_REPO: https://www.scylladb.com/download/?platform=ubuntu-20.04&version=scylla-5.1
.. |SCYLLA_METRICS| replace:: ScyllaDB Metrics Update - Scylla 5.0 to 5.1
.. _SCYLLA_METRICS: ../metric-update-5.0-to-5.1
.. include:: /upgrade/_common/upgrade-guide-v5-ubuntu-and-debian-p1.rst
.. include:: /upgrade/_common/upgrade-image-opensource.rst
.. include:: /upgrade/_common/upgrade-guide-v5-ubuntu-and-debian-p2.rst

View File

@@ -0,0 +1,10 @@
.. |OS| replace:: Red Hat Enterprise Linux and CentOS
.. |SRC_VERSION| replace:: 5.0
.. |NEW_VERSION| replace:: 5.1
.. |SCYLLA_NAME| replace:: ScyllaDB
.. |PKG_NAME| replace:: scylla
.. |SCYLLA_REPO| replace:: ScyllaDB rpm repo
.. _SCYLLA_REPO: https://www.scylladb.com/download/?platform=centos&version=scylla-5.1
.. |SCYLLA_METRICS| replace:: Scylla Metrics Update - Scylla 5.0 to 5.1
.. _SCYLLA_METRICS: ../metric-update-5.0-to-5.1
.. include:: /upgrade/_common/upgrade-guide-v4-rpm.rst

View File

@@ -0,0 +1,13 @@
.. |OS| replace:: Ubuntu
.. |ROLLBACK| replace:: rollback
.. _ROLLBACK: ./#rollback-procedure
.. |SRC_VERSION| replace:: 5.0
.. |NEW_VERSION| replace:: 5.1
.. |SCYLLA_NAME| replace:: ScyllaDB
.. |PKG_NAME| replace:: scylla
.. |SCYLLA_REPO| replace:: ScyllaDB deb repo
.. _SCYLLA_REPO: https://www.scylladb.com/download/?platform=ubuntu-20.04&version=scylla-5.1
.. |SCYLLA_METRICS| replace:: ScyllaDB Metrics Update - Scylla 5.0 to 5.1
.. _SCYLLA_METRICS: ../metric-update-5.0-to-5.1
.. |UPGRADE_NOTES| replace:: _
.. include:: /upgrade/_common/upgrade-guide-v5-ubuntu-and-debian.rst

View File

@@ -1,21 +0,0 @@
====================================
Upgrade Guide - ScyllaDB 5.1 to 5.2
====================================
.. toctree::
:maxdepth: 2
:hidden:
ScyllaDB <upgrade-guide-from-5.1-to-5.2-generic>
Metrics <metric-update-5.1-to-5.2>
.. panel-box::
:title: Upgrade Scylla
:id: "getting-started"
:class: my-panel
Upgrade guides are available for:
* :doc:`Upgrade ScyllaDB from 5.1.x to 5.2.y <upgrade-guide-from-5.1-to-5.2-generic>`
* :doc:`ScyllaDB Metrics Update - Scylla 5.1 to 5.2 <metric-update-5.1-to-5.2>`

View File

@@ -1,20 +0,0 @@
Scylla Metrics Update - Scylla 5.1 to 5.2
=========================================
.. toctree::
:maxdepth: 2
:hidden:
Scylla 5.2 Dashboards are available as part of the latest |mon_root|.
The following metrics are new in Scylla 5.2
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. list-table::
:widths: 25 150
:header-rows: 1
* - Metric
- Description
* - TODO
- TODO

View File

@@ -1,411 +0,0 @@
.. |SCYLLA_NAME| replace:: ScyllaDB
.. |SRC_VERSION| replace:: 5.1
.. |NEW_VERSION| replace:: 5.2
.. |DEBIAN_SRC_REPO| replace:: Debian
.. _DEBIAN_SRC_REPO: https://www.scylladb.com/download/?platform=debian-10&version=scylla-5.1
.. |UBUNTU_SRC_REPO| replace:: Ubuntu
.. _UBUNTU_SRC_REPO: https://www.scylladb.com/download/?platform=ubuntu-20.04&version=scylla-5.1
.. |SCYLLA_DEB_SRC_REPO| replace:: ScyllaDB deb repo (|DEBIAN_SRC_REPO|_, |UBUNTU_SRC_REPO|_)
.. |SCYLLA_RPM_SRC_REPO| replace:: ScyllaDB rpm repo
.. _SCYLLA_RPM_SRC_REPO: https://www.scylladb.com/download/?platform=centos&version=scylla-5.1
.. |DEBIAN_NEW_REPO| replace:: Debian
.. _DEBIAN_NEW_REPO: https://www.scylladb.com/download/?platform=debian-10&version=scylla-5.2
.. |UBUNTU_NEW_REPO| replace:: Ubuntu
.. _UBUNTU_NEW_REPO: https://www.scylladb.com/download/?platform=ubuntu-20.04&version=scylla-5.2
.. |SCYLLA_DEB_NEW_REPO| replace:: ScyllaDB deb repo (|DEBIAN_NEW_REPO|_, |UBUNTU_NEW_REPO|_)
.. |SCYLLA_RPM_NEW_REPO| replace:: ScyllaDB rpm repo
.. _SCYLLA_RPM_NEW_REPO: https://www.scylladb.com/download/?platform=centos&version=scylla-5.2
.. |ROLLBACK| replace:: rollback
.. _ROLLBACK: ./#rollback-procedure
.. |SCYLLA_METRICS| replace:: Scylla Metrics Update - Scylla 5.1 to 5.2
.. _SCYLLA_METRICS: ../metric-update-5.1-to-5.2
=============================================================================
Upgrade Guide - |SCYLLA_NAME| |SRC_VERSION| to |NEW_VERSION|
=============================================================================
This document is a step-by-step procedure for upgrading from |SCYLLA_NAME| |SRC_VERSION| to |SCYLLA_NAME| |NEW_VERSION|, and rollback to version |SRC_VERSION| if required.
This guide covers upgrading Scylla on Red Hat Enterprise Linux (RHEL) 7/8, CentOS 7/8, Debian 10 and Ubuntu 20.04. It also applies when using the ScyllaDB official image on EC2, GCP, or Azure; the image is based on Ubuntu 20.04.
See :doc:`OS Support by Platform and Version </getting-started/os-support>` for information about supported versions.
Upgrade Procedure
=================
A ScyllaDB upgrade is a rolling procedure which does **not** require full cluster shutdown.
For each of the nodes in the cluster, serially (i.e. one node at a time), you will:
* Check that the cluster's schema is synchronized
* Drain the node and backup the data
* Backup the configuration file
* Stop ScyllaDB
* Download and install new ScyllaDB packages
* (Optional) Enable consistent cluster management in the configuration file
* Start ScyllaDB
* Validate that the upgrade was successful
Apply the following procedure **serially** on each node. Do not move to the next node before validating that the node you upgraded is up and running the new version.
**During** the rolling upgrade, it is highly recommended:
* Not to use the new |NEW_VERSION| features
* Not to run administration functions, like repairs, refresh, rebuild or add or remove nodes. See `sctool <https://manager.docs.scylladb.com/stable/sctool/>`_ for suspending ScyllaDB Manager (only available for ScyllaDB Enterprise) scheduled or running repairs.
* Not to apply schema changes
If you enabled consistent cluster management in each node's configuration file, then as soon as every node has been upgraded to the new version, the cluster will start a procedure which initializes the Raft algorithm for consistent cluster metadata management.
You must then :ref:`verify <validate-raft-setup>` that this procedure successfully finishes.
.. note:: Before upgrading, make sure to use the latest `ScyllaDB Monitoring <https://monitoring.docs.scylladb.com/>`_ stack.
Upgrade Steps
=============
Check the cluster schema
-------------------------
Make sure that all nodes have the schema synchronized before upgrade. The upgrade procedure will fail if there is a schema disagreement between nodes.
.. code:: sh
nodetool describecluster
Drain the nodes and backup the data
-----------------------------------
Before any major procedure, like an upgrade, it is recommended to backup all the data to an external device. In Scylla, backup is done using the ``nodetool snapshot`` command. For **each** node in the cluster, run the following command:
.. code:: sh
nodetool drain
nodetool snapshot
Take note of the directory name that nodetool gives you, and copy all the directories having that name under ``/var/lib/scylla`` to a backup device.
When the upgrade is completed on all nodes, remove the snapshot with the ``nodetool clearsnapshot -t <snapshot>`` command to prevent running out of space.
Backup the configuration file
------------------------------
.. code:: sh
sudo cp -a /etc/scylla/scylla.yaml /etc/scylla/scylla.yaml.backup-src
Gracefully stop the node
------------------------
.. code:: sh
sudo service scylla-server stop
Download and install the new release
------------------------------------
.. tabs::
.. group-tab:: Debian/Ubuntu
Before upgrading, check what version you are running now using ``dpkg -s scylla-server``. You should use the same version as this version in case you want to |ROLLBACK|_ the upgrade. If you are not running a |SRC_VERSION|.x version, stop right here! This guide only covers |SRC_VERSION|.x to |NEW_VERSION|.y upgrades.
**To upgrade ScyllaDB:**
#. Update the |SCYLLA_DEB_NEW_REPO| to |NEW_VERSION|.
#. Install the new ScyllaDB version:
.. code-block:: console
sudo apt-get clean all
sudo apt-get update
sudo apt-get dist-upgrade scylla
Answer y to the first two questions.
.. group-tab:: RHEL/CentOS
Before upgrading, check what version you are running now using ``rpm -qa | grep scylla-server``. You should use the same version as this version in case you want to |ROLLBACK|_ the upgrade. If you are not running a |SRC_VERSION|.x version, stop right here! This guide only covers |SRC_VERSION|.x to |NEW_VERSION|.y upgrades.
**To upgrade ScyllaDB:**
#. Update the |SCYLLA_RPM_NEW_REPO|_ to |NEW_VERSION|.
#. Install the new ScyllaDB version:
.. code:: sh
sudo yum clean all
sudo yum update scylla\* -y
.. group-tab:: EC2/GCP/Azure Ubuntu Image
Before upgrading, check what version you are running now using ``dpkg -s scylla-server``. You should use the same version as this version in case you want to |ROLLBACK|_ the upgrade. If you are not running a |SRC_VERSION|.x version, stop right here! This guide only covers |SRC_VERSION|.x to |NEW_VERSION|.y upgrades.
There are two alternative upgrade procedures:
* :ref:`Upgrading ScyllaDB and simultaneously updating 3rd party and OS packages <upgrade-image-recommended-procedure>`. It is recommended if you are running a ScyllaDB official image (EC2 AMI, GCP, and Azure images), which is based on Ubuntu 20.04.
* :ref:`Upgrading ScyllaDB without updating any external packages <upgrade-image-upgrade-guide-regular-procedure>`.
.. _upgrade-image-recommended-procedure:
**To upgrade ScyllaDB and update 3rd party and OS packages (RECOMMENDED):**
Choosing this upgrade procedure allows you to upgrade your ScyllaDB version and update the 3rd party and OS packages using one command.
#. Update the |SCYLLA_DEB_NEW_REPO| to |NEW_VERSION|.
#. Load the new repo:
.. code:: sh
sudo apt-get update
#. Run the following command to update the manifest file:
.. code:: sh
cat scylla-packages-<version>-<arch>.txt | sudo xargs -n1 apt-get install -y
Where:
* ``<version>`` - The ScyllaDB version to which you are upgrading ( |NEW_VERSION| ).
* ``<arch>`` - Architecture type: ``x86_64`` or ``aarch64``.
The file is included in the ScyllaDB packages downloaded in the previous step. The file location is ``http://downloads.scylladb.com/downloads/scylla/aws/manifest/scylla-packages-<version>-<arch>.txt``
Example:
.. code:: sh
cat scylla-packages-5.2.0-x86_64.txt | sudo xargs -n1 apt-get install -y
.. note::
Alternatively, you can update the manifest file with the following command:
``sudo apt-get install $(awk '{print $1}' scylla-packages-<version>-<arch>.txt) -y``
.. _upgrade-image-upgrade-guide-regular-procedure:
**To upgrade ScyllaDB:**
#. Update the |SCYLLA_DEB_NEW_REPO| to |NEW_VERSION|.
#. Install the new ScyllaDB version:
.. code-block:: console
sudo apt-get clean all
sudo apt-get update
sudo apt-get dist-upgrade scylla
Answer y to the first two questions.
(Optional) Enable consistent cluster management in the node's configuration file
--------------------------------------------------------------------------------
If you enable this option on every node, this will cause the Scylla cluster to enable Raft and use it to consistently manage cluster-wide metadata as soon as you finish upgrading every node to the new version.
Check the :doc:`Raft in ScyllaDB document </architecture/raft/>` to learn more.
.. TODO: include enterprise versions
In 5.2, Raft-based consistent cluster management is disabled by default.
In 5.3 it will be enabled by default, but you'll be able to disable it explicitly during upgrade if needed (assuming you haven't previously enabled it on every node).
In further versions the option will be removed and consistent cluster management will be enabled unconditionally.
The option can also be enabled after the cluster is upgraded to |NEW_VERSION| (see :ref:`Enabling Raft in existing cluster <enabling-raft-existing-cluster>`).
To enable the option, modify the ``scylla.yaml`` configuration file in ``/etc/scylla/`` and add the following:
.. code:: yaml
consistent_cluster_management: true
.. note:: Once you finish upgrading every node with `consistent_cluster_management` enabled, it won't be possible to turn the option back off.
Start the node
--------------
.. code:: sh
sudo service scylla-server start
Validate
--------
#. Check cluster status with ``nodetool status`` and make sure **all** nodes, including the one you just upgraded, are in ``UN`` status.
#. Use ``curl -X GET "http://localhost:10000/storage_service/scylla_release_version"`` to check the ScyllaDB version. Validate that the version matches the one you upgraded to.
#. Check scylla-server log (by ``journalctl _COMM=scylla``) and ``/var/log/syslog`` to validate there are no new errors in the log.
#. Check again after two minutes, to validate no new issues are introduced.
Once you are sure the node upgrade was successful, move to the next node in the cluster.
See |SCYLLA_METRICS|_ for more information.
After upgrading every node
==========================
The following section applies only if you enabled the ``consistent_cluster_management`` option on every node when upgrading the cluster.
.. _validate-raft-setup:
Validate Raft setup
-------------------
Enabling ``consistent_cluster_management`` on every node during upgrade will cause the Scylla cluster to start an additional internal procedure as soon as every node is upgraded to the new version.
The goal of this procedure is to initialize data structures used by the Raft algorithm to consistently manage cluster-wide metadata such as table schemas.
Assuming you performed the rolling upgrade procedure correctly, in particular ensuring that schema is synchronized on every step, and if there are no problems with cluster connectivity, then this follow-up internal procedure should take no longer than a few seconds to finish.
However, the procedure requires **full cluster availability**. If an unlucky accident (e.g. a hardware problem) causes one of your nodes to fail before this procedure finishes, the procedure may get stuck. This may cause the cluster to end up in a state where schema change operations are unavailable.
Therefore, following the rolling upgrade, **you must verify** that this internal procedure has finished successfully by checking the logs of every Scylla node.
If the procedure gets stuck, manual intervention is required.
Refer to the following document for instructions on how to verify that the procedure was successful and how to proceed if it gets stuck: :ref:`Verifying that the internal Raft upgrade procedure finished successfully <verify-raft-procedure>`.
Rollback Procedure
==================
.. include:: /upgrade/_common/warning_rollback.rst
The following procedure describes a rollback from |SCYLLA_NAME| |NEW_VERSION|.x to |SRC_VERSION|.y. Apply this procedure if an upgrade from |SRC_VERSION| to |NEW_VERSION| failed before completing on all nodes. Use this procedure only for nodes you upgraded to |NEW_VERSION|.
.. warning::
The rollback procedure can be applied **only** if some nodes have not been upgraded to |NEW_VERSION| yet.
As soon as the last node in the rolling upgrade procedure is started with |NEW_VERSION|, rollback becomes impossible.
At that point, the only way to restore a cluster to |SRC_VERSION| is by restoring it from backup.
ScyllaDB rollback is a rolling procedure which does **not** require full cluster shutdown.
For each of the nodes you rollback to |SRC_VERSION|, serially (i.e. one node at a time), you will:
* Drain the node and stop Scylla
* Retrieve the old ScyllaDB packages
* Restore the configuration file
* Restore system tables
* Reload systemd configuration
* Restart ScyllaDB
* Validate the rollback success
Apply the following procedure **serially** on each node. Do not move to the next node before validating that the rollback was successful and the node is up and running the old version.
Rollback Steps
==============
Drain and gracefully stop the node
----------------------------------
.. code:: sh
nodetool drain
sudo service scylla-server stop
Download and install the old release
------------------------------------
..
TODO: downgrade for 3rd party packages in EC2/GCP/Azure - like in the upgrade section?
.. tabs::
.. group-tab:: Debian/Ubuntu
#. Remove the old repo file.
.. code:: sh
sudo rm -rf /etc/apt/sources.list.d/scylla.list
#. Update the |SCYLLA_DEB_SRC_REPO| to |SRC_VERSION|.
#. Install:
.. code-block::
sudo apt-get update
sudo apt-get remove scylla\* -y
sudo apt-get install scylla
Answer y to the first two questions.
.. group-tab:: RHEL/CentOS
#. Remove the old repo file.
.. code:: sh
sudo rm -rf /etc/yum.repos.d/scylla.repo
#. Update the |SCYLLA_RPM_SRC_REPO|_ to |SRC_VERSION|.
#. Install:
.. code:: console
sudo yum clean all
sudo rm -rf /var/cache/yum
sudo yum remove scylla\*tools-core
sudo yum downgrade scylla\* -y
sudo yum install scylla
.. group-tab:: EC2/GCP/Azure Ubuntu Image
#. Remove the old repo file.
.. code:: sh
sudo rm -rf /etc/apt/sources.list.d/scylla.list
#. Update the |SCYLLA_DEB_SRC_REPO| to |SRC_VERSION|.
#. Install:
.. code-block::
sudo apt-get update
sudo apt-get remove scylla\* -y
sudo apt-get install scylla
Answer y to the first two questions.
Restore the configuration file
------------------------------
.. code:: sh
sudo rm -rf /etc/scylla/scylla.yaml
sudo cp -a /etc/scylla/scylla.yaml.backup-src /etc/scylla/scylla.yaml
Restore system tables
---------------------
Restore all tables of **system** and **system_schema** from the previous snapshot because |NEW_VERSION| uses a different set of system tables. See :doc:`Restore from a Backup and Incremental Backup </operating-scylla/procedures/backup-restore/restore/>` for reference.
.. code:: sh
cd /var/lib/scylla/data/keyspace_name/table_name-UUID/snapshots/<snapshot_name>/
sudo cp -r * /var/lib/scylla/data/keyspace_name/table_name-UUID/
sudo chown -R scylla:scylla /var/lib/scylla/data/keyspace_name/table_name-UUID/
Reload systemd configuration
----------------------------
You must reload the systemd configuration if the unit file has changed.
.. code:: sh
sudo systemctl daemon-reload
Start the node
--------------
.. code:: sh
sudo service scylla-server start
Validate
--------
Check the upgrade instructions above for validation. Once you are sure the node rollback is successful, move to the next node in the cluster.

View File

@@ -6,6 +6,7 @@ Scylla Enterprise Features
:maxdepth: 2
:hidden:
Lightweight Transactions </using-scylla/lwt/>
Workload Prioritization </using-scylla/workload-prioritization/>
In-memory tables </using-scylla/in-memory/>
Global Secondary Indexes </using-scylla/secondary-indexes/>

View File

@@ -2,9 +2,10 @@
Scylla in-memory tables
=========================
:label-tip:`ScyllaDB Enterprise`
.. versionadded:: 2018.1.7
.. versionadded:: 2018.1.7 Scylla Enterprise
.. include:: /rst_include/enterprise-only-note.rst
Overview
========

View File

@@ -2,7 +2,9 @@
Workload Prioritization
========================
:label-tip:`ScyllaDB Enterprise`
.. include:: /rst_include/enterprise-only-note.rst
In a typical database there are numerous workloads running at the same time.
Each workload type dictates a different acceptable level of latency and throughput.

View File

@@ -25,7 +25,6 @@ static const std::map<application_state, sstring> application_state_names = {
{application_state::REMOVAL_COORDINATOR, "REMOVAL_COORDINATOR"},
{application_state::INTERNAL_IP, "INTERNAL_IP"},
{application_state::RPC_ADDRESS, "RPC_ADDRESS"},
{application_state::RAFT_SERVER_ID, "RAFT_SERVER_ID"},
{application_state::SEVERITY, "SEVERITY"},
{application_state::NET_VERSION, "NET_VERSION"},
{application_state::HOST_ID, "HOST_ID"},

View File

@@ -38,11 +38,8 @@ enum class application_state {
IGNORE_MSB_BITS,
CDC_GENERATION_ID,
SNITCH_NAME,
// RAFT ID is a server identifier which is maintained
// and gossiped in addition to HOST_ID because it's truly
// unique: any new node gets a new RAFT ID, while may keep
// its existing HOST ID, e.g. if it's replacing an existing node.
RAFT_SERVER_ID,
// pad to allow adding new states to existing cluster
X10,
};
std::ostream& operator<<(std::ostream& os, const application_state& m);

View File

@@ -60,6 +60,9 @@ feature_config feature_config_from_db_config(db::config& cfg, std::set<sstring>
if (!cfg.check_experimental(db::experimental_features_t::feature::ALTERNATOR_STREAMS)) {
fcfg._disabled_features.insert("ALTERNATOR_STREAMS"s);
}
if (!cfg.check_experimental(db::experimental_features_t::feature::ALTERNATOR_TTL)) {
fcfg._disabled_features.insert("ALTERNATOR_TTL"s);
}
if (!cfg.check_experimental(db::experimental_features_t::feature::RAFT)) {
fcfg._disabled_features.insert("SUPPORTS_RAFT_CLUSTER_MANAGEMENT"s);
}

View File

@@ -100,7 +100,7 @@ gossiper::gossiper(abort_source& as, feature_service& features, const locator::s
, _failure_detector_timeout_ms(cfg.failure_detector_timeout_in_ms)
, _force_gossip_generation(cfg.force_gossip_generation)
, _gcfg(std::move(gcfg))
, _echo_pinger(*this) {
, _direct_fd_pinger(*this) {
// Gossiper's stuff below runs only on CPU0
if (this_shard_id() != 0) {
return;
@@ -726,7 +726,7 @@ future<> gossiper::do_status_check() {
// check for dead state removal
auto expire_time = get_expire_time_for_endpoint(endpoint);
if (!is_alive && (now > expire_time)
&& (!get_token_metadata_ptr()->is_normal_token_owner(endpoint))) {
&& (!get_token_metadata_ptr()->is_member(endpoint))) {
logger.debug("time is expiring for endpoint : {} ({})", endpoint, expire_time.time_since_epoch().count());
co_await evict_from_membership(endpoint);
}
@@ -970,7 +970,7 @@ void gossiper::run() {
}).get();
}
_echo_pinger.update_generation_number(_endpoint_state_map[get_broadcast_address()].get_heart_beat_state().get_generation()).get();
_direct_fd_pinger.update_generation_number(_endpoint_state_map[get_broadcast_address()].get_heart_beat_state().get_generation()).get();
}).then_wrapped([this] (auto&& f) {
try {
f.get();
@@ -1020,10 +1020,10 @@ std::set<inet_address> gossiper::get_live_members() const {
std::set<inet_address> gossiper::get_live_token_owners() const {
std::set<inet_address> token_owners;
auto normal_token_owners = get_token_metadata_ptr()->get_all_endpoints();
for (auto& node: normal_token_owners) {
if (is_alive(node)) {
token_owners.insert(node);
for (auto& member : get_live_members()) {
auto es = get_endpoint_state_for_endpoint_ptr(member);
if (es && !is_dead_state(*es) && get_token_metadata_ptr()->is_member(member)) {
token_owners.insert(member);
}
}
return token_owners;
@@ -1031,10 +1031,10 @@ std::set<inet_address> gossiper::get_live_token_owners() const {
std::set<inet_address> gossiper::get_unreachable_token_owners() const {
std::set<inet_address> token_owners;
auto normal_token_owners = get_token_metadata_ptr()->get_all_endpoints();
for (auto& node: normal_token_owners) {
if (!is_alive(node)) {
token_owners.insert(node);
for (auto&& x : _unreachable_endpoints) {
auto& endpoint = x.first;
if (get_token_metadata_ptr()->is_member(endpoint)) {
token_owners.insert(endpoint);
}
}
return token_owners;
@@ -1300,7 +1300,7 @@ bool gossiper::is_gossip_only_member(inet_address endpoint) {
if (!es) {
return false;
}
return !is_dead_state(*es) && !get_token_metadata_ptr()->is_normal_token_owner(endpoint);
return !is_dead_state(*es) && !get_token_metadata_ptr()->is_member(endpoint);
}
clk::time_point gossiper::get_expire_time_for_endpoint(inet_address endpoint) const noexcept {
@@ -1852,7 +1852,7 @@ future<> gossiper::start_gossiping(int generation_nbr, std::map<application_stat
co_await container().invoke_on_all([] (gms::gossiper& g) {
g._failure_detector_loop_done = g.failure_detector_loop();
});
co_await _echo_pinger.update_generation_number(generation_nbr);
co_await _direct_fd_pinger.update_generation_number(generation_nbr);
}
future<std::unordered_map<gms::inet_address, int32_t>>
@@ -2538,18 +2538,68 @@ locator::token_metadata_ptr gossiper::get_token_metadata_ptr() const noexcept {
return _shared_token_metadata.get();
}
future<> echo_pinger::update_generation_number(int64_t n) {
future<> gossiper::direct_fd_pinger::update_generation_number(int64_t n) {
if (n <= _generation_number) {
return make_ready_future<>();
}
return _gossiper.container().invoke_on_all([n] (gossiper& g) {
g._echo_pinger._generation_number = n;
g._direct_fd_pinger._generation_number = n;
});
}
future<> echo_pinger::ping(const gms::inet_address& addr, abort_source& as) {
return _gossiper._messaging.send_gossip_echo(netw::msg_addr(addr), _generation_number, as);
direct_failure_detector::pinger::endpoint_id gossiper::direct_fd_pinger::allocate_id(gms::inet_address addr) {
assert(this_shard_id() == 0);
auto it = _addr_to_id.find(addr);
if (it == _addr_to_id.end()) {
auto id = _next_allocated_id++;
_id_to_addr.emplace(id, addr);
it = _addr_to_id.emplace(addr, id).first;
logger.debug("gossiper::direct_fd_pinger: assigned endpoint ID {} to address {}", id, addr);
}
return it->second;
}
future<gms::inet_address> gossiper::direct_fd_pinger::get_address(direct_failure_detector::pinger::endpoint_id id) {
auto it = _id_to_addr.find(id);
if (it == _id_to_addr.end()) {
// Fetch the address from shard 0. By precondition it must be there.
auto addr = co_await _gossiper.container().invoke_on(0, [id] (gossiper& g) {
auto it = g._direct_fd_pinger._id_to_addr.find(id);
if (it == g._direct_fd_pinger._id_to_addr.end()) {
on_internal_error(logger, format("gossiper::direct_fd_pinger: endpoint id {} has no corresponding address", id));
}
return it->second;
});
it = _id_to_addr.emplace(id, addr).first;
}
co_return it->second;
}
future<bool> gossiper::direct_fd_pinger::ping(direct_failure_detector::pinger::endpoint_id id, abort_source& as) {
try {
co_await _gossiper._messaging.send_gossip_echo(netw::msg_addr(co_await get_address(id)), _generation_number, as);
} catch (seastar::rpc::closed_error&) {
co_return false;
}
co_return true;
}
} // namespace gms
direct_failure_detector::clock::timepoint_t direct_fd_clock::now() noexcept {
return base::now().time_since_epoch().count();
}
future<> direct_fd_clock::sleep_until(direct_failure_detector::clock::timepoint_t tp, abort_source& as) {
auto t = base::time_point{base::duration{tp}};
auto n = base::now();
if (t <= n) {
return make_ready_future<>();
}
return sleep_abortable(t - n, as);
}

View File

@@ -82,23 +82,6 @@ struct gossip_config {
uint32_t skip_wait_for_gossip_to_settle = -1;
};
class gossiper;
// Caches the gossiper's generation number, which is required for sending gossip echo messages.
// Call `ping` to send a gossip echo message to the given address using the last known generation number.
// The generation number is updated by gossiper's loop and replicated to every shard.
class echo_pinger {
friend class gossiper;
gossiper& _gossiper;
int64_t _generation_number{0};
future<> update_generation_number(int64_t n);
echo_pinger(gossiper& g) : _gossiper(g) {}
public:
future<> ping(const gms::inet_address&, abort_source&);
};
/**
* This module is responsible for Gossiping information for the local endpoint. This abstraction
* maintains the list of live and dead endpoints. Periodically i.e. every 1 second this module
@@ -112,7 +95,6 @@ public:
* the Failure Detector.
*/
class gossiper : public seastar::async_sharded_service<gossiper>, public seastar::peering_sharded_service<gossiper> {
friend class echo_pinger;
public:
using clk = seastar::lowres_system_clock;
using ignore_features_of_local_node = bool_class<class ignore_features_of_local_node_tag>;
@@ -623,13 +605,53 @@ private:
future<> update_live_endpoints_version();
public:
echo_pinger& get_echo_pinger() { return _echo_pinger; }
// Implementation of `direct_failure_detector::pinger` which uses gossip echo messages for pinging.
// The gossip echo message must be provided this node's gossip generation number.
// It's an integer incremented when the node restarts or when the gossip subsystem restarts.
// We cache the generation number inside `direct_fd_pinger` on every shard and update it in the `gossiper` main loop.
//
// We also store a mapping between `direct_failure_detector::pinger::endpoint_id`s and `inet_address`es.
class direct_fd_pinger : public direct_failure_detector::pinger {
friend class gossiper;
gossiper& _gossiper;
// Only used on shard 0 by `allocate_id`.
direct_failure_detector::pinger::endpoint_id _next_allocated_id{0};
// The mappings are created on shard 0 and lazily replicated to other shards:
// when `ping` or `get_address` is called with an unknown ID on a different shard, it will fetch the ID from shard 0.
std::unordered_map<direct_failure_detector::pinger::endpoint_id, inet_address> _id_to_addr;
// Used to quickly check if given address already has an assigned ID.
// Used only on shard 0, not replicated.
std::unordered_map<inet_address, direct_failure_detector::pinger::endpoint_id> _addr_to_id;
// This node's gossip generation number, updated by gossiper's loop and replicated to every shard.
int64_t _generation_number{0};
future<> update_generation_number(int64_t n);
direct_fd_pinger(gossiper& g) : _gossiper(g) {}
public:
direct_fd_pinger(const direct_fd_pinger&) = delete;
// Allocate a new endpoint_id for `addr`, or if one already exists, return it.
// Call only on shard 0.
direct_failure_detector::pinger::endpoint_id allocate_id(gms::inet_address addr);
// Precondition: `id` was returned from `allocate_id` on shard 0 earlier.
future<gms::inet_address> get_address(direct_failure_detector::pinger::endpoint_id id);
future<bool> ping(direct_failure_detector::pinger::endpoint_id id, abort_source& as) override;
};
direct_fd_pinger& get_direct_fd_pinger() { return _direct_fd_pinger; }
private:
echo_pinger _echo_pinger;
direct_fd_pinger _direct_fd_pinger;
};
struct gossip_get_endpoint_states_request {
// Application states the sender requested
std::unordered_set<gms::application_state> application_states;
@@ -640,3 +662,11 @@ struct gossip_get_endpoint_states_response {
};
} // namespace gms
// XXX: find a better place to put this?
struct direct_fd_clock : public direct_failure_detector::clock {
using base = std::chrono::steady_clock;
direct_failure_detector::clock::timepoint_t now() noexcept override;
future<> sleep_until(direct_failure_detector::clock::timepoint_t tp, abort_source& as) override;
};

View File

@@ -151,10 +151,6 @@ public:
return versioned_value(host_id.to_sstring());
}
static versioned_value raft_server_id(const utils::UUID& id) {
return versioned_value(id.to_sstring());
}
static versioned_value tokens(const std::unordered_set<dht::token>& tokens) {
return versioned_value(make_full_token_string(tokens));
}

View File

@@ -80,11 +80,6 @@ struct not_a_leader {
raft::server_id leader;
};
struct transient_error {
sstring message();
raft::server_id leader;
};
struct commit_status_unknown {
};

View File

@@ -45,7 +45,6 @@ debian_base_packages=(
pigz
libunistring-dev
libzstd-dev
libdeflate-dev
)
fedora_packages=(
@@ -59,7 +58,6 @@ fedora_packages=(
jsoncpp-devel
rapidjson-devel
snappy-devel
libdeflate-devel
systemd-devel
git
python
@@ -172,11 +170,11 @@ arch_packages=(
thrift
)
NODE_EXPORTER_VERSION=1.4.0
NODE_EXPORTER_VERSION=1.3.1
declare -A NODE_EXPORTER_CHECKSUM=(
["x86_64"]=e77ff1b0a824a4e13f82a35d98595fe526849c09e3480d0789a56b72242d2abc
["aarch64"]=0b20aa75385a42857a67ee5f6c7f67b229039a22a49c5c61c33f071356415b59
["s390x"]=a98e2aa5f9e557441190d233ba752c0cae28f3130c6a6742b038f3997d034065
["x86_64"]=68f3802c2dd3980667e4ba65ea2e1fb03f4a4ba026cca375f15a0390ff850949
["aarch64"]=f19f35175f87d41545fa7d4657e834e3a37c1fe69f3bf56bc031a256117764e7
["s390x"]=a12802101a5ee1c74c91bdaa5403c00011ebdf36b83b617c903dbd356a978d03
)
declare -A NODE_EXPORTER_ARCH=(
["x86_64"]=amd64
@@ -316,7 +314,7 @@ elif [ "$ID" = "fedora" ]; then
pip3 install "$PIP_DEFAULT_ARGS" traceback-with-variables
pip3 install "$PIP_DEFAULT_ARGS" scylla-api-client
cargo --config net.git-fetch-with-cli=true install cxxbridge-cmd --root /usr/local
cargo install cxxbridge-cmd --root /usr/local
if [ -f "$(node_exporter_fullpath)" ] && node_exporter_checksum; then
echo "$(node_exporter_filename) already exists, skipping download"
else

View File

@@ -70,7 +70,6 @@ upgrade=false
supervisor=false
supervisor_log_to_stdout=false
without_systemd=false
skip_systemd_check=false
while [ $# -gt 0 ]; do
case "$1" in
@@ -100,7 +99,6 @@ while [ $# -gt 0 ]; do
;;
"--packaging")
packaging=true
skip_systemd_check=true
shift 1
;;
"--upgrade")
@@ -109,7 +107,6 @@ while [ $# -gt 0 ]; do
;;
"--supervisor")
supervisor=true
skip_systemd_check=true
shift 1
;;
"--supervisor-log-to-stdout")
@@ -118,7 +115,6 @@ while [ $# -gt 0 ]; do
;;
"--without-systemd")
without_systemd=true
skip_systemd_check=true
shift 1
;;
"--help")
@@ -250,7 +246,7 @@ supervisor_conf() {
fi
}
if ! $skip_systemd_check && [ ! -d /run/systemd/system/ ]; then
if ! $packaging && [ ! -d /run/systemd/system/ ] && ! $supervisor; then
echo "systemd is not detected, unsupported distribution."
exit 1
fi
@@ -570,7 +566,7 @@ if $nonroot; then
# nonroot install is also 'offline install'
touch $rprefix/SCYLLA-OFFLINE-FILE
touch $rprefix/SCYLLA-NONROOT-FILE
if ! $without_systemd_check && check_usermode_support; then
if ! $supervisor && ! $packaging && ! $without_systemd && check_usermode_support; then
systemctl --user daemon-reload
fi
echo "Scylla non-root install completed."

1
libdeflate Submodule

Submodule libdeflate added at e7e54eab42

View File

@@ -12,8 +12,6 @@
#include <boost/range/algorithm/remove_if.hpp>
#include <seastar/core/coroutine.hh>
#include <seastar/coroutine/maybe_yield.hh>
#include <seastar/coroutine/parallel_for_each.hh>
#include "replica/database.hh"
#include "utils/stall_free.hh"
namespace locator {
@@ -229,7 +227,7 @@ abstract_replication_strategy::get_address_ranges(const token_metadata& tm) cons
future<std::unordered_multimap<inet_address, dht::token_range>>
abstract_replication_strategy::get_address_ranges(const token_metadata& tm, inet_address endpoint) const {
std::unordered_multimap<inet_address, dht::token_range> ret;
if (!tm.is_normal_token_owner(endpoint)) {
if (!tm.is_member(endpoint)) {
co_return ret;
}
bool is_everywhere_topology = get_type() == replication_strategy_type::everywhere_topology;
@@ -470,44 +468,6 @@ void effective_replication_map_factory::submit_background_work(future<> fut) {
});
}
future<> global_effective_replication_map::get_keyspace_erms(sharded<replica::database>& sharded_db, std::string_view keyspace_name) {
return sharded_db.invoke_on(0, [this, &sharded_db, keyspace_name] (replica::database& db) -> future<> {
// To ensure we get the same effective_replication_map
// on all shards, acquire the shared_token_metadata lock.
//
// As a sanity check compare the ring_version on each shard
// to the reference version on shard 0.
//
// This invariant is achieved by storage_service::mutate_token_metadata
// and storage_service::replicate_to_all_cores that first acquire the
// shared_token_metadata lock, then prepare a mutated token metadata
// that will have an incremented ring_version, use it to re-calculate
// all e_r_m:s and clone both on all shards. including the ring version,
// all under the lock.
auto lk = co_await db.get_shared_token_metadata().get_lock();
auto erm = db.find_keyspace(keyspace_name).get_effective_replication_map();
auto ring_version = erm->get_token_metadata().get_ring_version();
_erms[0] = make_foreign(std::move(erm));
co_await coroutine::parallel_for_each(boost::irange(1u, smp::count), [this, &sharded_db, keyspace_name, ring_version] (unsigned shard) -> future<> {
_erms[shard] = co_await sharded_db.invoke_on(shard, [keyspace_name, ring_version] (const replica::database& db) {
const auto& ks = db.find_keyspace(keyspace_name);
auto erm = ks.get_effective_replication_map();
auto local_ring_version = erm->get_token_metadata().get_ring_version();
if (local_ring_version != ring_version) {
on_internal_error(rslogger, format("Inconsistent effective_replication_map ring_verion {}, expected {}", local_ring_version, ring_version));
}
return make_foreign(std::move(erm));
});
});
});
}
future<global_effective_replication_map> make_global_effective_replication_map(sharded<replica::database>& sharded_db, std::string_view keyspace_name) {
global_effective_replication_map ret;
co_await ret.get_keyspace_erms(sharded_db, keyspace_name);
co_return ret;
}
} // namespace locator
std::ostream& operator<<(std::ostream& os, locator::replication_strategy_type t) {

View File

@@ -22,7 +22,6 @@
// forward declaration since replica/database.hh includes this file
namespace replica {
class database;
class keyspace;
}
@@ -102,7 +101,7 @@ public:
virtual inet_address_vector_replica_set get_natural_endpoints(const token& search_token, const effective_replication_map& erm) const;
virtual void validate_options() const = 0;
virtual std::optional<std::unordered_set<sstring>> recognized_options(const topology&) const = 0;
virtual std::optional<std::set<sstring>> recognized_options(const topology&) const = 0;
virtual size_t get_replication_factor(const token_metadata& tm) const = 0;
// Decide if the replication strategy allow removing the node being
// replaced from the natural endpoints when a node is being replaced in the
@@ -266,33 +265,6 @@ inline mutable_effective_replication_map_ptr make_effective_replication_map(abst
// Apply the replication strategy over the current configuration and the given token_metadata.
future<mutable_effective_replication_map_ptr> calculate_effective_replication_map(abstract_replication_strategy::ptr_type rs, token_metadata_ptr tmptr);
// Class to hold a coherent view of a keyspace
// effective replication map on all shards
class global_effective_replication_map {
std::vector<foreign_ptr<effective_replication_map_ptr>> _erms;
public:
global_effective_replication_map() : _erms(smp::count) {}
global_effective_replication_map(global_effective_replication_map&&) = default;
global_effective_replication_map& operator=(global_effective_replication_map&&) = default;
future<> get_keyspace_erms(sharded<replica::database>& sharded_db, std::string_view keyspace_name);
const effective_replication_map& get() const noexcept {
return *_erms[this_shard_id()];
}
const effective_replication_map& operator*() const noexcept {
return get();
}
const effective_replication_map* operator->() const noexcept {
return &get();
}
};
future<global_effective_replication_map> make_global_effective_replication_map(sharded<replica::database>& sharded_db, std::string_view keyspace_name);
} // namespace locator
std::ostream& operator<<(std::ostream& os, locator::replication_strategy_type);

View File

@@ -24,7 +24,7 @@ public:
virtual void validate_options() const override { /* noop */ }
std::optional<std::unordered_set<sstring>> recognized_options(const topology&) const override {
std::optional<std::set<sstring>> recognized_options(const topology&) const override {
// We explicitely allow all options
return std::nullopt;
}

View File

@@ -46,10 +46,6 @@ gossiping_property_file_snitch::gossiping_property_file_snitch(const snitch_conf
if (this_shard_id() == _file_reader_cpu_id) {
io_cpu_id() = _file_reader_cpu_id;
}
if (_listen_address->addr().is_addr_any()) {
logger().warn("Not gossiping INADDR_ANY as internal IP");
_listen_address.reset();
}
}
future<> gossiping_property_file_snitch::start() {
@@ -108,15 +104,12 @@ void gossiping_property_file_snitch::periodic_reader_callback() {
}
std::list<std::pair<gms::application_state, gms::versioned_value>> gossiping_property_file_snitch::get_app_states() const {
std::list<std::pair<gms::application_state, gms::versioned_value>> ret = {
sstring ip = format("{}", _listen_address);
return {
{gms::application_state::DC, gms::versioned_value::datacenter(_my_dc)},
{gms::application_state::RACK, gms::versioned_value::rack(_my_rack)},
{gms::application_state::INTERNAL_IP, gms::versioned_value::internal_ip(std::move(ip))},
};
if (_listen_address.has_value()) {
sstring ip = format("{}", *_listen_address);
ret.emplace_back(gms::application_state::INTERNAL_IP, gms::versioned_value::internal_ip(std::move(ip)));
}
return ret;
}
future<> gossiping_property_file_snitch::read_property_file() {

View File

@@ -93,7 +93,7 @@ private:
unsigned _file_reader_cpu_id;
snitch_signal_t _reconfigured;
promise<> _io_is_stopped;
std::optional<gms::inet_address> _listen_address;
gms::inet_address _listen_address;
void reset_io_state() {
// Reset the promise to allow repeating

View File

@@ -24,7 +24,7 @@ future<endpoint_set> local_strategy::calculate_natural_endpoints(const token& t,
void local_strategy::validate_options() const {
}
std::optional<std::unordered_set<sstring>> local_strategy::recognized_options(const topology&) const {
std::optional<std::set<sstring>> local_strategy::recognized_options(const topology&) const {
// LocalStrategy doesn't expect any options.
return {};
}

Some files were not shown because too many files have changed in this diff Show More