Compare commits
scylla-3.0...next-3.0 (161 commits)

SHA1:
afa2c1b0bf ad70fe8503 3cd9c78056 c5e5ed2775 666266c3cf 19b5d70338 b3cdee7e27 4c42f18d82
ea8f8ab7a3 db6821ce8f 3c91bad0dc bbe41a82be 6fb42269e9 ee2255a189 3218e6cd4c 1d94aac551
2e5110d063 e4bb7ce73c ecc54c1a68 71cfd108c6 d40a7a5e9e a163d245ec 045831b706 148245ab6a
bbe5de1403 ca0df416c0 37ed60374e 7c991a276b 72e039be85 a28ecc4714 584c555698 e772f11ee0
d79b6a7481 85168c500c 5b9e2cd6e6 77f33ca106 93760f13ee e597ae1176 79c7015cce 00a14000cd
1c40a0fcd2 e10735852b 42433a25a8 d04d3fa653 1bcc5a1b5c 450b9ac9bf b3bfd8c08d 53c10b72dc
a690e20966 7172009a0d cb688ef62e ff8265dd66 a198db31dc 094a2a4263 cc0b4d249b e10afc7f50
407dfe0d68 9370996a18 ac105dd2a7 1e62fc8aac c724eee649 ebb14d93c9 d77aaada86 acd05e089f
f591c9c710 dea4489078 3172cc6bac 840d466c4d e30c289835 f769828a68 7d743563bf 23da53c4f3
d4df119735 bdcbf4aa4e e80cd9dfed 87fd298a6e 7dce5484c2 23df964b96 fcab0d1392 a0c4a8501e
b6fa715f7b 9b3ca26d7f 7b8e570e6c a947f2cd84 5ce5f61b08 7b65ec866b 4c16c1fe1b f2d2a9f5b8
cb3b687492 1bb84cdbcf b6307d54be a20000c1a2 b3cbc2e58a e4c1c4f052 bfe3b4cc59 6a4bc5bd71
6c818bcec0 1598d358f0 7252715c69 37e143cba5 bf68fae01b d566466fca e32e682911 3c46bbf244
5567cf4b1b 733c04ad50 05913b6f58 79cf277ea2 03ada48b40 394afae3a8 69d0b1e15c 403f66ecad
841ceac4f9 0fce4b228e 2336c092a0 2b326fc7fa a62edaf7a9 d527ef19f7 8568dc94f4 6e51a95668
071191b967 c6c841c34f 83a8f779bb 6066968e33 d3d877b9db 5ec646cb4e 68b54b2e52 66a48746b8
b2227c7a5e 97357a7321 089e41999a c537b3dd8e 75a737c958 ea0f1c039d 27cf758f12 76fd69244a
0eb2ea8f00 20eaf0b85f 751fdc9f6c 5e3a52024e 3869b5ab51 3cca6f5384 15188b5ea5 96d9ebb67e
f18b370198 8e657e5685 4fde670abf 923318e636 35cc09b150 22f41f04ba fae11c0d6b 82016c07f2
282ccbb072 873e0f0e14 c36c58c64e 0685c8f5bc 92cf2934c6 ed2fb65732 ce2957d106 b31d94e317
da80f27f44
@@ -1,6 +1,6 @@
 #!/bin/sh

-VERSION=3.0.2
+VERSION=3.0.11

 if test -f version
 then

@@ -241,7 +241,11 @@ future<authenticated_user> password_authenticator::authenticate(
    }).then_wrapped([=](future<::shared_ptr<cql3::untyped_result_set>> f) {
        try {
            auto res = f.get0();
-           if (res->empty() || !passwords::check(password, res->one().get_as<sstring>(SALTED_HASH))) {
+           auto salted_hash = std::experimental::optional<sstring>();
+           if (!res->empty()) {
+               salted_hash = res->one().get_opt<sstring>(SALTED_HASH);
+           }
+           if (!salted_hash || !passwords::check(password, *salted_hash)) {
                throw exceptions::authentication_exception("Username and/or password are incorrect");
            }
            return make_ready_future<authenticated_user>(username);
@@ -184,7 +184,9 @@ future<> service::start() {
    return once_among_shards([this] {
        return create_keyspace_if_missing();
    }).then([this] {
-       return when_all_succeed(_role_manager->start(), _authorizer->start(), _authenticator->start());
+       return _role_manager->start().then([this] {
+           return when_all_succeed(_authorizer->start(), _authenticator->start());
+       });
    }).then([this] {
        _permissions_cache = std::make_unique<permissions_cache>(_permissions_cache_config, *this, log);
    }).then([this] {
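The hunk above changes the auth service startup order: the role manager now finishes starting before the authorizer and authenticator are started in parallel, instead of all three starting at once. A minimal standard-C++ sketch of that ordering pattern, with hypothetical stand-in functions rather than the real Seastar services:

```cpp
#include <future>
#include <iostream>

// Hypothetical stand-ins for the three services started in service::start().
void start_role_manager()  { std::cout << "role manager ready\n"; }
void start_authorizer()    { std::cout << "authorizer ready\n"; }
void start_authenticator() { std::cout << "authenticator ready\n"; }

int main() {
    // New order: the role manager is fully started first, because the other
    // two components may consult it; only then do they start concurrently.
    start_role_manager();
    auto a = std::async(std::launch::async, start_authorizer);
    auto b = std::async(std::launch::async, start_authenticator);
    a.get();
    b.get();
}
```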
@@ -61,6 +61,7 @@ class cache_flat_mutation_reader final : public flat_mutation_reader::impl {
        //  - _last_row points at a direct predecessor of the next row which is going to be read.
        //    Used for populating continuity.
        //  - _population_range_starts_before_all_rows is set accordingly
+       //  - _underlying is engaged and fast-forwarded
        reading_from_underlying,

        end_of_stream

@@ -99,7 +100,13 @@ class cache_flat_mutation_reader final : public flat_mutation_reader::impl {
    // forward progress is not guaranteed in case iterators are getting constantly invalidated.
    bool _lower_bound_changed = false;

+   // Points to the underlying reader conforming to _schema,
+   // either to *_underlying_holder or _read_context->underlying().underlying().
+   flat_mutation_reader* _underlying = nullptr;
+   std::optional<flat_mutation_reader> _underlying_holder;
+
    future<> do_fill_buffer(db::timeout_clock::time_point);
+   future<> ensure_underlying(db::timeout_clock::time_point);
    void copy_from_cache_to_buffer();
    future<> process_static_row(db::timeout_clock::time_point);
    void move_to_end();

@@ -186,23 +193,22 @@ future<> cache_flat_mutation_reader::process_static_row(db::timeout_clock::time_
        return make_ready_future<>();
    } else {
        _read_context->cache().on_row_miss();
-       return _read_context->get_next_fragment(timeout).then([this] (mutation_fragment_opt&& sr) {
-           if (sr) {
-               assert(sr->is_static_row());
-               maybe_add_to_cache(sr->as_static_row());
-               push_mutation_fragment(std::move(*sr));
-           }
-           maybe_set_static_row_continuous();
+       return ensure_underlying(timeout).then([this, timeout] {
+           return (*_underlying)(timeout).then([this] (mutation_fragment_opt&& sr) {
+               if (sr) {
+                   assert(sr->is_static_row());
+                   maybe_add_to_cache(sr->as_static_row());
+                   push_mutation_fragment(std::move(*sr));
+               }
+               maybe_set_static_row_continuous();
+           });
        });
    }
}

inline
void cache_flat_mutation_reader::touch_partition() {
-   if (_snp->at_latest_version()) {
-       rows_entry& last_dummy = *_snp->version()->partition().clustered_rows().rbegin();
-       _snp->tracker()->touch(last_dummy);
-   }
+   _snp->touch();
}

inline

@@ -232,14 +238,36 @@ future<> cache_flat_mutation_reader::fill_buffer(db::timeout_clock::time_point t
    });
}

inline
+future<> cache_flat_mutation_reader::ensure_underlying(db::timeout_clock::time_point timeout) {
+   if (_underlying) {
+       return make_ready_future<>();
+   }
+   return _read_context->ensure_underlying(timeout).then([this, timeout] {
+       flat_mutation_reader& ctx_underlying = _read_context->underlying().underlying();
+       if (ctx_underlying.schema() != _schema) {
+           _underlying_holder = make_delegating_reader(ctx_underlying);
+           _underlying_holder->upgrade_schema(_schema);
+           _underlying = &*_underlying_holder;
+       } else {
+           _underlying = &ctx_underlying;
+       }
+   });
+}
+
+inline
future<> cache_flat_mutation_reader::do_fill_buffer(db::timeout_clock::time_point timeout) {
    if (_state == state::move_to_underlying) {
+       if (!_underlying) {
+           return ensure_underlying(timeout).then([this, timeout] {
+               return do_fill_buffer(timeout);
+           });
+       }
        _state = state::reading_from_underlying;
        _population_range_starts_before_all_rows = _lower_bound.is_before_all_clustered_rows(*_schema);
        auto end = _next_row_in_range ? position_in_partition(_next_row.position())
                                      : position_in_partition(_upper_bound);
-       return _read_context->fast_forward_to(position_range{_lower_bound, std::move(end)}, timeout).then([this, timeout] {
+       return _underlying->fast_forward_to(position_range{_lower_bound, std::move(end)}, timeout).then([this, timeout] {
            return read_from_underlying(timeout);
        });
    }
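The new ensure_underlying() and _underlying members implement a create-on-first-use pattern: the underlying reader is constructed only when the cache actually has to read through it, and later calls reuse the cached pointer. A simplified, synchronous sketch of the same idea with hypothetical types (the real code returns futures and may also upgrade the reader's schema):

```cpp
#include <iostream>
#include <optional>

struct reader { int id; };            // hypothetical stand-in for flat_mutation_reader

struct cache_reader {
    std::optional<reader> holder;     // owns the reader once created
    reader* underlying = nullptr;     // points at the reader in use

    reader& ensure_underlying() {
        if (!underlying) {            // expensive construction happens only once
            holder.emplace(reader{42});
            underlying = &*holder;
        }
        return *underlying;
    }
};

int main() {
    cache_reader c;
    std::cout << c.ensure_underlying().id << "\n";
    std::cout << c.ensure_underlying().id << "\n"; // second call reuses the cached reader
}
```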
@@ -280,7 +308,7 @@ future<> cache_flat_mutation_reader::do_fill_buffer(db::timeout_clock::time_poin

inline
future<> cache_flat_mutation_reader::read_from_underlying(db::timeout_clock::time_point timeout) {
-   return consume_mutation_fragments_until(_read_context->underlying().underlying(),
+   return consume_mutation_fragments_until(*_underlying,
        [this] { return _state != state::reading_from_underlying || is_buffer_full(); },
        [this] (mutation_fragment mf) {
            _read_context->cache().on_row_miss();

@@ -273,6 +273,7 @@ scylla_tests = [
    'tests/perf/perf_sstable',
    'tests/cql_query_test',
    'tests/secondary_index_test',
+   'tests/json_cql_query_test',
    'tests/filtering_test',
    'tests/storage_proxy_test',
    'tests/schema_change_test',

@@ -570,6 +571,7 @@ scylla_core = (['database.cc',
    'db/consistency_level.cc',
    'db/system_keyspace.cc',
+   'db/system_distributed_keyspace.cc',
    'db/size_estimates_virtual_reader.cc',
    'db/schema_tables.cc',
    'db/cql_type_parser.cc',
    'db/legacy_schema_migrator.cc',

@@ -470,6 +470,7 @@ insertStatement returns [::shared_ptr<raw::modification_statement> expr]
    std::vector<::shared_ptr<cql3::column_identifier::raw>> column_names;
    std::vector<::shared_ptr<cql3::term::raw>> values;
    bool if_not_exists = false;
+   bool default_unset = false;
    ::shared_ptr<cql3::term::raw> json_value;
    }
    : K_INSERT K_INTO cf=columnFamilyName

@@ -487,13 +488,15 @@ insertStatement returns [::shared_ptr<raw::modification_statement> expr]
      }
    | K_JSON
      json_token=jsonValue { json_value = $json_token.value; }
+     ( K_DEFAULT K_UNSET { default_unset = true; } | K_DEFAULT K_NULL )?
      ( K_IF K_NOT K_EXISTS { if_not_exists = true; } )?
      ( usingClause[attrs] )?
      {
          $expr = ::make_shared<raw::insert_json_statement>(std::move(cf),
                                                            std::move(attrs),
                                                            std::move(json_value),
-                                                           if_not_exists);
+                                                           if_not_exists,
+                                                           default_unset);
      }
    )
    ;

@@ -1835,6 +1838,8 @@ K_OR: O R;
K_REPLACE: R E P L A C E;
K_DETERMINISTIC: D E T E R M I N I S T I C;
K_JSON: J S O N;
+K_DEFAULT: D E F A U L T;
+K_UNSET: U N S E T;

K_EMPTY: E M P T Y;

@@ -130,6 +130,18 @@ query_options::query_options(std::unique_ptr<query_options> qo, ::shared_ptr<ser

}

+query_options::query_options(std::unique_ptr<query_options> qo, ::shared_ptr<service::pager::paging_state> paging_state, int32_t page_size)
+   : query_options(qo->_consistency,
+           qo->get_timeout_config(),
+           std::move(qo->_names),
+           std::move(qo->_values),
+           std::move(qo->_value_views),
+           qo->_skip_metadata,
+           std::move(query_options::specific_options{page_size, paging_state, qo->_options.serial_consistency, qo->_options.timestamp}),
+           qo->_cql_serialization_format) {
+
+}
+
query_options::query_options(std::vector<cql3::raw_value> values)
    : query_options(
        db::consistency_level::ONE, infinite_timeout_config, std::move(values))

@@ -102,7 +102,7 @@ private:

public:
    query_options(query_options&&) = default;
-   query_options(const query_options&) = delete;
+   explicit query_options(const query_options&) = default;

    explicit query_options(db::consistency_level consistency,
                           const timeout_config& timeouts,

@@ -155,6 +155,7 @@ public:
    explicit query_options(db::consistency_level, const timeout_config& timeouts,
            std::vector<cql3::raw_value> values, specific_options options = specific_options::DEFAULT);
    explicit query_options(std::unique_ptr<query_options>, ::shared_ptr<service::pager::paging_state> paging_state);
+   explicit query_options(std::unique_ptr<query_options>, ::shared_ptr<service::pager::paging_state> paging_state, int32_t page_size);

    const timeout_config& get_timeout_config() const { return _timeout_config; }
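The copy constructor of query_options goes from deleted to explicit and defaulted, which allows deliberate copies such as cql3::query_options(options) (used by the aggregation path later in this comparison) while still rejecting accidental implicit copies. A small sketch of how an explicit copy constructor behaves, using a hypothetical options struct:

```cpp
#include <iostream>

struct options {
    int page_size = 100;
    options() = default;
    // Explicit: copies must be spelled out with direct initialization.
    explicit options(const options&) = default;
};

int main() {
    options a;
    options b(a);      // fine: an intentional, explicit copy
    // options c = a;  // error: copy-initialization is not allowed
    // void f(options); f(a);  // error: no implicit copy when passing by value
    std::cout << b.page_size << "\n";
}
```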
@@ -214,11 +214,9 @@ statement_restrictions::statement_restrictions(database& db,
    }
    auto& cf = db.find_column_family(schema);
    auto& sim = cf.get_index_manager();
-   bool has_queriable_clustering_column_index = _clustering_columns_restrictions->has_supporting_index(sim);
-   bool has_queriable_pk_index = _partition_key_restrictions->has_supporting_index(sim);
-   bool has_queriable_index = has_queriable_clustering_column_index
-           || has_queriable_pk_index
-           || _nonprimary_key_restrictions->has_supporting_index(sim);
+   const bool has_queriable_clustering_column_index = _clustering_columns_restrictions->has_supporting_index(sim);
+   const bool has_queriable_pk_index = _partition_key_restrictions->has_supporting_index(sim);
+   const bool has_queriable_regular_index = _nonprimary_key_restrictions->has_supporting_index(sim);

    // At this point, the select statement if fully constructed, but we still have a few things to validate
    process_partition_key_restrictions(has_queriable_pk_index, for_view, allow_filtering);

@@ -279,7 +277,7 @@ statement_restrictions::statement_restrictions(database& db,
    }

    if (!_nonprimary_key_restrictions->empty()) {
-       if (has_queriable_index) {
+       if (has_queriable_regular_index) {
            _uses_secondary_indexing = true;
        } else if (!allow_filtering) {
            throw exceptions::invalid_request_exception("Cannot execute this query as it might involve data filtering and "

@@ -365,8 +363,9 @@ std::vector<const column_definition*> statement_restrictions::get_column_defs_fo
            }
        }
    }
-   if (_clustering_columns_restrictions->needs_filtering(*_schema)) {
-       column_id first_filtering_id = _schema->clustering_key_columns().begin()->id +
+   const bool pk_has_unrestricted_components = _partition_key_restrictions->has_unrestricted_components(*_schema);
+   if (pk_has_unrestricted_components || _clustering_columns_restrictions->needs_filtering(*_schema)) {
+       column_id first_filtering_id = pk_has_unrestricted_components ? 0 : _schema->clustering_key_columns().begin()->id +
                _clustering_columns_restrictions->num_prefix_columns_that_need_not_be_filtered();
        for (auto&& cdef : _clustering_columns_restrictions->get_column_defs()) {
            if (cdef->id >= first_filtering_id && !column_uses_indexing(cdef)) {

@@ -481,10 +480,9 @@ bool statement_restrictions::need_filtering() const {
    int number_of_filtering_restrictions = _nonprimary_key_restrictions->size();
    // If the whole partition key is restricted, it does not imply filtering
    if (_partition_key_restrictions->has_unrestricted_components(*_schema) || !_partition_key_restrictions->is_all_eq()) {
-       number_of_filtering_restrictions += _partition_key_restrictions->size();
-       if (_clustering_columns_restrictions->has_unrestricted_components(*_schema)) {
-           number_of_filtering_restrictions += _clustering_columns_restrictions->size() - _clustering_columns_restrictions->prefix_size();
-       }
+       number_of_filtering_restrictions += _partition_key_restrictions->size() + _clustering_columns_restrictions->size();
+   } else if (_clustering_columns_restrictions->has_unrestricted_components(*_schema)) {
+       number_of_filtering_restrictions += _clustering_columns_restrictions->size() - _clustering_columns_restrictions->prefix_size();
    }

    if (_partition_key_restrictions->is_multi_column() || _clustering_columns_restrictions->is_multi_column()) {

@@ -395,6 +395,14 @@ public:
        return !_nonprimary_key_restrictions->empty();
    }

+   bool pk_restrictions_need_filtering() const {
+       return _partition_key_restrictions->needs_filtering(*_schema);
+   }
+
+   bool ck_restrictions_need_filtering() const {
+       return _partition_key_restrictions->has_unrestricted_components(*_schema) || _clustering_columns_restrictions->needs_filtering(*_schema);
+   }
+
    /**
     * @return true if column is restricted by some restriction, false otherwise
     */
@@ -83,6 +83,9 @@ void metadata::maybe_set_paging_state(::shared_ptr<const service::pager::paging_
    assert(paging_state);
    if (paging_state->get_remaining() > 0) {
        set_paging_state(std::move(paging_state));
+   } else {
+       _flags.remove<flag::HAS_MORE_PAGES>();
+       _paging_state = nullptr;
    }
}

@@ -142,7 +142,7 @@ shared_ptr<selector::factory>
selectable::with_field_selection::new_selector_factory(database& db, schema_ptr s, std::vector<const column_definition*>& defs) {
    auto&& factory = _selected->new_selector_factory(db, s, defs);
    auto&& type = factory->new_instance()->get_type();
-   auto&& ut = dynamic_pointer_cast<const user_type_impl>(std::move(type));
+   auto&& ut = dynamic_pointer_cast<const user_type_impl>(type->underlying_type());
    if (!ut) {
        throw exceptions::invalid_request_exception(
            sprint("Invalid field selection: %s of type %s is not a user type",
@@ -165,7 +165,7 @@ alter_type_statement::add_or_alter::add_or_alter(const ut_name& name, bool is_ad
user_type alter_type_statement::add_or_alter::do_add(database& db, user_type to_update) const
{
    if (get_idx_of_field(to_update, _field_name)) {
-       throw exceptions::invalid_request_exception(sprint("Cannot add new field %s to type %s: a field of the same name already exists", _field_name->name(), _name.to_string()));
+       throw exceptions::invalid_request_exception(sprint("Cannot add new field %s to type %s: a field of the same name already exists", _field_name->to_string(), _name.to_string()));
    }

    std::vector<bytes> new_names(to_update->field_names());

@@ -173,7 +173,7 @@ user_type alter_type_statement::add_or_alter::do_add(database& db, user_type to_
    std::vector<data_type> new_types(to_update->field_types());
    auto&& add_type = _field_type->prepare(db, keyspace())->get_type();
    if (add_type->references_user_type(to_update->_keyspace, to_update->_name)) {
-       throw exceptions::invalid_request_exception(sprint("Cannot add new field %s of type %s to type %s as this would create a circular reference", _field_name->name(), _field_type->to_string(), _name.to_string()));
+       throw exceptions::invalid_request_exception(sprint("Cannot add new field %s of type %s to type %s as this would create a circular reference", _field_name->to_string(), _field_type->to_string(), _name.to_string()));
    }
    new_types.push_back(std::move(add_type));
    return user_type_impl::get_instance(to_update->_keyspace, to_update->_name, std::move(new_names), std::move(new_types));

@@ -183,13 +183,13 @@ user_type alter_type_statement::add_or_alter::do_alter(database& db, user_type t
{
    stdx::optional<uint32_t> idx = get_idx_of_field(to_update, _field_name);
    if (!idx) {
-       throw exceptions::invalid_request_exception(sprint("Unknown field %s in type %s", _field_name->name(), _name.to_string()));
+       throw exceptions::invalid_request_exception(sprint("Unknown field %s in type %s", _field_name->to_string(), _name.to_string()));
    }

    auto previous = to_update->field_types()[*idx];
    auto new_type = _field_type->prepare(db, keyspace())->get_type();
    if (!new_type->is_compatible_with(*previous)) {
-       throw exceptions::invalid_request_exception(sprint("Type %s in incompatible with previous type %s of field %s in user type %s", _field_type->to_string(), previous->as_cql3_type()->to_string(), _field_name->name(), _name.to_string()));
+       throw exceptions::invalid_request_exception(sprint("Type %s in incompatible with previous type %s of field %s in user type %s", _field_type->to_string(), previous->as_cql3_type()->to_string(), _field_name->to_string(), _name.to_string()));
    }

    std::vector<data_type> new_types(to_update->field_types());
@@ -87,6 +87,7 @@ private:
    ::shared_ptr<attributes::raw> _attrs;
    ::shared_ptr<term::raw> _json_value;
    bool _if_not_exists;
+   bool _default_unset;
public:
    /**
     * A parsed <code>INSERT JSON</code> statement.

@@ -95,7 +96,7 @@ public:
     * @param json_value JSON string representing names and values
     * @param attrs additional attributes for statement (CL, timestamp, timeToLive)
     */
-   insert_json_statement(::shared_ptr<cf_name> name, ::shared_ptr<attributes::raw> attrs, ::shared_ptr<term::raw> json_value, bool if_not_exists);
+   insert_json_statement(::shared_ptr<cf_name> name, ::shared_ptr<attributes::raw> attrs, ::shared_ptr<term::raw> json_value, bool if_not_exists, bool default_unset);

    virtual ::shared_ptr<cql3::statements::modification_statement> prepare_internal(database& db, schema_ptr schema,
        ::shared_ptr<variable_specifications> bound_names, std::unique_ptr<attributes> attrs, cql_stats& stats) override;
@@ -522,8 +522,8 @@ indexed_table_select_statement::prepare_command_for_base_query(const query_optio
    return cmd;
}

-future<shared_ptr<cql_transport::messages::result_message>>
-indexed_table_select_statement::execute_base_query(
+future<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>
+indexed_table_select_statement::do_execute_base_query(
        service::storage_proxy& proxy,
        dht::partition_range_vector&& partition_ranges,
        service::query_state& state,

@@ -582,22 +582,27 @@ indexed_table_select_statement::execute_base_query(
        }).then([&merger]() {
            return merger.get();
        });
-   }).then([this, &proxy, &state, &options, now, cmd, paging_state = std::move(paging_state)] (foreign_ptr<lw_shared_ptr<query::result>> result) mutable {
-       return this->process_base_query_results(std::move(result), cmd, proxy, state, options, now, std::move(paging_state));
+   }).then([cmd] (foreign_ptr<lw_shared_ptr<query::result>> result) mutable {
+       return make_ready_future<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>(std::move(result), std::move(cmd));
    });
}

// Function for fetching the selected columns from a list of clustering rows.
// It is currently used only in our Secondary Index implementation - ordinary
// CQL SELECT statements do not have the syntax to request a list of rows.
// FIXME: The current implementation is very inefficient - it requests each
// row separately (and, incrementally, in parallel). Even multiple rows from a single
// partition are requested separately. This last case can be easily improved,
// but to implement the general case (multiple rows from multiple partitions)
// efficiently, we will need more support from other layers.
// Keys are ordered in token order (see #3423)
future<shared_ptr<cql_transport::messages::result_message>>
indexed_table_select_statement::execute_base_query(
        service::storage_proxy& proxy,
        dht::partition_range_vector&& partition_ranges,
        service::query_state& state,
        const query_options& options,
        gc_clock::time_point now,
        ::shared_ptr<const service::pager::paging_state> paging_state) {
    return do_execute_base_query(proxy, std::move(partition_ranges), state, options, now, paging_state).then(
            [this, &proxy, &state, &options, now, paging_state = std::move(paging_state)] (foreign_ptr<lw_shared_ptr<query::result>> result, lw_shared_ptr<query::read_command> cmd) {
        return process_base_query_results(std::move(result), std::move(cmd), proxy, state, options, now, std::move(paging_state));
    });
}

future<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>
indexed_table_select_statement::do_execute_base_query(
        service::storage_proxy& proxy,
        std::vector<primary_key>&& primary_keys,
        service::query_state& state,

@@ -652,9 +657,23 @@ indexed_table_select_statement::execute_base_query(
            });
        }).then([&merger] () {
            return merger.get();
-       }).then([this, &proxy, &state, &options, now, cmd, paging_state = std::move(paging_state)] (foreign_ptr<lw_shared_ptr<query::result>> result) mutable {
-           return this->process_base_query_results(std::move(result), cmd, proxy, state, options, now, std::move(paging_state));
+       }).then([cmd] (foreign_ptr<lw_shared_ptr<query::result>> result) mutable {
+           return make_ready_future<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>(std::move(result), std::move(cmd));
+       });
    });
}

+future<shared_ptr<cql_transport::messages::result_message>>
+indexed_table_select_statement::execute_base_query(
+       service::storage_proxy& proxy,
+       std::vector<primary_key>&& primary_keys,
+       service::query_state& state,
+       const query_options& options,
+       gc_clock::time_point now,
+       ::shared_ptr<const service::pager::paging_state> paging_state) {
+   return do_execute_base_query(proxy, std::move(primary_keys), state, options, now, paging_state).then(
+           [this, &proxy, &state, &options, now, paging_state = std::move(paging_state)] (foreign_ptr<lw_shared_ptr<query::result>> result, lw_shared_ptr<query::read_command> cmd) {
+       return process_base_query_results(std::move(result), std::move(cmd), proxy, state, options, now, std::move(paging_state));
+   });
+}
@@ -929,6 +948,60 @@ indexed_table_select_statement::do_execute(service::storage_proxy& proxy,
        }
    }

+   // Aggregated and paged filtering needs to aggregate the results from all pages
+   // in order to avoid returning partial per-page results (issue #4540).
+   // It's a little bit more complicated than regular aggregation, because each paging state
+   // needs to be translated between the base table and the underlying view.
+   // The routine below keeps fetching pages from the underlying view, which are then
+   // used to fetch base rows, which go straight to the result set builder.
+   // A local, internal copy of query_options is kept in order to keep updating
+   // the paging state between requesting data from replicas.
+   const bool aggregate = _selection->is_aggregate();
+   if (aggregate) {
+       const bool restrictions_need_filtering = _restrictions->need_filtering();
+       return do_with(cql3::selection::result_set_builder(*_selection, now, options.get_cql_serialization_format()), std::make_unique<cql3::query_options>(cql3::query_options(options)),
+               [this, &options, &proxy, &state, now, whole_partitions, partition_slices, restrictions_need_filtering] (cql3::selection::result_set_builder& builder, std::unique_ptr<cql3::query_options>& internal_options) {
+           // page size is set to the internal count page size, regardless of the user-provided value
+           internal_options.reset(new cql3::query_options(std::move(internal_options), options.get_paging_state(), DEFAULT_COUNT_PAGE_SIZE));
+           return repeat([this, &builder, &options, &internal_options, &proxy, &state, now, whole_partitions, partition_slices, restrictions_need_filtering] () {
+               auto consume_results = [this, &builder, &options, &internal_options, restrictions_need_filtering] (foreign_ptr<lw_shared_ptr<query::result>> results, lw_shared_ptr<query::read_command> cmd) {
+                   if (restrictions_need_filtering) {
+                       query::result_view::consume(*results, cmd->slice, cql3::selection::result_set_builder::visitor(builder, *_schema, *_selection,
+                               cql3::selection::result_set_builder::restrictions_filter(_restrictions, options, cmd->row_limit)));
+                   } else {
+                       query::result_view::consume(*results, cmd->slice, cql3::selection::result_set_builder::visitor(builder, *_schema, *_selection));
+                   }
+               };
+
+               if (whole_partitions || partition_slices) {
+                   return find_index_partition_ranges(proxy, state, *internal_options).then(
+                           [this, now, &state, &internal_options, &proxy, consume_results = std::move(consume_results)] (dht::partition_range_vector partition_ranges, ::shared_ptr<const service::pager::paging_state> paging_state) {
+                       bool has_more_pages = paging_state && paging_state->get_remaining() > 0;
+                       internal_options.reset(new cql3::query_options(std::move(internal_options), paging_state ? ::make_shared<service::pager::paging_state>(*paging_state) : nullptr));
+                       return do_execute_base_query(proxy, std::move(partition_ranges), state, *internal_options, now, std::move(paging_state)).then(consume_results).then([has_more_pages] {
+                           return stop_iteration(!has_more_pages);
+                       });
+                   });
+               } else {
+                   return find_index_clustering_rows(proxy, state, *internal_options).then(
+                           [this, now, &state, &internal_options, &proxy, consume_results = std::move(consume_results)] (std::vector<primary_key> primary_keys, ::shared_ptr<const service::pager::paging_state> paging_state) {
+                       bool has_more_pages = paging_state && paging_state->get_remaining() > 0;
+                       internal_options.reset(new cql3::query_options(std::move(internal_options), paging_state ? ::make_shared<service::pager::paging_state>(*paging_state) : nullptr));
+                       return this->do_execute_base_query(proxy, std::move(primary_keys), state, *internal_options, now, std::move(paging_state)).then(consume_results).then([has_more_pages] {
+                           return stop_iteration(!has_more_pages);
+                       });
+                   });
+               }
+           }).then([this, &builder, restrictions_need_filtering] () {
+               auto rs = builder.build();
+               update_stats_rows_read(rs->size());
+               _stats.filtered_rows_matched_total += restrictions_need_filtering ? rs->size() : 0;
+               auto msg = ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
+               return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(std::move(msg));
+           });
+       });
+   }
+
    if (whole_partitions || partition_slices) {
        // In this case, can use our normal query machinery, which retrieves
        // entire partitions or the same slice for many partitions.
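The comment block in the hunk above explains why aggregates over a filtered, paged index query must be computed server-side across every page before anything is returned. A self-contained sketch of that drain-all-pages loop, with a hypothetical fetch() standing in for the internal view and base-table reads:

```cpp
#include <cstdint>
#include <iostream>
#include <optional>
#include <vector>

struct page { std::vector<int> rows; std::optional<int> next; }; // hypothetical result page + paging token

page fetch(std::optional<int> token) {              // stand-in for one internal paged read
    static const std::vector<std::vector<int>> pages = {{1, 2, 3}, {4, 5}, {6}};
    int idx = token.value_or(0);
    bool more = idx + 1 < static_cast<int>(pages.size());
    return {pages[idx], more ? std::optional<int>(idx + 1) : std::nullopt};
}

int main() {
    // For an aggregate such as COUNT(*), the server keeps fetching pages and
    // folding them into one accumulator; returning per-page partial counts
    // would be wrong.
    std::int64_t count = 0;
    std::optional<int> token;
    do {
        page p = fetch(token);
        count += static_cast<std::int64_t>(p.rows.size());
        token = p.next;
    } while (token);
    std::cout << "COUNT(*) = " << count << "\n"; // 6
}
```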
@@ -67,8 +67,8 @@ class select_statement : public cql_statement {
public:
    using parameters = raw::select_statement::parameters;
    using ordering_comparator_type = raw::select_statement::ordering_comparator_type;
protected:
+   static constexpr int DEFAULT_COUNT_PAGE_SIZE = 10000;
protected:
    static thread_local const ::shared_ptr<parameters> _default_parameters;
    schema_ptr _schema;
    uint32_t _bound_terms;

@@ -213,6 +213,14 @@ private:
    lw_shared_ptr<query::read_command>
    prepare_command_for_base_query(const query_options& options, service::query_state& state, gc_clock::time_point now, bool use_paging);

+   future<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>
+   do_execute_base_query(
+           service::storage_proxy& proxy,
+           dht::partition_range_vector&& partition_ranges,
+           service::query_state& state,
+           const query_options& options,
+           gc_clock::time_point now,
+           ::shared_ptr<const service::pager::paging_state> paging_state);
    future<shared_ptr<cql_transport::messages::result_message>>
    execute_base_query(
            service::storage_proxy& proxy,

@@ -222,6 +230,23 @@ private:
            gc_clock::time_point now,
            ::shared_ptr<const service::pager::paging_state> paging_state);

+   // Function for fetching the selected columns from a list of clustering rows.
+   // It is currently used only in our Secondary Index implementation - ordinary
+   // CQL SELECT statements do not have the syntax to request a list of rows.
+   // FIXME: The current implementation is very inefficient - it requests each
+   // row separately (and, incrementally, in parallel). Even multiple rows from a single
+   // partition are requested separately. This last case can be easily improved,
+   // but to implement the general case (multiple rows from multiple partitions)
+   // efficiently, we will need more support from other layers.
+   // Keys are ordered in token order (see #3423)
+   future<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>
+   do_execute_base_query(
+           service::storage_proxy& proxy,
+           std::vector<primary_key>&& primary_keys,
+           service::query_state& state,
+           const query_options& options,
+           gc_clock::time_point now,
+           ::shared_ptr<const service::pager::paging_state> paging_state);
    future<shared_ptr<cql_transport::messages::result_message>>
    execute_base_query(
            service::storage_proxy& proxy,
@@ -84,8 +84,11 @@ parse(const sstring& json_string, const std::vector<column_definition>& expected
    for (const auto& def : expected_receivers) {
        sstring cql_name = def.name_as_text();
        auto value_it = prepared_map.find(cql_name);
-       if (value_it == prepared_map.end() || value_it->second.isNull()) {
+       if (value_it == prepared_map.end()) {
            continue;
+       } else if (value_it->second.isNull()) {
+           json_map.emplace(std::move(cql_name), bytes_opt{});
+           prepared_map.erase(value_it);
        } else {
            json_map.emplace(std::move(cql_name), def.type->from_json_object(value_it->second, sf));
            prepared_map.erase(value_it);
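The hunk above starts distinguishing a column that is absent from the JSON document from one that is present with an explicit null: absent columns are skipped, explicit nulls are recorded as an empty bytes_opt. A small sketch of that three-way branching over a hypothetical prepared map:

```cpp
#include <iostream>
#include <map>
#include <optional>
#include <string>

int main() {
    // Hypothetical prepared JSON values: a column may be absent, set, or explicitly null.
    std::map<std::string, std::optional<std::string>> prepared = {
        {"name", "alice"}, {"age", std::nullopt}};

    for (const std::string& column : {"name", "age", "email"}) {
        auto it = prepared.find(column);
        if (it == prepared.end()) {
            std::cout << column << ": absent, leave the column untouched\n";
        } else if (!it->second) {
            std::cout << column << ": explicit null, write a null/tombstone\n";
        } else {
            std::cout << column << ": set to " << *it->second << "\n";
        }
    }
}
```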
@@ -255,8 +258,12 @@ void insert_prepared_json_statement::execute_operations_for_key(mutation& m, con
            throw exceptions::invalid_request_exception(sprint("Cannot set the value of counter column %s in JSON", def.name_as_text()));
        }

-       auto value = json_cache->at(def.name_as_text());
-       execute_set_value(m, prefix, params, def, value);
+       auto it = json_cache->find(def.name_as_text());
+       if (it != json_cache->end()) {
+           execute_set_value(m, prefix, params, def, it->second);
+       } else if (!_default_unset) {
+           execute_set_value(m, prefix, params, def, bytes_opt{});
+       }
    }
}

@@ -322,12 +329,14 @@ insert_statement::prepare_internal(database& db, schema_ptr schema,
insert_json_statement::insert_json_statement( ::shared_ptr<cf_name> name,
                                              ::shared_ptr<attributes::raw> attrs,
                                              ::shared_ptr<term::raw> json_value,
-                                             bool if_not_exists)
+                                             bool if_not_exists,
+                                             bool default_unset)
    : raw::modification_statement{name, attrs, conditions_vector{}, if_not_exists, false}
    , _name(name)
    , _attrs(attrs)
    , _json_value(json_value)
-   , _if_not_exists(if_not_exists) { }
+   , _if_not_exists(if_not_exists)
+   , _default_unset(default_unset) { }

::shared_ptr<cql3::statements::modification_statement>
insert_json_statement::prepare_internal(database& db, schema_ptr schema,

@@ -337,7 +346,7 @@ insert_json_statement::prepare_internal(database& db, schema_ptr schema,
    auto json_column_placeholder = ::make_shared<column_identifier>("", true);
    auto prepared_json_value = _json_value->prepare(db, "", ::make_shared<column_specification>("", "", json_column_placeholder, utf8_type));
    prepared_json_value->collect_marker_specification(bound_names);
-   return ::make_shared<cql3::statements::insert_prepared_json_statement>(bound_names->size(), schema, std::move(attrs), &stats.inserts, std::move(prepared_json_value));
+   return ::make_shared<cql3::statements::insert_prepared_json_statement>(bound_names->size(), schema, std::move(attrs), &stats.inserts, std::move(prepared_json_value), _default_unset);
}

update_statement::update_statement( ::shared_ptr<cf_name> name,
@@ -82,9 +82,10 @@ private:
 */
class insert_prepared_json_statement : public update_statement {
    ::shared_ptr<term> _term;
+   bool _default_unset;
public:
-   insert_prepared_json_statement(uint32_t bound_terms, schema_ptr s, std::unique_ptr<attributes> attrs, uint64_t* cql_stats_counter_ptr, ::shared_ptr<term> t)
-       : update_statement(statement_type::INSERT, bound_terms, s, std::move(attrs), cql_stats_counter_ptr), _term(t) {
+   insert_prepared_json_statement(uint32_t bound_terms, schema_ptr s, std::unique_ptr<attributes> attrs, uint64_t* cql_stats_counter_ptr, ::shared_ptr<term> t, bool default_unset)
+       : update_statement(statement_type::INSERT, bound_terms, s, std::move(attrs), cql_stats_counter_ptr), _term(t), _default_unset(default_unset) {
        _restrictions = ::make_shared<restrictions::statement_restrictions>(s, false);
    }
private:

@@ -54,7 +54,7 @@ public:
        column->ks_name,
        column->cf_name,
        ::make_shared<column_identifier>(sprint("%s[%d]", column->name, component), true),
-       static_pointer_cast<const tuple_type_impl>(column->type)->type(component));
+       static_pointer_cast<const tuple_type_impl>(column->type->underlying_type())->type(component));
    }

    /**

@@ -112,7 +112,7 @@ public:

private:
    void validate_assignable_to(database& db, const sstring& keyspace, shared_ptr<column_specification> receiver) {
-       auto tt = dynamic_pointer_cast<const tuple_type_impl>(receiver->type);
+       auto tt = dynamic_pointer_cast<const tuple_type_impl>(receiver->type->underlying_type());
        if (!tt) {
            throw exceptions::invalid_request_exception(sprint("Invalid tuple type literal for %s of type %s", receiver->name, receiver->type->as_cql3_type()));
        }
database.cc (19 changed lines)
@@ -1513,7 +1513,8 @@ future<> table::cleanup_sstables(sstables::compaction_descriptor descriptor) {
        return with_semaphore(sem, 1, [this, &sst] {
            // release reference to sstables cleaned up, otherwise space usage from their data and index
            // components cannot be reclaimed until all of them are cleaned.
-           return this->compact_sstables(sstables::compaction_descriptor({ std::move(sst) }, sst->get_sstable_level()), true);
+           auto sstable_level = sst->get_sstable_level();
+           return this->compact_sstables(sstables::compaction_descriptor({ std::move(sst) }, sstable_level), true);
        });
    });
});
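The cleanup_sstables change reads the sstable level into a local before moving sst into the compaction descriptor, because the order in which function arguments are evaluated is unspecified and sst could already be moved-from when its level is queried. A generic illustration of the hazard with hypothetical types:

```cpp
#include <iostream>
#include <memory>

struct sstable { int level() const { return 3; } };

void compact(std::shared_ptr<sstable> s, int level) {
    std::cout << "compacting at level " << level << "\n";
}

int main() {
    auto sst = std::make_shared<sstable>();
    // Risky: argument evaluation order is unspecified, so sst may already be
    // moved-from (null) by the time sst->level() runs.
    // compact(std::move(sst), sst->level());
    // Safe: read the level first, then move.
    int level = sst->level();
    compact(std::move(sst), level);
}
```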
@@ -2232,6 +2233,10 @@ void backlog_controller::adjust() {

float backlog_controller::backlog_of_shares(float shares) const {
    size_t idx = 1;
+   // No control points means the controller is disabled.
+   if (_control_points.size() == 0) {
+       return 1.0f;
+   }
    while ((idx < _control_points.size() - 1) && (_control_points[idx].output < shares)) {
        idx++;
    }

@@ -4356,6 +4361,8 @@ future<int64_t>
table::disable_sstable_write() {
    _sstable_writes_disabled_at = std::chrono::steady_clock::now();
    return _sstables_lock.write_lock().then([this] {
+       // _sstable_deletion_sem must be acquired after _sstables_lock.write_lock
+       return _sstable_deletion_sem.wait().then([this] {
            if (_sstables->all()->empty()) {
                return make_ready_future<int64_t>(0);
            }

@@ -4364,9 +4371,19 @@ table::disable_sstable_write() {
                max = std::max(max, s->generation());
            }
            return make_ready_future<int64_t>(max);
+       });
    });
}

+std::chrono::steady_clock::duration table::enable_sstable_write(int64_t new_generation) {
+   if (new_generation != -1) {
+       update_sstables_known_generation(new_generation);
+   }
+   _sstable_deletion_sem.signal();
+   _sstables_lock.write_unlock();
+   return std::chrono::steady_clock::now() - _sstable_writes_disabled_at;
+}
+
std::ostream& operator<<(std::ostream& os, const user_types_metadata& m) {
    os << "org.apache.cassandra.config.UTMetaData@" << &m;
    return os;
database.hh (11 changed lines)
@@ -447,6 +447,7 @@ private:
    // This semaphore ensures that an operation like snapshot won't have its selected
    // sstables deleted by compaction in parallel, a race condition which could
    // easily result in failure.
+   // Locking order: must be acquired either independently or after _sstables_lock
    seastar::semaphore _sstable_deletion_sem = {1};
    // There are situations in which we need to stop writing sstables. Flushers will take
    // the read lock, and the ones that wish to stop that process will take the write lock.

@@ -737,13 +738,7 @@ public:

    // SSTable writes are now allowed again, and generation is updated to new_generation if != -1
    // returns the amount of microseconds elapsed since we disabled writes.
-   std::chrono::steady_clock::duration enable_sstable_write(int64_t new_generation) {
-       if (new_generation != -1) {
-           update_sstables_known_generation(new_generation);
-       }
-       _sstables_lock.write_unlock();
-       return std::chrono::steady_clock::now() - _sstable_writes_disabled_at;
-   }
+   std::chrono::steady_clock::duration enable_sstable_write(int64_t new_generation);

    // Make sure the generation numbers are sequential, starting from "start".
    // Generations before "start" are left untouched.

@@ -897,7 +892,7 @@ public:
    }

private:
-   future<row_locker::lock_holder> do_push_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout, mutation_source&& source) const;
+   future<row_locker::lock_holder> do_push_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout, mutation_source&& source, const io_priority_class& io_priority) const;
    std::vector<view_ptr> affected_views(const schema_ptr& base, const mutation& update) const;
    future<> generate_and_propagate_view_updates(const schema_ptr& base,
        std::vector<view_ptr>&& views,
@@ -395,10 +395,8 @@ std::unordered_set<gms::inet_address> db::batchlog_manager::endpoint_filter(cons

    // grab a random member of up to two racks
    for (auto& rack : racks) {
-       auto rack_members = validated.bucket(rack);
-       auto n = validated.bucket_size(rack_members);
+       auto cpy = boost::copy_range<std::vector<gms::inet_address>>(validated.equal_range(rack) | boost::adaptors::map_values);
-       std::uniform_int_distribution<size_t> rdist(0, n - 1);
+       std::uniform_int_distribution<size_t> rdist(0, cpy.size() - 1);
        result.emplace(cpy[rdist(_e1)]);
    }
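The batchlog change picks a random rack member by copying the matching endpoints into a vector and indexing it with a uniform distribution, instead of juggling bucket iterators. A stand-alone sketch of that pattern with hypothetical rack data:

```cpp
#include <iostream>
#include <random>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
    std::unordered_multimap<std::string, std::string> validated = {
        {"rack1", "10.0.0.1"}, {"rack1", "10.0.0.2"}, {"rack2", "10.0.0.3"}};

    std::mt19937 e1{std::random_device{}()};
    auto range = validated.equal_range("rack1");
    std::vector<std::string> cpy;                 // copy the rack's members
    for (auto it = range.first; it != range.second; ++it) {
        cpy.push_back(it->second);
    }
    std::uniform_int_distribution<size_t> rdist(0, cpy.size() - 1);
    std::cout << "picked " << cpy[rdist(e1)] << "\n";
}
```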
@@ -689,6 +689,8 @@ public:
    // but all previous write/flush pairs.
    return _pending_ops.run_with_ordered_post_op(rp, [this, size, off, buf = std::move(buf)]() mutable {
+       auto view = fragmented_temporary_buffer::view(buf);
+       view.remove_suffix(buf.size_bytes() - size);
        assert(size == view.size_bytes());
        return do_with(off, view, [&] (uint64_t& off, fragmented_temporary_buffer::view& view) {
            if (view.empty()) {
                return make_ready_future<>();
@@ -1187,6 +1189,34 @@ void db::commitlog::segment_manager::flush_segments(bool force) {
    }
}

+/// \brief Helper for ensuring a file is closed if an exception is thrown.
+///
+/// The file provided by the file_fut future is passed to func.
+/// * If func throws an exception E, the file is closed and we return
+///   a failed future with E.
+/// * If func returns a value V, the file is not closed and we return
+///   a future with V.
+/// Note that when an exception is not thrown, it is the
+/// responsibility of func to make sure the file will be closed. It
+/// can close the file itself, return it, or store it somewhere.
+///
+/// \tparam Func The type of function this wraps
+/// \param file_fut A future that produces a file
+/// \param func A function that uses a file
+/// \return A future that passes the file produced by file_fut to func
+///         and closes it if func fails
+template <typename Func>
+static auto close_on_failure(future<file> file_fut, Func func) {
+   return file_fut.then([func = std::move(func)](file f) {
+       return futurize_apply(func, f).handle_exception([f] (std::exception_ptr e) mutable {
+           return f.close().then_wrapped([f, e = std::move(e)] (future<> x) {
+               using futurator = futurize<std::result_of_t<Func(file)>>;
+               return futurator::make_exception_future(e);
+           });
+       });
+   });
+}
+
future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager::allocate_segment(bool active) {
    static const auto flags = open_flags::wo | open_flags::create;
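close_on_failure() above keeps the file open on success (the callback owns it) but closes it and propagates the error if the callback throws. A plain, synchronous C++ analogue of the same contract, assuming nothing about the Seastar future machinery:

```cpp
#include <cstdio>
#include <iostream>

// If func throws, close the resource and rethrow; on success the callback
// keeps ownership and is responsible for closing it.
template <typename Func>
auto close_on_failure(std::FILE* f, Func func) {
    try {
        return func(f);
    } catch (...) {
        std::fclose(f);
        throw;
    }
}

int main() {
    std::FILE* f = std::fopen("/tmp/example.txt", "w");
    if (!f) {
        return 1;
    }
    close_on_failure(f, [](std::FILE* fh) {
        std::fputs("hello\n", fh);
        std::fclose(fh);   // success path: the callback closes the file itself
    });
}
```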
@@ -1217,7 +1247,7 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
        return fut;
    });

-   return fut.then([this, d, active, filename](file f) {
+   return close_on_failure(std::move(fut), [this, d, active, filename] (file f) {
        f = make_checked_file(commit_error_handler, f);
        // xfs doesn't like files extended betond eof, so enlarge the file
        return f.truncate(max_size).then([this, d, active, f, filename] () mutable {

@@ -1755,7 +1785,7 @@ db::commitlog::read_log_file(const sstring& filename, seastar::io_priority_class
    }

    if (magic != segment::segment_magic) {
-       throw std::invalid_argument("Not a scylla format commitlog file");
+       throw invalid_segment_format();
    }
    crc32_nbo crc;
    crc.process(ver);

@@ -1764,7 +1794,7 @@ db::commitlog::read_log_file(const sstring& filename, seastar::io_priority_class

    auto cs = crc.checksum();
    if (cs != checksum) {
-       throw std::runtime_error("Checksum error in file header");
+       throw header_checksum_error();
    }

    this->id = id;
@@ -342,18 +342,40 @@ public:

    typedef std::function<future<>(temporary_buffer<char>, replay_position)> commit_load_reader_func;

-   class segment_data_corruption_error: public std::runtime_error {
+   class segment_error : public std::exception {};
+
+   class segment_data_corruption_error: public segment_error {
+       std::string _msg;
    public:
        segment_data_corruption_error(std::string msg, uint64_t s)
-           : std::runtime_error(msg), _bytes(s) {
+           : _msg(std::move(msg)), _bytes(s) {
        }
        uint64_t bytes() const {
            return _bytes;
        }
+       virtual const char* what() const noexcept {
+           return _msg.c_str();
+       }
    private:
        uint64_t _bytes;
    };

+   class invalid_segment_format : public segment_error {
+       static constexpr const char* _msg = "Not a scylla format commitlog file";
+   public:
+       virtual const char* what() const noexcept {
+           return _msg;
+       }
+   };
+
+   class header_checksum_error : public segment_error {
+       static constexpr const char* _msg = "Checksum error in file header";
+   public:
+       virtual const char* what() const noexcept {
+           return _msg;
+       }
+   };
+
    static future<std::unique_ptr<subscription<temporary_buffer<char>, replay_position>>> read_log_file(
            const sstring&, seastar::io_priority_class read_io_prio_class, commit_load_reader_func, position_type = 0, const db::extensions* = nullptr);
private:
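Introducing the segment_error base class lets callers handle every malformed-segment condition with a single catch clause (the hints sender later in this comparison does exactly that) while other failures keep their normal handling. A reduced sketch of the idea with simplified, hypothetical types:

```cpp
#include <exception>
#include <iostream>

struct segment_error : std::exception {};
struct invalid_segment_format : segment_error {
    const char* what() const noexcept override { return "not a commitlog segment"; }
};

int main() {
    try {
        throw invalid_segment_format{};
    } catch (const segment_error& e) {
        // One handler for every commitlog-segment problem.
        std::cout << "dropping corrupted segment: " << e.what() << "\n";
    }
}
```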
@@ -164,7 +164,7 @@ future<> db::commitlog_replayer::impl::init() {
    // Get all truncation records for the CF and initialize max rps if
    // present. Cannot do this on demand, as there may be no sstables to
    // mark the CF as "needed".
-   return db::system_keyspace::get_truncated_position(uuid).then([&map, &uuid](std::vector<db::replay_position> tpps) {
+   return db::system_keyspace::get_truncated_position(uuid).then([&map, uuid](std::vector<db::replay_position> tpps) {
        for (auto& p : tpps) {
            rlogger.trace("CF {} truncated at {}", uuid, p);
            auto& pp = map[p.shard_id()][uuid];
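The replayer hunk changes the lambda capture from &uuid to uuid: the continuation may run after the enclosing scope has gone away, so capturing the local by reference would leave a dangling reference. A generic illustration with hypothetical names:

```cpp
#include <functional>
#include <iostream>
#include <string>

std::function<void()> make_callback() {
    std::string uuid = "b4c9...";                       // hypothetical local value
    // return [&uuid] { std::cout << uuid << "\n"; };   // dangles once make_callback returns
    return [uuid] { std::cout << uuid << "\n"; };       // safe: the lambda owns a copy
}

int main() {
    make_callback()();
}
```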
@@ -102,6 +102,8 @@ db::config::config()
db::config::~config()
{}

+const sstring db::config::default_tls_priority("SECURE128:-VERS-TLS1.0");
+
namespace utils {

template<>

@@ -743,6 +743,7 @@ public:
    val(cpu_scheduler, bool, true, Used, "Enable cpu scheduling") \
    val(view_building, bool, true, Used, "Enable view building; should only be set to false when the node is experience issues due to view building") \
    val(enable_sstables_mc_format, bool, false, Used, "Enable SSTables 'mc' format to be used as the default file format") \
+   val(abort_on_internal_error, bool, false, Used, "Abort the server instead of throwing exception when internal invariants are violated.") \
    /* done! */

#define _make_value_member(name, type, deflt, status, desc, ...) \

@@ -756,6 +757,8 @@ public:
    add_options(boost::program_options::options_description_easy_init&);

    const db::extensions& extensions() const;
+
+   static const sstring default_tls_priority;
private:
    template<typename T>
    struct log_legacy_value : public named_value<T, value_status::Used> {
@@ -82,6 +82,9 @@ void manager::register_metrics(const sstring& group_name) {

        sm::make_derive("discarded", _stats.discarded,
                        sm::description("Number of hints that were discarded during sending (too old, schema changed, etc.).")),
+
+       sm::make_derive("corrupted_files", _stats.corrupted_files,
+                       sm::description("Number of hints files that were discarded during sending because the file was corrupted.")),
    });
}

@@ -114,8 +117,8 @@ future<> manager::stop() {

    return _draining_eps_gate.close().finally([this] {
        return parallel_for_each(_ep_managers, [] (auto& pair) {
-           return pair.second.stop();
-       }).finally([this] {
+           return pair.second.stop();
+       }).finally([this] {
            _ep_managers.clear();
            manager_logger.info("Stopped");
        }).discard_result();

@@ -236,6 +239,8 @@ future<> manager::end_point_hints_manager::stop(drain should_drain) noexcept {
manager::end_point_hints_manager::end_point_hints_manager(const key_type& key, manager& shard_manager)
    : _key(key)
    , _shard_manager(shard_manager)
+   , _file_update_mutex_ptr(make_lw_shared<seastar::shared_mutex>())
+   , _file_update_mutex(*_file_update_mutex_ptr)
    , _state(state_set::of<state::stopped>())
    , _hints_dir(_shard_manager.hints_dir() / format("{}", _key).c_str())
    , _sender(*this, _shard_manager.local_storage_proxy(), _shard_manager.local_db(), _shard_manager.local_gossiper())

@@ -244,6 +249,8 @@ manager::end_point_hints_manager::end_point_hints_manager(const key_type& key, m
manager::end_point_hints_manager::end_point_hints_manager(end_point_hints_manager&& other)
    : _key(other._key)
    , _shard_manager(other._shard_manager)
+   , _file_update_mutex_ptr(std::move(other._file_update_mutex_ptr))
+   , _file_update_mutex(*_file_update_mutex_ptr)
    , _state(other._state)
    , _hints_dir(std::move(other._hints_dir))
    , _sender(other._sender, *this)
@@ -513,28 +520,35 @@ void manager::drain_for(gms::inet_address endpoint) {
    manager_logger.trace("on_leave_cluster: {} is removed/decommissioned", endpoint);

    with_gate(_draining_eps_gate, [this, endpoint] {
-       return futurize_apply([this, endpoint] () {
-           if (utils::fb_utilities::is_me(endpoint)) {
-               return parallel_for_each(_ep_managers, [] (auto& pair) {
-                   return pair.second.stop(drain::yes).finally([&pair] {
-                       return remove_file(pair.second.hints_dir().c_str());
+       return with_semaphore(drain_lock(), 1, [this, endpoint] {
+           return futurize_apply([this, endpoint] () {
+               if (utils::fb_utilities::is_me(endpoint)) {
+                   return parallel_for_each(_ep_managers, [] (auto& pair) {
+                       return pair.second.stop(drain::yes).finally([&pair] {
+                           return with_file_update_mutex(pair.second, [&pair] {
+                               return remove_file(pair.second.hints_dir().c_str());
+                           });
                        });
-               }).finally([this] {
-                   _ep_managers.clear();
-               });
+                   }).finally([this] {
+                       _ep_managers.clear();
+                   });
-           } else {
-               ep_managers_map_type::iterator ep_manager_it = find_ep_manager(endpoint);
-               if (ep_manager_it != ep_managers_end()) {
-                   return ep_manager_it->second.stop(drain::yes).finally([this, endpoint, hints_dir = ep_manager_it->second.hints_dir()] {
-                       _ep_managers.erase(endpoint);
-                       return remove_file(hints_dir.c_str());
-                   });
-               }
+               } else {
+                   ep_managers_map_type::iterator ep_manager_it = find_ep_manager(endpoint);
+                   if (ep_manager_it != ep_managers_end()) {
+                       return ep_manager_it->second.stop(drain::yes).finally([this, endpoint, &ep_man = ep_manager_it->second] {
+                           return with_file_update_mutex(ep_man, [&ep_man] {
+                               return remove_file(ep_man.hints_dir().c_str());
+                           }).finally([this, endpoint] {
+                               _ep_managers.erase(endpoint);
+                           });
+                       });
+                   }

-               return make_ready_future<>();
-           }
-       }).handle_exception([endpoint] (auto eptr) {
-           manager_logger.error("Exception when draining {}: {}", endpoint, eptr);
+                   return make_ready_future<>();
+               }
+           }).handle_exception([endpoint] (auto eptr) {
+               manager_logger.error("Exception when draining {}: {}", endpoint, eptr);
+           });
        });
    });
}
@@ -725,6 +739,10 @@ bool manager::end_point_hints_manager::sender::send_one_file(const sstring& fnam
        }, _last_not_complete_rp.pos, &_db.get_config().extensions()).get0();

        s->done().get();
+   } catch (db::commitlog::segment_error& ex) {
+       manager_logger.error("{}: {}. Dropping...", fname, ex.what());
+       ctx_ptr->state.remove(send_state::segment_replay_failed);
+       ++this->shard_stats().corrupted_files;
    } catch (...) {
        manager_logger.trace("sending of {} failed: {}", fname, std::current_exception());
        ctx_ptr->state.set(send_state::segment_replay_failed);

@@ -959,8 +977,6 @@ future<> manager::rebalance(sstring hints_directory) {
}

void manager::update_backlog(size_t backlog, size_t max_backlog) {
-   _backlog_size = backlog;
-   _max_backlog_size = max_backlog;
    if (backlog < max_backlog) {
        allow_hints();
    } else {

@@ -60,6 +60,7 @@ private:
    uint64_t dropped = 0;
    uint64_t sent = 0;
    uint64_t discarded = 0;
+   uint64_t corrupted_files = 0;
};

// map: shard -> segments
@@ -274,7 +275,8 @@ public:
    manager& _shard_manager;
    hints_store_ptr _hints_store_anchor;
    seastar::gate _store_gate;
-   seastar::shared_mutex _file_update_mutex;
+   lw_shared_ptr<seastar::shared_mutex> _file_update_mutex_ptr;
+   seastar::shared_mutex& _file_update_mutex;

    enum class state {
        can_hint, // hinting is currently allowed (used by the space_watchdog)
@@ -376,8 +378,20 @@ public:
|
||||
return _state.contains(state::stopped);
|
||||
}
|
||||
|
||||
seastar::shared_mutex& file_update_mutex() {
|
||||
return _file_update_mutex;
|
||||
/// \brief Safely runs a given functor under the file_update_mutex of \ref ep_man
|
||||
///
|
||||
/// Runs a given functor under the file_update_mutex of the given end_point_hints_manager instance.
|
||||
/// This function is safe even if \ref ep_man gets destroyed before the future this function returns resolves
|
||||
/// (as long as the \ref func call itself is safe).
|
||||
///
|
||||
/// \tparam Func Functor type.
|
||||
/// \param ep_man end_point_hints_manager instance which file_update_mutex we want to lock.
|
||||
/// \param func Functor to run under the lock.
|
||||
/// \return Whatever \ref func returns.
|
||||
template <typename Func>
|
||||
friend inline auto with_file_update_mutex(end_point_hints_manager& ep_man, Func&& func) {
|
||||
lw_shared_ptr<seastar::shared_mutex> lock_ptr = ep_man._file_update_mutex_ptr;
|
||||
return with_lock(*lock_ptr, std::forward<Func>(func)).finally([lock_ptr] {});
|
||||
}
|
||||
|
||||
const boost::filesystem::path& hints_dir() const noexcept {
|
||||
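The friend helper above is an instance of a common Seastar idiom: take a reference-counted copy of the lock before locking, and keep that copy captured in a trailing finally so the mutex stays alive even if its owner is destroyed while the future is still pending. A stripped-down sketch of the idiom (with_owned_lock is a hypothetical name):

    #include <seastar/core/shared_mutex.hh>
    #include <seastar/core/shared_ptr.hh>
    #include <utility>

    template <typename Func>
    auto with_owned_lock(seastar::lw_shared_ptr<seastar::shared_mutex> lock_ptr, Func&& func) {
        // The [lock_ptr] capture pins the mutex until the locked work and the unlock complete,
        // which is what makes with_file_update_mutex() safe against the manager being erased.
        return seastar::with_lock(*lock_ptr, std::forward<Func>(func)).finally([lock_ptr] {});
    }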
@@ -385,6 +399,10 @@ public:
}
private:
seastar::shared_mutex& file_update_mutex() noexcept {
return _file_update_mutex;
}
/// \brief Creates a new hints store object.
///
/// - Creates a hints store directory if doesn't exist: <shard_hints_dir>/<ep_key>

@@ -451,9 +469,7 @@ private:
stats _stats;
seastar::metrics::metric_groups _metrics;
std::unordered_set<ep_key_type> _eps_with_pending_hints;
size_t _max_backlog_size = 1;
size_t _backlog_size = 0;
seastar::semaphore _drain_lock = {1};
public:
manager(sstring hints_directory, std::vector<sstring> hinted_dcs, int64_t max_hint_window_ms, resource_manager&res_manager, distributed<database>& db);

@@ -532,18 +548,14 @@ public:
return _hints_dir_device_id;
}
seastar::semaphore& drain_lock() noexcept {
return _drain_lock;
}
void allow_hints();
void forbid_hints();
void forbid_hints_for_eps_with_pending_hints();
size_t max_backlog_size() const {
return _max_backlog_size;
}
size_t backlog_size() const {
return _backlog_size;
}
void allow_replaying() noexcept {
_state.set(state::replay_allowed);
}
@@ -90,16 +90,27 @@ future<> space_watchdog::stop() noexcept {
return std::move(_started);
}
// Called under the end_point_hints_manager::file_update_mutex() of the corresponding end_point_hints_manager instance.
future<> space_watchdog::scan_one_ep_dir(boost::filesystem::path path, manager& shard_manager, ep_key_type ep_key) {
return lister::scan_dir(path, { directory_entry_type::regular }, [this, ep_key, &shard_manager] (lister::path dir, directory_entry de) {
// Put the current end point ID to state.eps_with_pending_hints when we see the second hints file in its directory
if (_files_count == 1) {
shard_manager.add_ep_with_pending_hints(ep_key);
}
++_files_count;
return do_with(std::move(path), [this, ep_key, &shard_manager] (boost::filesystem::path& path) {
// It may happen that we get here and the directory has already been deleted in the context of manager::drain_for().
// In this case simply bail out.
return engine().file_exists(path.native()).then([this, ep_key, &shard_manager, &path] (bool exists) {
if (!exists) {
return make_ready_future<>();
} else {
return lister::scan_dir(path, { directory_entry_type::regular }, [this, ep_key, &shard_manager] (lister::path dir, directory_entry de) {
// Put the current end point ID to state.eps_with_pending_hints when we see the second hints file in its directory
if (_files_count == 1) {
shard_manager.add_ep_with_pending_hints(ep_key);
}
++_files_count;
return io_check(file_size, (dir / de.name.c_str()).c_str()).then([this] (uint64_t fsize) {
_total_size += fsize;
return io_check(file_size, (dir / de.name.c_str()).c_str()).then([this] (uint64_t fsize) {
_total_size += fsize;
});
});
}
});
});
}

@@ -137,7 +148,7 @@ void space_watchdog::on_timer() {
// continue to enumeration - there is no one to change them.
auto it = shard_manager.find_ep_manager(de.name);
if (it != shard_manager.ep_managers_end()) {
return with_lock(it->second.file_update_mutex(), [this, &shard_manager, dir = std::move(dir), ep_name = std::move(de.name)]() mutable {
return with_file_update_mutex(it->second, [this, &shard_manager, dir = std::move(dir), ep_name = std::move(de.name)] () mutable {
return scan_one_ep_dir(dir / ep_name.c_str(), shard_manager, ep_key_type(ep_name));
});
} else {
@@ -598,7 +598,7 @@ public:
future<> flush_schemas() {
return _qp.proxy().get_db().invoke_on_all([this] (database& db) {
return parallel_for_each(db::schema_tables::ALL, [this, &db](const sstring& cf_name) {
return parallel_for_each(db::schema_tables::all_table_names(), [this, &db](const sstring& cf_name) {
auto& cf = db.find_column_family(db::schema_tables::NAME, cf_name);
return cf.flush();
});

@@ -143,10 +143,10 @@ struct qualified_name {
static future<schema_mutations> read_table_mutations(distributed<service::storage_proxy>& proxy, const qualified_name& table, schema_ptr s);
static void merge_tables_and_views(distributed<service::storage_proxy>& proxy,
std::map<qualified_name, schema_mutations>&& tables_before,
std::map<qualified_name, schema_mutations>&& tables_after,
std::map<qualified_name, schema_mutations>&& views_before,
std::map<qualified_name, schema_mutations>&& views_after);
std::map<utils::UUID, schema_mutations>&& tables_before,
std::map<utils::UUID, schema_mutations>&& tables_after,
std::map<utils::UUID, schema_mutations>&& views_before,
std::map<utils::UUID, schema_mutations>&& views_after);
struct user_types_to_drop final {
seastar::noncopyable_function<void()> drop;

@@ -194,8 +194,6 @@ static void prepare_builder_from_table_row(const schema_ctxt&, schema_builder&,
using namespace v3;
std::vector<const char*> ALL { KEYSPACES, TABLES, SCYLLA_TABLES, COLUMNS, DROPPED_COLUMNS, TRIGGERS, VIEWS, TYPES, FUNCTIONS, AGGREGATES, INDEXES };
using days = std::chrono::duration<int, std::ratio<24 * 3600>>;
future<> save_system_schema(const sstring & ksname) {

@@ -203,7 +201,7 @@ future<> save_system_schema(const sstring & ksname) {
auto ksm = ks.metadata();
// delete old, possibly obsolete entries in schema tables
return parallel_for_each(ALL, [ksm] (sstring cf) {
return parallel_for_each(all_table_names(), [ksm] (sstring cf) {
auto deletion_timestamp = schema_creation_timestamp() - 1;
return db::execute_cql(sprint("DELETE FROM %s.%s USING TIMESTAMP %s WHERE keyspace_name = ?", NAME, cf,
deletion_timestamp), ksm->name()).discard_result();

@@ -598,7 +596,7 @@ future<utils::UUID> calculate_schema_digest(distributed<service::storage_proxy>&
}
};
return do_with(md5_hasher(), [map, reduce] (auto& hash) {
return do_for_each(ALL.begin(), ALL.end(), [&hash, map, reduce] (auto& table) {
return do_for_each(all_table_names(), [&hash, map, reduce] (auto& table) {
return map(table).then([&hash, reduce] (auto&& mutations) {
reduce(hash, mutations);
});

@@ -629,7 +627,7 @@ future<std::vector<frozen_mutation>> convert_schema_to_mutations(distributed<ser
std::move(mutations.begin(), mutations.end(), std::back_inserter(result));
return std::move(result);
};
return map_reduce(ALL.begin(), ALL.end(), map, std::vector<frozen_mutation>{}, reduce);
return map_reduce(all_table_names(), map, std::vector<frozen_mutation>{}, reduce);
}
future<schema_result>
@@ -703,33 +701,7 @@ read_keyspace_mutation(distributed<service::storage_proxy>& proxy, const sstring
static semaphore the_merge_lock {1};
future<> merge_lock() {
// ref: #1088
// to avoid deadlocks, we don't want long-standing calls to the shard 0
// as they can cause a deadlock:
//
// fiber1 fiber2
// merge_lock() (succeeds)
// merge_lock() (waits)
// invoke_on_all() (waits on merge_lock to relinquish smp::submit_to slot)
//
// so we issue the lock calls with a timeout; the slot will be relinquished, and invoke_on_all()
// can complete
return repeat([] () mutable {
return smp::submit_to(0, [] {
return the_merge_lock.try_wait();
}).then([] (bool result) {
if (result) {
return make_ready_future<stop_iteration>(stop_iteration::yes);
} else {
static thread_local auto rand_engine = std::default_random_engine();
auto dist = std::uniform_int_distribution<int>(0, 100);
auto to = std::chrono::microseconds(dist(rand_engine));
return sleep(to).then([] {
return make_ready_future<stop_iteration>(stop_iteration::no);
});
}
});
});
return smp::submit_to(0, [] { return the_merge_lock.wait(); });
}
future<> merge_unlock() {

@@ -777,16 +749,24 @@ static read_table_names_of_keyspace(distributed<service::storage_proxy>& proxy,
});
}
static utils::UUID table_id_from_mutations(const schema_mutations& sm) {
auto table_rs = query::result_set(sm.columnfamilies_mutation());
query::result_set_row table_row = table_rs.row(0);
return table_row.get_nonnull<utils::UUID>("id");
}
// Call inside a seastar thread
static
std::map<qualified_name, schema_mutations>
std::map<utils::UUID, schema_mutations>
read_tables_for_keyspaces(distributed<service::storage_proxy>& proxy, const std::set<sstring>& keyspace_names, schema_ptr s)
{
std::map<qualified_name, schema_mutations> result;
std::map<utils::UUID, schema_mutations> result;
for (auto&& keyspace_name : keyspace_names) {
for (auto&& table_name : read_table_names_of_keyspace(proxy, keyspace_name, s).get0()) {
auto qn = qualified_name(keyspace_name, table_name);
result.emplace(qn, read_table_mutations(proxy, qn, s).get0());
auto muts = read_table_mutations(proxy, qn, s).get0();
auto id = table_id_from_mutations(muts);
result.emplace(std::move(id), std::move(muts));
}
}
return result;
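Rekeying the before/after snapshots by the table id read out of the mutations (rather than by keyspace/table name) is what lets the merge code treat a dropped-and-recreated table as a drop plus a create instead of an in-place alter. A small, self-contained illustration of diffing two snapshots by key; the int/string types are placeholders for table ids and their mutations:

    #include <map>
    #include <set>
    #include <string>

    struct snapshot_diff {
        std::set<int> only_before, only_after, common;
    };

    // Same idea as difference(before, after) in the hunk above, keyed by table id.
    snapshot_diff diff_by_key(const std::map<int, std::string>& before,
                              const std::map<int, std::string>& after) {
        snapshot_diff d;
        for (auto& [id, _] : before) {
            (after.count(id) ? d.common : d.only_before).insert(id);
        }
        for (auto& [id, _] : after) {
            if (!before.count(id)) {
                d.only_after.insert(id);
            }
        }
        return d;
    }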
@@ -956,14 +936,14 @@ struct schema_diff {
template<typename CreateSchema>
static schema_diff diff_table_or_view(distributed<service::storage_proxy>& proxy,
std::map<qualified_name, schema_mutations>&& before,
std::map<qualified_name, schema_mutations>&& after,
std::map<utils::UUID, schema_mutations>&& before,
std::map<utils::UUID, schema_mutations>&& after,
CreateSchema&& create_schema)
{
schema_diff d;
auto diff = difference(before, after);
for (auto&& key : diff.entries_only_on_left) {
auto&& s = proxy.local().get_db().local().find_schema(key.keyspace_name, key.table_name);
auto&& s = proxy.local().get_db().local().find_schema(key);
slogger.info("Dropping {}.{} id={} version={}", s->ks_name(), s->cf_name(), s->id(), s->version());
d.dropped.emplace_back(schema_diff::dropped_schema{s});
}

@@ -986,10 +966,10 @@ static schema_diff diff_table_or_view(distributed<service::storage_proxy>& proxy
// upon an alter table or alter type statement), then they are published together
// as well, without any deferring in-between.
static void merge_tables_and_views(distributed<service::storage_proxy>& proxy,
std::map<qualified_name, schema_mutations>&& tables_before,
std::map<qualified_name, schema_mutations>&& tables_after,
std::map<qualified_name, schema_mutations>&& views_before,
std::map<qualified_name, schema_mutations>&& views_after)
std::map<utils::UUID, schema_mutations>&& tables_before,
std::map<utils::UUID, schema_mutations>&& tables_after,
std::map<utils::UUID, schema_mutations>&& views_before,
std::map<utils::UUID, schema_mutations>&& views_after)
{
auto tables_diff = diff_table_or_view(proxy, std::move(tables_before), std::move(tables_after), [&] (auto&& sm) {
return create_table_from_mutations(proxy, std::move(sm));

@@ -1000,6 +980,10 @@ static void merge_tables_and_views(distributed<service::storage_proxy>& proxy,
proxy.local().get_db().invoke_on_all([&] (database& db) {
return seastar::async([&] {
parallel_for_each(boost::range::join(tables_diff.dropped, views_diff.dropped), [&] (schema_diff::dropped_schema& dt) {
auto& s = *dt.schema.get();
return db.drop_column_family(s.ks_name(), s.cf_name(), [&] { return dt.jp.value(); });
}).get();
parallel_for_each(boost::range::join(tables_diff.created, views_diff.created), [&] (global_schema_ptr& gs) {
return db.add_column_family_and_make_directory(gs);
}).get();

@@ -1011,10 +995,6 @@ static void merge_tables_and_views(distributed<service::storage_proxy>& proxy,
for (auto&& gs : boost::range::join(tables_diff.altered, views_diff.altered)) {
columns_changed.push_back(db.update_column_family(gs));
}
parallel_for_each(boost::range::join(tables_diff.dropped, views_diff.dropped), [&] (schema_diff::dropped_schema& dt) {
auto& s = *dt.schema.get();
return db.drop_column_family(s.ks_name(), s.cf_name(), [&] { return dt.jp.value(); });
}).get();
auto& mm = service::get_local_migration_manager();
auto it = columns_changed.begin();

@@ -2681,12 +2661,22 @@ data_type parse_type(sstring str)
}
std::vector<schema_ptr> all_tables() {
// Don't forget to update this list when new schema tables are added.
// The listed schema tables are the ones synchronized between nodes,
// and forgetting one of them in this list can cause bugs like #4339.
return {
keyspaces(), tables(), scylla_tables(), columns(), dropped_columns(), triggers(),
views(), indexes(), types(), functions(), aggregates(), view_virtual_columns()
};
}
const std::vector<sstring>& all_table_names() {
static thread_local std::vector<sstring> all =
boost::copy_range<std::vector<sstring>>(all_tables() |
boost::adaptors::transformed([] (auto schema) { return schema->cf_name(); }));
return all;
}
namespace legacy {
table_schema_version schema_mutations::digest() const {

@@ -127,9 +127,8 @@ using namespace v3;
// Replication of schema between nodes with different version is inhibited.
extern const sstring version;
extern std::vector<const char*> ALL;
std::vector<schema_ptr> all_tables();
const std::vector<sstring>& all_table_names();
// saves/creates "ks" + all tables etc, while first deleting all old schema entries (will be rewritten)
future<> save_system_schema(const sstring & ks);

329
db/size_estimates_virtual_reader.cc
Normal file

@@ -0,0 +1,329 @@
/*
* Copyright (C) 2019 ScyllaDB
*
* Modified by ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#include <boost/range/adaptor/indirected.hpp>
#include <boost/range/adaptor/map.hpp>
#include <boost/range/adaptor/transformed.hpp>
#include <boost/range/algorithm/find_if.hpp>
#include "clustering_bounds_comparator.hh"
#include "database.hh"
#include "db/system_keyspace.hh"
#include "dht/i_partitioner.hh"
#include "partition_range_compat.hh"
#include "range.hh"
#include "service/storage_service.hh"
#include "stdx.hh"
#include "mutation_fragment.hh"
#include "sstables/sstables.hh"
#include "db/timeout_clock.hh"
#include "database.hh"
#include "db/size_estimates_virtual_reader.hh"
namespace db {
namespace size_estimates {
struct virtual_row {
const bytes& cf_name;
const token_range& tokens;
clustering_key_prefix as_key() const {
return clustering_key_prefix::from_exploded(std::vector<bytes_view>{cf_name, tokens.start, tokens.end});
}
};
struct virtual_row_comparator {
schema_ptr _schema;
virtual_row_comparator(schema_ptr schema) : _schema(schema) { }
bool operator()(const clustering_key_prefix& key1, const clustering_key_prefix& key2) {
return clustering_key_prefix::prefix_equality_less_compare(*_schema)(key1, key2);
}
bool operator()(const virtual_row& row, const clustering_key_prefix& key) {
return operator()(row.as_key(), key);
}
bool operator()(const clustering_key_prefix& key, const virtual_row& row) {
return operator()(key, row.as_key());
}
};
// Iterating over the cartesian product of cf_names and token_ranges.
class virtual_row_iterator : public std::iterator<std::input_iterator_tag, const virtual_row> {
std::reference_wrapper<const std::vector<bytes>> _cf_names;
std::reference_wrapper<const std::vector<token_range>> _ranges;
size_t _cf_names_idx = 0;
size_t _ranges_idx = 0;
public:
struct end_iterator_tag {};
virtual_row_iterator(const std::vector<bytes>& cf_names, const std::vector<token_range>& ranges)
: _cf_names(std::ref(cf_names))
, _ranges(std::ref(ranges))
{ }
virtual_row_iterator(const std::vector<bytes>& cf_names, const std::vector<token_range>& ranges, end_iterator_tag)
: _cf_names(std::ref(cf_names))
, _ranges(std::ref(ranges))
, _cf_names_idx(cf_names.size())
, _ranges_idx(ranges.size())
{
if (cf_names.empty() || ranges.empty()) {
// The product of an empty range with any range is an empty range.
// In this case we want the end iterator to be equal to the begin iterator,
// which has_ranges_idx = _cf_names_idx = 0.
_ranges_idx = _cf_names_idx = 0;
}
}
virtual_row_iterator& operator++() {
if (++_ranges_idx == _ranges.get().size() && ++_cf_names_idx < _cf_names.get().size()) {
_ranges_idx = 0;
}
return *this;
}
virtual_row_iterator operator++(int) {
virtual_row_iterator i(*this);
++(*this);
return i;
}
const value_type operator*() const {
return { _cf_names.get()[_cf_names_idx], _ranges.get()[_ranges_idx] };
}
bool operator==(const virtual_row_iterator& i) const {
return _cf_names_idx == i._cf_names_idx
&& _ranges_idx == i._ranges_idx;
}
bool operator!=(const virtual_row_iterator& i) const {
return !(*this == i);
}
};
/**
* Returns the keyspaces, ordered by name, as selected by the partition_range.
*/
static std::vector<sstring> get_keyspaces(const schema& s, const database& db, dht::partition_range range) {
struct keyspace_less_comparator {
const schema& _s;
keyspace_less_comparator(const schema& s) : _s(s) { }
dht::ring_position as_ring_position(const sstring& ks) {
auto pkey = partition_key::from_single_value(_s, utf8_type->decompose(ks));
return dht::global_partitioner().decorate_key(_s, std::move(pkey));
}
bool operator()(const sstring& ks1, const sstring& ks2) {
return as_ring_position(ks1).less_compare(_s, as_ring_position(ks2));
}
bool operator()(const sstring& ks, const dht::ring_position& rp) {
return as_ring_position(ks).less_compare(_s, rp);
}
bool operator()(const dht::ring_position& rp, const sstring& ks) {
return rp.less_compare(_s, as_ring_position(ks));
}
};
auto keyspaces = db.get_non_system_keyspaces();
auto cmp = keyspace_less_comparator(s);
boost::sort(keyspaces, cmp);
return boost::copy_range<std::vector<sstring>>(
range.slice(keyspaces, std::move(cmp)) | boost::adaptors::filtered([&s] (const auto& ks) {
// If this is a range query, results are divided between shards by the partition key (keyspace_name).
return shard_of(dht::global_partitioner().get_token(s,
partition_key::from_single_value(s, utf8_type->decompose(ks))))
== engine().cpu_id();
})
);
}
/**
* Makes a wrapping range of ring_position from a nonwrapping range of token, used to select sstables.
*/
static dht::partition_range as_ring_position_range(dht::token_range& r) {
stdx::optional<range<dht::ring_position>::bound> start_bound, end_bound;
if (r.start()) {
start_bound = {{ dht::ring_position(r.start()->value(), dht::ring_position::token_bound::start), r.start()->is_inclusive() }};
}
if (r.end()) {
end_bound = {{ dht::ring_position(r.end()->value(), dht::ring_position::token_bound::end), r.end()->is_inclusive() }};
}
return dht::partition_range(std::move(start_bound), std::move(end_bound), r.is_singular());
}
/**
* Add a new range_estimates for the specified range, considering the sstables associated with `cf`.
*/
static system_keyspace::range_estimates estimate(const column_family& cf, const token_range& r) {
int64_t count{0};
utils::estimated_histogram hist{0};
auto from_bytes = [] (auto& b) {
return dht::global_partitioner().from_sstring(utf8_type->to_string(b));
};
dht::token_range_vector ranges;
::compat::unwrap_into(
wrapping_range<dht::token>({{ from_bytes(r.start), false }}, {{ from_bytes(r.end) }}),
dht::token_comparator(),
[&] (auto&& rng) { ranges.push_back(std::move(rng)); });
for (auto&& r : ranges) {
auto rp_range = as_ring_position_range(r);
for (auto&& sstable : cf.select_sstables(rp_range)) {
count += sstable->estimated_keys_for_range(r);
hist.merge(sstable->get_stats_metadata().estimated_row_size);
}
}
return {cf.schema(), r.start, r.end, count, count > 0 ? hist.mean() : 0};
}
future<std::vector<token_range>> get_local_ranges() {
auto& ss = service::get_local_storage_service();
return ss.get_local_tokens().then([&ss] (auto&& tokens) {
auto ranges = ss.get_token_metadata().get_primary_ranges_for(std::move(tokens));
std::vector<token_range> local_ranges;
auto to_bytes = [](const stdx::optional<dht::token_range::bound>& b) {
assert(b);
return utf8_type->decompose(dht::global_partitioner().to_sstring(b->value()));
};
// We merge the ranges to be compatible with how Cassandra shows it's size estimates table.
// All queries will be on that table, where all entries are text and there's no notion of
// token ranges form the CQL point of view.
auto left_inf = boost::find_if(ranges, [] (auto&& r) {
return !r.start() || r.start()->value() == dht::minimum_token();
});
auto right_inf = boost::find_if(ranges, [] (auto&& r) {
return !r.end() || r.start()->value() == dht::maximum_token();
});
if (left_inf != right_inf && left_inf != ranges.end() && right_inf != ranges.end()) {
local_ranges.push_back(token_range{to_bytes(right_inf->start()), to_bytes(left_inf->end())});
ranges.erase(left_inf);
ranges.erase(right_inf);
}
for (auto&& r : ranges) {
local_ranges.push_back(token_range{to_bytes(r.start()), to_bytes(r.end())});
}
boost::sort(local_ranges, [] (auto&& tr1, auto&& tr2) {
return utf8_type->less(tr1.start, tr2.start);
});
return local_ranges;
});
}
size_estimates_mutation_reader::size_estimates_mutation_reader(schema_ptr schema, const dht::partition_range& prange, const query::partition_slice& slice, streamed_mutation::forwarding fwd)
: impl(schema)
, _schema(std::move(schema))
, _prange(&prange)
, _slice(slice)
, _fwd(fwd)
{ }
future<> size_estimates_mutation_reader::get_next_partition() {
auto& db = service::get_local_storage_proxy().get_db().local();
if (!_keyspaces) {
_keyspaces = get_keyspaces(*_schema, db, *_prange);
_current_partition = _keyspaces->begin();
}
if (_current_partition == _keyspaces->end()) {
_end_of_stream = true;
return make_ready_future<>();
}
return get_local_ranges().then([&db, this] (auto&& ranges) {
auto estimates = this->estimates_for_current_keyspace(db, std::move(ranges));
auto mutations = db::system_keyspace::make_size_estimates_mutation(*_current_partition, std::move(estimates));
++_current_partition;
std::vector<mutation> ms;
ms.emplace_back(std::move(mutations));
_partition_reader = flat_mutation_reader_from_mutations(std::move(ms), _fwd);
});
}
future<> size_estimates_mutation_reader::fill_buffer(db::timeout_clock::time_point timeout) {
return do_until([this, timeout] { return is_end_of_stream() || is_buffer_full(); }, [this, timeout] {
if (!_partition_reader) {
return get_next_partition();
}
return _partition_reader->consume_pausable([this] (mutation_fragment mf) {
push_mutation_fragment(std::move(mf));
return stop_iteration(is_buffer_full());
}, timeout).then([this] {
if (_partition_reader->is_end_of_stream() && _partition_reader->is_buffer_empty()) {
_partition_reader = stdx::nullopt;
}
});
});
}
void size_estimates_mutation_reader::next_partition() {
clear_buffer_to_next_partition();
if (is_buffer_empty()) {
_partition_reader = stdx::nullopt;
}
}
future<> size_estimates_mutation_reader::fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) {
clear_buffer();
_prange = &pr;
_keyspaces = stdx::nullopt;
_partition_reader = stdx::nullopt;
_end_of_stream = false;
return make_ready_future<>();
}
future<> size_estimates_mutation_reader::fast_forward_to(position_range pr, db::timeout_clock::time_point timeout) {
forward_buffer_to(pr.start());
_end_of_stream = false;
if (_partition_reader) {
return _partition_reader->fast_forward_to(std::move(pr), timeout);
}
return make_ready_future<>();
}
size_t size_estimates_mutation_reader::buffer_size() const {
if (_partition_reader) {
return flat_mutation_reader::impl::buffer_size() + _partition_reader->buffer_size();
}
return flat_mutation_reader::impl::buffer_size();
}
std::vector<db::system_keyspace::range_estimates>
size_estimates_mutation_reader::estimates_for_current_keyspace(const database& db, std::vector<token_range> local_ranges) const {
// For each specified range, estimate (crudely) mean partition size and partitions count.
auto pkey = partition_key::from_single_value(*_schema, utf8_type->decompose(*_current_partition));
auto cfs = db.find_keyspace(*_current_partition).metadata()->cf_meta_data();
auto cf_names = boost::copy_range<std::vector<bytes>>(cfs | boost::adaptors::transformed([] (auto&& cf) {
return utf8_type->decompose(cf.first);
}));
boost::sort(cf_names, [] (auto&& n1, auto&& n2) {
return utf8_type->less(n1, n2);
});
std::vector<db::system_keyspace::range_estimates> estimates;
for (auto& range : _slice.row_ranges(*_schema, pkey)) {
auto rows = boost::make_iterator_range(
virtual_row_iterator(cf_names, local_ranges),
virtual_row_iterator(cf_names, local_ranges, virtual_row_iterator::end_iterator_tag()));
auto rows_to_estimate = range.slice(rows, virtual_row_comparator(_schema));
for (auto&& r : rows_to_estimate) {
auto& cf = db.find_column_family(*_current_partition, utf8_type->to_string(r.cf_name));
estimates.push_back(estimate(cf, r.tokens));
if (estimates.size() >= _slice.partition_row_limit()) {
return estimates;
}
}
}
return estimates;
}
} // namespace size_estimates
} // namespace db
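virtual_row_iterator above enumerates the cartesian product of table names and local token ranges in name-major order, which is the clustering order of the size_estimates rows. A tiny self-contained sketch of the same traversal with plain containers (the names and ranges are made up):

    #include <iostream>
    #include <string>
    #include <utility>
    #include <vector>

    int main() {
        std::vector<std::string> cf_names = {"t1", "t2"};
        std::vector<std::pair<int, int>> ranges = {{0, 10}, {10, 20}};
        // Same visiting order as virtual_row_iterator: all ranges of t1, then all ranges of t2.
        for (size_t n = 0; n < cf_names.size(); ++n) {
            for (size_t r = 0; r < ranges.size(); ++r) {
                std::cout << cf_names[n] << " [" << ranges[r].first << ", " << ranges[r].second << ")\n";
            }
        }
    }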
@@ -21,33 +21,19 @@
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#include <boost/range/adaptor/indirected.hpp>
#include <boost/range/adaptor/map.hpp>
#include <boost/range/adaptor/transformed.hpp>
#include <boost/range/algorithm/find_if.hpp>
#include "clustering_bounds_comparator.hh"
#include "database.hh"
#include "db/system_keyspace.hh"
#include "dht/i_partitioner.hh"
#include "mutation_reader.hh"
#include "partition_range_compat.hh"
#include "range.hh"
#include "service/storage_service.hh"
#include "stdx.hh"
#include "mutation_fragment.hh"
#include "sstables/sstables.hh"
#include "db/timeout_clock.hh"
namespace db {
namespace size_estimates {
struct token_range {
bytes start;
bytes end;
};
class size_estimates_mutation_reader final : public flat_mutation_reader::impl {
struct token_range {
bytes start;
bytes end;
};
schema_ptr _schema;
const dht::partition_range* _prange;
const query::partition_slice& _slice;

@@ -57,267 +43,18 @@ class size_estimates_mutation_reader final : public flat_mutation_reader::impl {
streamed_mutation::forwarding _fwd;
flat_mutation_reader_opt _partition_reader;
public:
size_estimates_mutation_reader(schema_ptr schema, const dht::partition_range& prange, const query::partition_slice& slice, streamed_mutation::forwarding fwd)
: impl(schema)
, _schema(std::move(schema))
, _prange(&prange)
, _slice(slice)
, _fwd(fwd)
{ }
size_estimates_mutation_reader(schema_ptr, const dht::partition_range&, const query::partition_slice&, streamed_mutation::forwarding);
virtual future<> fill_buffer(db::timeout_clock::time_point) override;
virtual void next_partition() override;
virtual future<> fast_forward_to(const dht::partition_range&, db::timeout_clock::time_point) override;
virtual future<> fast_forward_to(position_range, db::timeout_clock::time_point) override;
virtual size_t buffer_size() const override;
private:
future<> get_next_partition() {
// For each specified range, estimate (crudely) mean partition size and partitions count.
auto& db = service::get_local_storage_proxy().get_db().local();
if (!_keyspaces) {
_keyspaces = get_keyspaces(*_schema, db, *_prange);
_current_partition = _keyspaces->begin();
}
if (_current_partition == _keyspaces->end()) {
_end_of_stream = true;
return make_ready_future<>();
}
return get_local_ranges().then([&db, this] (auto&& ranges) {
auto estimates = this->estimates_for_current_keyspace(db, std::move(ranges));
auto mutations = db::system_keyspace::make_size_estimates_mutation(*_current_partition, std::move(estimates));
++_current_partition;
std::vector<mutation> ms;
ms.emplace_back(std::move(mutations));
_partition_reader = flat_mutation_reader_from_mutations(std::move(ms), _fwd);
});
}
public:
virtual future<> fill_buffer(db::timeout_clock::time_point timeout) override {
return do_until([this, timeout] { return is_end_of_stream() || is_buffer_full(); }, [this, timeout] {
if (!_partition_reader) {
return get_next_partition();
}
return _partition_reader->consume_pausable([this] (mutation_fragment mf) {
push_mutation_fragment(std::move(mf));
return stop_iteration(is_buffer_full());
}, timeout).then([this] {
if (_partition_reader->is_end_of_stream() && _partition_reader->is_buffer_empty()) {
_partition_reader = stdx::nullopt;
}
});
});
}
virtual void next_partition() override {
clear_buffer_to_next_partition();
if (is_buffer_empty()) {
_partition_reader = stdx::nullopt;
}
}
virtual future<> fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) override {
clear_buffer();
_prange = &pr;
_keyspaces = stdx::nullopt;
_partition_reader = stdx::nullopt;
_end_of_stream = false;
return make_ready_future<>();
}
virtual future<> fast_forward_to(position_range pr, db::timeout_clock::time_point timeout) override {
forward_buffer_to(pr.start());
_end_of_stream = false;
if (_partition_reader) {
return _partition_reader->fast_forward_to(std::move(pr), timeout);
}
return make_ready_future<>();
}
virtual size_t buffer_size() const override {
if (_partition_reader) {
return flat_mutation_reader::impl::buffer_size() + _partition_reader->buffer_size();
}
return flat_mutation_reader::impl::buffer_size();
}
/**
* Returns the primary ranges for the local node.
* Used for testing as well.
*/
static future<std::vector<token_range>> get_local_ranges() {
auto& ss = service::get_local_storage_service();
return ss.get_local_tokens().then([&ss] (auto&& tokens) {
auto ranges = ss.get_token_metadata().get_primary_ranges_for(std::move(tokens));
std::vector<token_range> local_ranges;
auto to_bytes = [](const stdx::optional<dht::token_range::bound>& b) {
assert(b);
return utf8_type->decompose(dht::global_partitioner().to_sstring(b->value()));
};
// We merge the ranges to be compatible with how Cassandra shows it's size estimates table.
// All queries will be on that table, where all entries are text and there's no notion of
// token ranges form the CQL point of view.
auto left_inf = boost::find_if(ranges, [] (auto&& r) {
return !r.start() || r.start()->value() == dht::minimum_token();
});
auto right_inf = boost::find_if(ranges, [] (auto&& r) {
return !r.end() || r.start()->value() == dht::maximum_token();
});
if (left_inf != right_inf && left_inf != ranges.end() && right_inf != ranges.end()) {
local_ranges.push_back(token_range{to_bytes(right_inf->start()), to_bytes(left_inf->end())});
ranges.erase(left_inf);
ranges.erase(right_inf);
}
for (auto&& r : ranges) {
local_ranges.push_back(token_range{to_bytes(r.start()), to_bytes(r.end())});
}
boost::sort(local_ranges, [] (auto&& tr1, auto&& tr2) {
return utf8_type->less(tr1.start, tr2.start);
});
return local_ranges;
});
}
private:
struct virtual_row {
const bytes& cf_name;
const token_range& tokens;
clustering_key_prefix as_key() const {
return clustering_key_prefix::from_exploded(std::vector<bytes_view>{cf_name, tokens.start, tokens.end});
}
};
struct virtual_row_comparator {
schema_ptr _schema;
virtual_row_comparator(schema_ptr schema) : _schema(schema) { }
bool operator()(const clustering_key_prefix& key1, const clustering_key_prefix& key2) {
return clustering_key_prefix::prefix_equality_less_compare(*_schema)(key1, key2);
}
bool operator()(const virtual_row& row, const clustering_key_prefix& key) {
return operator()(row.as_key(), key);
}
bool operator()(const clustering_key_prefix& key, const virtual_row& row) {
return operator()(key, row.as_key());
}
};
class virtual_row_iterator : public std::iterator<std::input_iterator_tag, const virtual_row> {
std::reference_wrapper<const std::vector<bytes>> _cf_names;
std::reference_wrapper<const std::vector<token_range>> _ranges;
size_t _cf_names_idx = 0;
size_t _ranges_idx = 0;
public:
struct end_iterator_tag {};
virtual_row_iterator(const std::vector<bytes>& cf_names, const std::vector<token_range>& ranges)
: _cf_names(std::ref(cf_names))
, _ranges(std::ref(ranges))
{ }
virtual_row_iterator(const std::vector<bytes>& cf_names, const std::vector<token_range>& ranges, end_iterator_tag)
: _cf_names(std::ref(cf_names))
, _ranges(std::ref(ranges))
, _cf_names_idx(cf_names.size())
, _ranges_idx(ranges.size())
{ }
virtual_row_iterator& operator++() {
if (++_ranges_idx == _ranges.get().size() && ++_cf_names_idx < _cf_names.get().size()) {
_ranges_idx = 0;
}
return *this;
}
virtual_row_iterator operator++(int) {
virtual_row_iterator i(*this);
++(*this);
return i;
}
const value_type operator*() const {
return { _cf_names.get()[_cf_names_idx], _ranges.get()[_ranges_idx] };
}
bool operator==(const virtual_row_iterator& i) const {
return _cf_names_idx == i._cf_names_idx
&& _ranges_idx == i._ranges_idx;
}
bool operator!=(const virtual_row_iterator& i) const {
return !(*this == i);
}
};
future<> get_next_partition();
std::vector<db::system_keyspace::range_estimates>
estimates_for_current_keyspace(const database& db, std::vector<token_range> local_ranges) const {
auto pkey = partition_key::from_single_value(*_schema, utf8_type->decompose(*_current_partition));
auto cfs = db.find_keyspace(*_current_partition).metadata()->cf_meta_data();
auto cf_names = boost::copy_range<std::vector<bytes>>(cfs | boost::adaptors::transformed([] (auto&& cf) {
return utf8_type->decompose(cf.first);
}));
boost::sort(cf_names, [] (auto&& n1, auto&& n2) {
return utf8_type->less(n1, n2);
});
std::vector<db::system_keyspace::range_estimates> estimates;
for (auto& range : _slice.row_ranges(*_schema, pkey)) {
auto rows = boost::make_iterator_range(
virtual_row_iterator(cf_names, local_ranges),
virtual_row_iterator(cf_names, local_ranges, virtual_row_iterator::end_iterator_tag()));
auto rows_to_estimate = range.slice(rows, virtual_row_comparator(_schema));
for (auto&& r : rows_to_estimate) {
auto& cf = db.find_column_family(*_current_partition, utf8_type->to_string(r.cf_name));
estimates.push_back(estimate(cf, r.tokens));
if (estimates.size() >= _slice.partition_row_limit()) {
return estimates;
}
}
}
return estimates;
}
/**
* Returns the keyspaces, ordered by name, as selected by the partition_range.
*/
static ks_range get_keyspaces(const schema& s, const database& db, dht::partition_range range) {
struct keyspace_less_comparator {
const schema& _s;
keyspace_less_comparator(const schema& s) : _s(s) { }
dht::ring_position as_ring_position(const sstring& ks) {
auto pkey = partition_key::from_single_value(_s, utf8_type->decompose(ks));
return dht::global_partitioner().decorate_key(_s, std::move(pkey));
}
bool operator()(const sstring& ks1, const sstring& ks2) {
return as_ring_position(ks1).less_compare(_s, as_ring_position(ks2));
}
bool operator()(const sstring& ks, const dht::ring_position& rp) {
return as_ring_position(ks).less_compare(_s, rp);
}
bool operator()(const dht::ring_position& rp, const sstring& ks) {
return rp.less_compare(_s, as_ring_position(ks));
}
};
auto keyspaces = db.get_non_system_keyspaces();
auto cmp = keyspace_less_comparator(s);
boost::sort(keyspaces, cmp);
return boost::copy_range<ks_range>(range.slice(keyspaces, std::move(cmp)));
}
/**
* Makes a wrapping range of ring_position from a nonwrapping range of token, used to select sstables.
*/
static dht::partition_range as_ring_position_range(dht::token_range& r) {
stdx::optional<range<dht::ring_position>::bound> start_bound, end_bound;
if (r.start()) {
start_bound = {{ dht::ring_position(r.start()->value(), dht::ring_position::token_bound::start), r.start()->is_inclusive() }};
}
if (r.end()) {
end_bound = {{ dht::ring_position(r.end()->value(), dht::ring_position::token_bound::end), r.end()->is_inclusive() }};
}
return dht::partition_range(std::move(start_bound), std::move(end_bound), r.is_singular());
}
/**
* Add a new range_estimates for the specified range, considering the sstables associated with `cf`.
*/
static system_keyspace::range_estimates estimate(const column_family& cf, const token_range& r) {
int64_t count{0};
utils::estimated_histogram hist{0};
auto from_bytes = [] (auto& b) {
return dht::global_partitioner().from_sstring(utf8_type->to_string(b));
};
dht::token_range_vector ranges;
::compat::unwrap_into(
wrapping_range<dht::token>({{ from_bytes(r.start), false }}, {{ from_bytes(r.end) }}),
dht::token_comparator(),
[&] (auto&& rng) { ranges.push_back(std::move(rng)); });
for (auto&& r : ranges) {
auto rp_range = as_ring_position_range(r);
for (auto&& sstable : cf.select_sstables(rp_range)) {
count += sstable->estimated_keys_for_range(r);
hist.merge(sstable->get_stats_metadata().estimated_row_size);
}
}
return {cf.schema(), r.start, r.end, count, count > 0 ? hist.mean() : 0};
}
estimates_for_current_keyspace(const database&, std::vector<token_range> local_ranges) const;
};
struct virtual_reader {

@@ -332,6 +69,12 @@ struct virtual_reader {
}
};
/**
* Returns the primary ranges for the local node.
* Used for testing as well.
*/
future<std::vector<token_range>> get_local_ranges();
} // namespace size_estimates
} // namespace db

@@ -445,7 +445,7 @@ void create_virtual_column(schema_builder& builder, const bytes& name, const dat
// A map has keys and values. We don't need these values,
// and can use empty values instead.
auto mtype = dynamic_pointer_cast<const map_type_impl>(type);
builder.with_column(name, map_type_impl::get_instance(mtype->get_values_type(), empty_type, true), column_kind::regular_column, column_view_virtual::yes);
builder.with_column(name, map_type_impl::get_instance(mtype->get_keys_type(), empty_type, true), column_kind::regular_column, column_view_virtual::yes);
} else if (ctype->is_set()) {
// A set's cell has nothing beyond the keys, so the
// virtual version of a set is, unfortunately, a complete

@@ -781,6 +781,7 @@ future<stop_iteration> view_update_builder::on_results() {
// If we have updates and it's a range tombstone, it removes nothing pre-exisiting, so we can ignore it
if (_update && !_update->is_end_of_partition()) {
if (_update->is_clustering_row()) {
apply_tracked_tombstones(_update_tombstone_tracker, _update->as_mutable_clustering_row());
generate_update(std::move(*_update).as_clustering_row(), { });
}
return advance_updates();

@@ -1464,7 +1465,16 @@ private:
built_views _built_views;
std::vector<view_ptr> _views_to_build;
std::deque<mutation_fragment> _fragments;
// The compact_for_query<> that feeds this consumer is already configured
// to feed us up to view_builder::batchsize (128) rows and not an entire
// partition. Still, if rows contain large blobs, saving 128 of them in
// _fragments may be too much. So we want to track _fragment's memory
// usage, and flush the _fragments if it has grown too large.
// Additionally, limiting _fragment's size also solves issue #4213:
// A single view mutation can be as large as the size of the base rows
// used to build it, and we cannot allow its serialized size to grow
// beyond our limit on mutation size (by default 32 MB).
size_t _fragments_memory_usage = 0;
public:
consumer(view_builder& builder, build_step& step)
: _builder(builder)

@@ -1527,7 +1537,15 @@ public:
return stop_iteration::yes;
}
_fragments_memory_usage += cr.memory_usage(*_step.base->schema());
_fragments.push_back(std::move(cr));
if (_fragments_memory_usage > 1024*1024) {
// Although we have not yet completed the batch of base rows that
// compact_for_query<> planned for us (view_builder::batchsize),
// we've still collected enough rows to reach sizeable memory use,
// so let's flush these rows now.
flush_fragments();
}
return stop_iteration::no;
}

@@ -1535,7 +1553,7 @@ public:
return stop_iteration::no;
}
stop_iteration consume_end_of_partition() {
void flush_fragments() {
_builder._as.check();
if (!_fragments.empty()) {
_fragments.push_front(partition_start(_step.current_key, tombstone()));

@@ -1544,7 +1562,12 @@ public:
_step.current_token(),
make_flat_mutation_reader_from_fragments(_step.base->schema(), std::move(_fragments))).get();
_fragments.clear();
_fragments_memory_usage = 0;
}
}
stop_iteration consume_end_of_partition() {
flush_fragments();
return stop_iteration(_step.build_status.empty());
}
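The consumer changes above track the memory held in _fragments and flush as soon as the buffered rows exceed roughly 1 MB, instead of waiting for consume_end_of_partition(). A stripped-down sketch of that accumulate-then-flush pattern; the threshold matches the hunk, while the row type and flush body are placeholders:

    #include <cstddef>
    #include <deque>
    #include <string>

    class bounded_batcher {
        std::deque<std::string> _rows;
        size_t _memory = 0;
        static constexpr size_t flush_threshold = 1024 * 1024; // ~1 MB, as in the hunk above
    public:
        void add(std::string row) {
            _memory += row.size();
            _rows.push_back(std::move(row));
            if (_memory > flush_threshold) {
                flush(); // don't let large rows pile up before the partition ends
            }
        }
        void flush() {
            // placeholder: the real code builds view updates from the buffered fragments
            _rows.clear();
            _memory = 0;
        }
    };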
@@ -24,7 +24,9 @@
namespace db::view {
future<> view_update_from_staging_generator::start() {
_started = seastar::async([this]() mutable {
thread_attributes attr;
attr.sched_group = _db.get_streaming_scheduling_group();
_started = seastar::async(std::move(attr), [this]() mutable {
while (!_as.abort_requested()) {
if (_sstables_with_tables.empty()) {
_pending_sstables.wait().get();
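The hunk above moves the generator's seastar::async thread into the streaming scheduling group, so replaying staging sstables is accounted as streaming work rather than default-priority work. A minimal sketch of attaching a scheduling group to an async thread (the group argument and work body are placeholders):

    #include <seastar/core/scheduling.hh>
    #include <seastar/core/thread.hh>

    seastar::future<> run_in_group(seastar::scheduling_group sg) {
        seastar::thread_attributes attr;
        attr.sched_group = sg;                      // as in the hunk above
        return seastar::async(std::move(attr), [] {
            // long-running, preemptible work goes here
        });
    }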
@@ -51,20 +51,22 @@ future<> boot_strapper::bootstrap() {
auto streamer = make_lw_shared<range_streamer>(_db, _token_metadata, _tokens, _address, "Bootstrap", streaming::stream_reason::bootstrap);
streamer->add_source_filter(std::make_unique<range_streamer::failure_detector_source_filter>(gms::get_local_failure_detector()));
for (const auto& keyspace_name : _db.local().get_non_system_keyspaces()) {
auto keyspaces = make_lw_shared<std::vector<sstring>>(_db.local().get_non_system_keyspaces());
return do_for_each(*keyspaces, [this, keyspaces, streamer] (sstring& keyspace_name) {
auto& ks = _db.local().find_keyspace(keyspace_name);
auto& strategy = ks.get_replication_strategy();
dht::token_range_vector ranges = strategy.get_pending_address_ranges(_token_metadata, _tokens, _address);
blogger.debug("Will stream keyspace={}, ranges={}", keyspace_name, ranges);
streamer->add_ranges(keyspace_name, ranges);
}
return streamer->stream_async().then([streamer] () {
service::get_local_storage_service().finish_bootstrapping();
}).handle_exception([streamer] (std::exception_ptr eptr) {
blogger.warn("Error during bootstrap: {}", eptr);
return make_exception_future<>(std::move(eptr));
return streamer->add_ranges(keyspace_name, ranges);
}).then([this, streamer] {
return streamer->stream_async().then([streamer] () {
service::get_local_storage_service().finish_bootstrapping();
}).handle_exception([streamer] (std::exception_ptr eptr) {
blogger.warn("Error during bootstrap: {}", eptr);
return make_exception_future<>(std::move(eptr));
});
});
}
std::unordered_set<token> boot_strapper::get_bootstrap_tokens(token_metadata metadata, database& db) {
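bootstrap() now iterates the keyspaces with do_for_each over a make_lw_shared copy of the list, so the vector outlives the asynchronous add_ranges() calls and the loop yields between keyspaces. A small sketch of that keep-alive pattern (process_one is a hypothetical per-keyspace step):

    #include <seastar/core/future.hh>
    #include <seastar/core/future-util.hh>
    #include <seastar/core/shared_ptr.hh>
    #include <seastar/core/sstring.hh>
    #include <vector>

    static seastar::future<> process_one(const seastar::sstring&) {
        return seastar::make_ready_future<>(); // placeholder for an async per-keyspace step
    }

    seastar::future<> process_all(std::vector<seastar::sstring> names) {
        auto held = seastar::make_lw_shared<std::vector<seastar::sstring>>(std::move(names));
        return seastar::do_for_each(*held, [] (const seastar::sstring& name) {
            return process_one(name);
        }).finally([held] {}); // the capture keeps the vector alive until the loop finishes
    }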
@@ -114,6 +114,9 @@ range_streamer::get_all_ranges_with_sources_for(const sstring& keyspace_name, dh
for (auto& desired_range : desired_ranges) {
auto found = false;
for (auto& x : range_addresses) {
if (need_preempt()) {
seastar::thread::yield();
}
const range<token>& src_range = x.first;
if (src_range.contains(desired_range, dht::tri_compare)) {
std::vector<inet_address>& addresses = x.second;

@@ -157,6 +160,9 @@ range_streamer::get_all_ranges_with_strict_sources_for(const sstring& keyspace_n
for (auto& desired_range : desired_ranges) {
for (auto& x : range_addresses) {
const range<token>& src_range = x.first;
if (need_preempt()) {
seastar::thread::yield();
}
if (src_range.contains(desired_range, dht::tri_compare)) {
std::vector<inet_address> old_endpoints(x.second.begin(), x.second.end());
auto it = pending_range_addresses.find(desired_range);

@@ -226,7 +232,8 @@ void range_streamer::add_rx_ranges(const sstring& keyspace_name, std::unordered_
}
// TODO: This is the legacy range_streamer interface, it is add_rx_ranges which adds rx ranges.
void range_streamer::add_ranges(const sstring& keyspace_name, dht::token_range_vector ranges) {
future<> range_streamer::add_ranges(const sstring& keyspace_name, dht::token_range_vector ranges) {
return seastar::async([this, keyspace_name, ranges= std::move(ranges)] () mutable {
if (_nr_tx_added) {
throw std::runtime_error("Mixed sending and receiving is not supported");
}

@@ -249,6 +256,7 @@ void range_streamer::add_ranges(const sstring& keyspace_name, dht::token_range_v
}
}
_to_stream.emplace(keyspace_name, std::move(range_fetch_map));
});
}
future<> range_streamer::stream_async() {
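add_ranges() above changes from a synchronous void function into a future<> by running its body on a seastar thread, and the source-lookup loops gain need_preempt()/thread::yield() checks so long scans do not monopolize the reactor. A minimal sketch of that shape (the loop body is a placeholder and the include paths are assumptions):

    #include <seastar/core/future.hh>
    #include <seastar/core/preempt.hh>
    #include <seastar/core/thread.hh>

    seastar::future<> long_scan(int iterations) {
        return seastar::async([iterations] {
            for (int i = 0; i < iterations; ++i) {
                if (seastar::need_preempt()) {
                    seastar::thread::yield(); // same yield points the range_streamer hunks add
                }
                // ... examine one candidate range (placeholder) ...
            }
        });
    }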
@@ -120,7 +120,7 @@ public:
|
||||
_source_filters.emplace(std::move(filter));
|
||||
}
|
||||
|
||||
void add_ranges(const sstring& keyspace_name, dht::token_range_vector ranges);
|
||||
future<> add_ranges(const sstring& keyspace_name, dht::token_range_vector ranges);
|
||||
void add_tx_ranges(const sstring& keyspace_name, std::unordered_map<inet_address, dht::token_range_vector> ranges_per_endpoint);
|
||||
void add_rx_ranges(const sstring& keyspace_name, std::unordered_map<inet_address, dht::token_range_vector> ranges_per_endpoint);
|
||||
private:
|
||||
|
||||
2
dist/ami/scylla.json
vendored
2
dist/ami/scylla.json
vendored
@@ -68,7 +68,7 @@
|
||||
"type": "shell",
|
||||
"inline": [
|
||||
"sudo yum install -y epel-release",
|
||||
"sudo yum install -y python34",
|
||||
"sudo yum install -y python36",
|
||||
"sudo /home/{{user `ssh_username`}}/scylla_install_ami {{ user `install_args` }}"
|
||||
]
|
||||
}
|
||||
|
||||
2
dist/common/scripts/scylla_util.py
vendored
2
dist/common/scripts/scylla_util.py
vendored
@@ -449,6 +449,8 @@ def create_perftune_conf(nic='eth0'):
|
||||
|
||||
|
||||
def is_valid_nic(nic):
|
||||
if len(nic) == 0:
|
||||
return False
|
||||
return os.path.exists('/sys/class/net/{}'.format(nic))
|
||||
|
||||
# Remove this when we do not support SET_NIC configuration value anymore
|
||||
|
||||
2
dist/docker/redhat/Dockerfile
vendored
2
dist/docker/redhat/Dockerfile
vendored
@@ -33,7 +33,7 @@ RUN curl http://downloads.scylladb.com/rpm/centos/scylla-3.0.repo -o /etc/yum.re
|
||||
yum -y remove boost-thread boost-system && \
|
||||
yum -y install scylla hostname supervisor && \
|
||||
yum clean all && \
|
||||
yum -y install python34 python34-PyYAML && \
|
||||
yum -y install python36 python36-PyYAML && \
|
||||
cat /scylla_bashrc >> /etc/bashrc && \
|
||||
mkdir -p /etc/supervisor.conf.d && \
|
||||
mkdir -p /var/log/scylla && \
|
||||
|
||||
6
dist/redhat/scylla.spec.mustache
vendored
6
dist/redhat/scylla.spec.mustache
vendored
@@ -56,9 +56,9 @@ License: AGPLv3
|
||||
URL: http://www.scylladb.com/
|
||||
BuildRequires: libaio-devel libstdc++-devel cryptopp-devel hwloc-devel numactl-devel libpciaccess-devel libxml2-devel zlib-devel thrift-devel yaml-cpp-devel lz4-devel snappy-devel jsoncpp-devel systemd-devel xz-devel pcre-devel elfutils-libelf-devel bzip2-devel keyutils-libs-devel xfsprogs-devel make gnutls-devel systemd-devel lksctp-tools-devel protobuf-devel protobuf-compiler systemtap-sdt-devel ninja-build cmake python ragel grep kernel-headers
|
||||
%{?fedora:BuildRequires: boost-devel antlr3-tool antlr3-C++-devel python3 gcc-c++ libasan libubsan python3-pyparsing dnf-yum python2-pystache}
|
||||
%{?rhel:BuildRequires: scylla-libstdc++73-static scylla-libatomic73-static scylla-boost163-devel scylla-boost163-static scylla-antlr35-tool scylla-antlr35-C++-devel python34 scylla-gcc73-c++, scylla-python34-pyparsing20 yaml-cpp-static pystache python-setuptools}
|
||||
%{?rhel:BuildRequires: scylla-libstdc++73-static scylla-libatomic73-static scylla-boost163-devel scylla-boost163-static scylla-antlr35-tool scylla-antlr35-C++-devel python36 scylla-gcc73-c++, scylla-python36-pyparsing20 yaml-cpp-static pystache python-setuptools}
|
||||
Requires: {{product}}-conf systemd-libs hwloc PyYAML python-urwid pciutils pyparsing python-requests curl util-linux python-setuptools pciutils python3-pyudev mdadm xfsprogs
|
||||
%{?rhel:Requires: python34 python34-PyYAML kernel >= 3.10.0-514}
|
||||
%{?rhel:Requires: python36 python36-PyYAML kernel >= 3.10.0-514}
|
||||
%{?fedora:Requires: python3 python3-PyYAML}
|
||||
Conflicts: abrt
|
||||
%ifarch x86_64
|
||||
@@ -97,7 +97,7 @@ cflags="--cflags=${defines[*]}"
|
||||
%endif
|
||||
%if 0%{?rhel}
|
||||
. /etc/profile.d/scylla.sh
|
||||
python3.4 ./configure.py %{?configure_opt} --with=scylla --with=iotune --mode=release "$cflags" --static-boost --static-yaml-cpp --compiler=/opt/scylladb/bin/g++-7.3 --c-compiler=/opt/scylladb/bin/gcc-7.3 --python python3.4 --ldflag=-Wl,-rpath=/opt/scylladb/lib64
|
||||
python3.6 ./configure.py %{?configure_opt} --with=scylla --with=iotune --mode=release "$cflags" --static-boost --static-yaml-cpp --compiler=/opt/scylladb/bin/g++-7.3 --c-compiler=/opt/scylladb/bin/gcc-7.3 --python python3.6 --ldflag=-Wl,-rpath=/opt/scylladb/lib64
|
||||
%endif
|
||||
ninja-build %{?_smp_mflags} build/release/scylla build/release/iotune
|
||||
|
||||
|
||||
@@ -22,6 +22,8 @@
#include "flat_mutation_reader.hh"
#include "mutation_reader.hh"
#include "seastar/util/reference_wrapper.hh"
#include "clustering_ranges_walker.hh"
#include "schema_upgrader.hh"
#include <algorithm>

#include <boost/range/adaptor/transformed.hpp>
@@ -347,6 +349,7 @@ flat_mutation_reader make_empty_flat_reader(schema_ptr s) {

flat_mutation_reader
flat_mutation_reader_from_mutations(std::vector<mutation> ms,
const dht::partition_range& pr,
const query::partition_slice& slice,
streamed_mutation::forwarding fwd) {
std::vector<mutation> sliced_ms;
@@ -355,7 +358,12 @@ flat_mutation_reader_from_mutations(std::vector<mutation> ms,
auto mp = mutation_partition(std::move(m.partition()), *m.schema(), std::move(ck_ranges));
sliced_ms.emplace_back(m.schema(), m.decorated_key(), std::move(mp));
}
return flat_mutation_reader_from_mutations(sliced_ms, query::full_partition_range, fwd);
return flat_mutation_reader_from_mutations(sliced_ms, pr, fwd);
}

flat_mutation_reader
flat_mutation_reader_from_mutations(std::vector<mutation> ms, const query::partition_slice& slice, streamed_mutation::forwarding fwd) {
return flat_mutation_reader_from_mutations(std::move(ms), query::full_partition_range, slice, fwd);
}

flat_mutation_reader
@@ -487,11 +495,11 @@ flat_mutation_reader_from_mutations(std::vector<mutation> mutations, const dht::
}
public:
reader(schema_ptr s, std::vector<mutation>&& mutations, const dht::partition_range& pr)
: impl(std::move(s))
: impl(s)
, _mutations(std::move(mutations))
, _cur(find_first_partition(_mutations, pr))
, _end(find_last_partition(_mutations, pr))
, _cmp(*_cur->schema())
, _cmp(*s)
{
_end_of_stream = _cur == _end;
if (!_end_of_stream) {
@@ -509,6 +517,7 @@ flat_mutation_reader_from_mutations(std::vector<mutation> mutations, const dht::
// clear_and_dispose() used by mutation_partition destructor won't
// work properly.

_cur = _mutations.begin();
while (_cur != _end) {
destroy_current_mutation();
++_cur;
@@ -779,15 +788,32 @@ make_flat_multi_range_reader(

flat_mutation_reader
make_flat_mutation_reader_from_fragments(schema_ptr schema, std::deque<mutation_fragment> fragments) {
return make_flat_mutation_reader_from_fragments(std::move(schema), std::move(fragments), query::full_partition_range);
}

flat_mutation_reader
make_flat_mutation_reader_from_fragments(schema_ptr schema, std::deque<mutation_fragment> fragments, const dht::partition_range& pr) {
class reader : public flat_mutation_reader::impl {
std::deque<mutation_fragment> _fragments;
const dht::partition_range* _pr;
dht::ring_position_comparator _cmp;

private:
bool end_of_range() const {
return _fragments.empty() ||
(_fragments.front().is_partition_start() && _pr->after(_fragments.front().as_partition_start().key(), _cmp));
}

public:
reader(schema_ptr schema, std::deque<mutation_fragment> fragments)
reader(schema_ptr schema, std::deque<mutation_fragment> fragments, const dht::partition_range& pr)
: flat_mutation_reader::impl(std::move(schema))
, _fragments(std::move(fragments)) {
, _fragments(std::move(fragments))
, _pr(&pr)
, _cmp(*_schema) {
fast_forward_to(*_pr, db::no_timeout);
}
virtual future<> fill_buffer(db::timeout_clock::time_point) override {
while (!(_end_of_stream = _fragments.empty()) && !is_buffer_full()) {
while (!(_end_of_stream = end_of_range()) && !is_buffer_full()) {
push_mutation_fragment(std::move(_fragments.front()));
_fragments.pop_front();
}
@@ -796,7 +822,7 @@ make_flat_mutation_reader_from_fragments(schema_ptr schema, std::deque<mutation_
virtual void next_partition() override {
clear_buffer_to_next_partition();
if (is_buffer_empty()) {
while (!(_end_of_stream = _fragments.empty()) && !_fragments.front().is_partition_start()) {
while (!(_end_of_stream = end_of_range()) && !_fragments.front().is_partition_start()) {
_fragments.pop_front();
}
}
@@ -805,8 +831,48 @@ make_flat_mutation_reader_from_fragments(schema_ptr schema, std::deque<mutation_
throw std::runtime_error("This reader can't be fast forwarded to another range.");
}
virtual future<> fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) override {
throw std::runtime_error("This reader can't be fast forwarded to another position.");
clear_buffer();
_pr = &pr;
_fragments.erase(_fragments.begin(), std::find_if(_fragments.begin(), _fragments.end(), [this] (const mutation_fragment& mf) {
return mf.is_partition_start() && !_pr->before(mf.as_partition_start().key(), _cmp);
}));
_end_of_stream = end_of_range();
return make_ready_future<>();
}
};
return make_flat_mutation_reader<reader>(std::move(schema), std::move(fragments));
return make_flat_mutation_reader<reader>(std::move(schema), std::move(fragments), pr);
}

flat_mutation_reader
make_flat_mutation_reader_from_fragments(schema_ptr schema, std::deque<mutation_fragment> fragments, const dht::partition_range& pr, const query::partition_slice& slice) {
std::optional<clustering_ranges_walker> ranges_walker;
for (auto it = fragments.begin(); it != fragments.end();) {
switch (it->mutation_fragment_kind()) {
case mutation_fragment::kind::partition_start:
ranges_walker.emplace(*schema, slice.row_ranges(*schema, it->as_partition_start().key().key()), false);
case mutation_fragment::kind::static_row: // fall-through
case mutation_fragment::kind::partition_end: // fall-through
++it;
break;
case mutation_fragment::kind::clustering_row:
if (ranges_walker->advance_to(it->position())) {
++it;
} else {
it = fragments.erase(it);
}
break;
case mutation_fragment::kind::range_tombstone:
if (ranges_walker->advance_to(it->as_range_tombstone().position(), it->as_range_tombstone().end_position())) {
++it;
} else {
it = fragments.erase(it);
}
break;
}
}
return make_flat_mutation_reader_from_fragments(std::move(schema), std::move(fragments), pr);
}

void flat_mutation_reader::do_upgrade_schema(const schema_ptr& s) {
*this = transform(std::move(*this), schema_upgrader(s));
}
@@ -328,6 +328,7 @@ private:
flat_mutation_reader() = default;
explicit operator bool() const noexcept { return bool(_impl); }
friend class optimized_optional<flat_mutation_reader>;
void do_upgrade_schema(const schema_ptr&);
public:
// Documented in mutation_reader::forwarding in mutation_reader.hh.
class partition_range_forwarding_tag;
@@ -466,6 +467,14 @@ public:
void move_buffer_content_to(impl& other) {
_impl->move_buffer_content_to(other);
}

// Causes this reader to conform to s.
// Multiple calls of upgrade_schema() compose, effects of prior calls on the stream are preserved.
void upgrade_schema(const schema_ptr& s) {
if (__builtin_expect(s != schema(), false)) {
do_upgrade_schema(s);
}
}
};

using flat_mutation_reader_opt = optimized_optional<flat_mutation_reader>;
@@ -568,8 +577,12 @@ class delegating_reader : public flat_mutation_reader::impl {
public:
delegating_reader(Underlying&& r) : impl(to_reference(r).schema()), _underlying(std::forward<Underlying>(r)) { }
virtual future<> fill_buffer(db::timeout_clock::time_point timeout) override {
return fill_buffer_from(to_reference(_underlying), timeout).then([this] (bool underlying_finished) {
_end_of_stream = underlying_finished;
if (is_buffer_full()) {
return make_ready_future<>();
}
return to_reference(_underlying).fill_buffer(timeout).then([this] {
_end_of_stream = to_reference(_underlying).is_end_of_stream();
to_reference(_underlying).move_buffer_content_to(*this);
});
}
virtual future<> fast_forward_to(position_range pr, db::timeout_clock::time_point timeout) override {
@@ -609,6 +622,11 @@ flat_mutation_reader
flat_mutation_reader_from_mutations(std::vector<mutation> ms,
const query::partition_slice& slice,
streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no);
flat_mutation_reader
flat_mutation_reader_from_mutations(std::vector<mutation> ms,
const dht::partition_range& pr,
const query::partition_slice& slice,
streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no);

/// Make a reader that enables the wrapped reader to work with multiple ranges.
///
@@ -642,6 +660,12 @@ make_flat_multi_range_reader(
flat_mutation_reader
make_flat_mutation_reader_from_fragments(schema_ptr, std::deque<mutation_fragment>);

flat_mutation_reader
make_flat_mutation_reader_from_fragments(schema_ptr, std::deque<mutation_fragment>, const dht::partition_range& pr);

flat_mutation_reader
make_flat_mutation_reader_from_fragments(schema_ptr, std::deque<mutation_fragment>, const dht::partition_range& pr, const query::partition_slice& slice);

// Calls the consumer for each element of the reader's stream until end of stream
// is reached or the consumer requests iteration to stop by returning stop_iteration::yes.
// The consumer should accept mutation as the argument and return stop_iteration.
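The new partition-range overloads above work by erasing fragments whose partition start precedes the range and treating a partition start beyond the range as end-of-stream. A minimal standalone sketch of that skip-ahead step (not Scylla code; the `fragment` and `int_range` types below are hypothetical stand-ins for mutation_fragment and dht::partition_range):

#include <algorithm>
#include <deque>
#include <iostream>

struct fragment {
    bool partition_start;
    int key; // only meaningful when partition_start is true
};

struct int_range {
    int start, end; // inclusive bounds
    bool before(int k) const { return k < start; }
    bool after(int k) const { return k > end; }
};

// Drop everything up to the first partition start that is not before the
// range, mirroring the erase+find_if in the reader's fast_forward_to().
void fast_forward_to(std::deque<fragment>& fragments, const int_range& r) {
    auto it = std::find_if(fragments.begin(), fragments.end(), [&] (const fragment& f) {
        return f.partition_start && !r.before(f.key);
    });
    fragments.erase(fragments.begin(), it);
}

int main() {
    std::deque<fragment> fragments{
        {true, 1}, {false, 0}, {true, 5}, {false, 0}, {true, 9}};
    fast_forward_to(fragments, int_range{4, 7});
    // Prints the keys of the remaining partition starts: 5 9
    for (const auto& f : fragments) {
        if (f.partition_start) {
            std::cout << f.key << ' ';
        }
    }
    std::cout << '\n';
}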
@@ -25,6 +25,8 @@

namespace gms {

class feature_service;

/**
* A gossip feature tracks whether all the nodes the current one is
* aware of support the specified feature.
@@ -32,12 +34,13 @@ namespace gms {
* A feature should only be created once the gossiper is available.
*/
class feature final {
feature_service* _service = nullptr;
sstring _name;
bool _enabled = false;
mutable shared_promise<> _pr;
friend class gossiper;
public:
explicit feature(sstring name, bool enabled = false);
explicit feature(feature_service& service, sstring name, bool enabled = false);
feature() = default;
~feature();
feature(const feature& other) = delete;
50  gms/feature_service.hh  Normal file
@@ -0,0 +1,50 @@
/*
 * Copyright (C) 2018 ScyllaDB
 */

/*
 * This file is part of Scylla.
 *
 * Scylla is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Scylla is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Scylla. If not, see <http://www.gnu.org/licenses/>.
 */

#pragma once

#include <seastar/core/sstring.hh>
#include <seastar/core/future.hh>
#include <seastar/core/shared_future.hh>
#include <unordered_map>
#include <vector>
#include "seastarx.hh"

namespace gms {

class feature;

/**
 * A gossip feature tracks whether all the nodes the current one is
 * aware of support the specified feature.
 */
class feature_service final {
std::unordered_map<sstring, std::vector<feature*>> _registered_features;
public:
feature_service();
~feature_service();
future<> stop();
void register_feature(feature* f);
void unregister_feature(feature* f);
void enable(const sstring& name);
};

} // namespace gms
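The new feature_service is essentially a registry that maps feature names to the feature objects waiting on them and flips those objects when a name becomes cluster-wide. A heavily simplified standalone sketch of that shape (all names below are illustrative, not the Scylla classes):

#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

class toy_feature {
    std::string _name;
    bool _enabled = false;
public:
    explicit toy_feature(std::string name) : _name(std::move(name)) {}
    const std::string& name() const { return _name; }
    bool enabled() const { return _enabled; }
    void enable() {
        if (!_enabled) {
            _enabled = true;
            std::cout << "Feature " << _name << " is enabled\n";
        }
    }
};

class toy_feature_service {
    std::unordered_map<std::string, std::vector<toy_feature*>> _registered;
public:
    void register_feature(toy_feature* f) { _registered[f->name()].push_back(f); }
    void enable(const std::string& name) {
        if (auto it = _registered.find(name); it != _registered.end()) {
            for (auto* f : it->second) {
                f->enable();
            }
        }
    }
};

int main() {
    toy_feature_service service;
    toy_feature lwt("LWT"), cdc("CDC");
    service.register_feature(&lwt);
    service.register_feature(&cdc);
    service.enable("LWT"); // only the LWT listener flips
}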
195  gms/gossiper.cc
@@ -44,6 +44,7 @@
#include "gms/gossip_digest_ack2.hh"
#include "gms/versioned_value.hh"
#include "gms/gossiper.hh"
#include "gms/feature_service.hh"
#include "gms/application_state.hh"
#include "gms/failure_detector.hh"
#include "gms/i_failure_detection_event_listener.hh"
@@ -53,6 +54,7 @@
#include "message/messaging_service.hh"
#include "dht/i_partitioner.hh"
#include "log.hh"
#include "db/system_keyspace.hh"
#include <seastar/core/sleep.hh>
#include <seastar/core/thread.hh>
#include <seastar/core/metrics.hh>
@@ -126,7 +128,8 @@ public:
void on_restart(inet_address, endpoint_state) override {}
};

gossiper::gossiper() {
gossiper::gossiper(feature_service& features)
: _feature_service(features) {
// Gossiper's stuff below runs only on CPU0
if (engine().cpu_id() != 0) {
return;
@@ -480,8 +483,7 @@ future<> gossiper::apply_state_locally(std::map<inet_address, endpoint_state> ma
int local_generation = local_ep_state_ptr.get_heart_beat_state().get_generation();
int remote_generation = remote_state.get_heart_beat_state().get_generation();
logger.trace("{} local generation {}, remote generation {}", ep, local_generation, remote_generation);
// A node was removed with nodetool removenode can have a generation of 2
if (local_generation > 2 && remote_generation > local_generation + MAX_GENERATION_DIFFERENCE) {
if (remote_generation > service::get_generation_number() + MAX_GENERATION_DIFFERENCE) {
// assume some peer has corrupted memory and is broadcasting an unbelievable generation about another peer (or itself)
logger.warn("received an invalid gossip generation for peer {}; local generation = {}, received generation = {}",
ep, local_generation, remote_generation);
@@ -2031,14 +2033,21 @@ future<> gossiper::wait_for_gossip(std::chrono::milliseconds initial_delay, stdx

future<> gossiper::wait_for_gossip_to_settle() {
static constexpr std::chrono::milliseconds GOSSIP_SETTLE_MIN_WAIT_MS{5000};

auto& cfg = service::get_local_storage_service().db().local().get_config();
auto force_after = cfg.skip_wait_for_gossip_to_settle();
auto do_enable_features = [this] {
return async([this] {
if (!std::exchange(_gossip_settled, true)) {
maybe_enable_features();
}
});
};
if (force_after == 0) {
return make_ready_future<>();
return do_enable_features();
}
logger.info("Waiting for gossip to settle before accepting client requests...");
return wait_for_gossip(GOSSIP_SETTLE_MIN_WAIT_MS, force_after);
return wait_for_gossip(GOSSIP_SETTLE_MIN_WAIT_MS, force_after).then([this, do_enable_features] {
return do_enable_features();
});
}

future<> gossiper::wait_for_range_setup() {
@@ -2084,20 +2093,45 @@ std::set<sstring> gossiper::get_supported_features(inet_address endpoint) const
return to_feature_set(app_state->value);
}

std::set<sstring> gossiper::get_supported_features() const {
std::unordered_map<inet_address, std::set<sstring>> features_map;
std::set<sstring> gossiper::get_supported_features(const std::unordered_map<gms::inet_address, sstring>& loaded_peer_features, ignore_features_of_local_node ignore_local_node) const {
std::unordered_map<gms::inet_address, std::set<sstring>> features_map;
std::set<sstring> common_features;

for (auto& x : loaded_peer_features) {
auto features = to_feature_set(x.second);
if (features.empty()) {
logger.warn("Loaded empty features for peer node {}", x.first);
} else {
features_map.emplace(x.first, std::move(features));
}
}

for (auto& x : endpoint_state_map) {
auto endpoint = x.first;
auto features = get_supported_features(endpoint);
if (ignore_local_node && endpoint == get_broadcast_address()) {
logger.debug("Ignore SUPPORTED_FEATURES of local node: features={}", features);
continue;
}
if (features.empty()) {
return std::set<sstring>();
auto it = loaded_peer_features.find(endpoint);
if (it != loaded_peer_features.end()) {
logger.info("Node {} does not contain SUPPORTED_FEATURES in gossip, using features saved in system table, features={}", endpoint, to_feature_set(it->second));
} else {
logger.warn("Node {} does not contain SUPPORTED_FEATURES in gossip or system table", endpoint);
}
} else {
// Replace the features with live info
features_map[endpoint] = std::move(features);
}
if (common_features.empty()) {
common_features = features;
}
features_map.emplace(endpoint, std::move(features));
}

if (ignore_local_node) {
features_map.erase(get_broadcast_address());
}

if (!features_map.empty()) {
common_features = features_map.begin()->second;
}

for (auto& x : features_map) {
@@ -2112,37 +2146,10 @@ std::set<sstring> gossiper::get_supported_features() const {
return common_features;
}

std::set<sstring> gossiper::get_supported_features(std::unordered_map<gms::inet_address, sstring> peer_features_string) {
std::set<sstring> common_features;
// Convert feature string split by "," to std::set
std::unordered_map<gms::inet_address, std::set<sstring>> features_map;
for (auto& x : peer_features_string) {
std::set<sstring> features = to_feature_set(x.second);
if (features.empty()) {
return std::set<sstring>();
}
if (common_features.empty()) {
common_features = features;
}
features_map.emplace(x.first, features);
}

for (auto& x : features_map) {
auto& features = x.second;
std::set<sstring> result;
std::set_intersection(features.begin(), features.end(),
common_features.begin(), common_features.end(),
std::inserter(result, result.end()));
common_features = std::move(result);
}
common_features.erase("");
return common_features;
}

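The cluster-wide feature set above is simply the intersection of every node's advertised set. A standalone illustration of that computation, with plain std::string/std::set standing in for sstring and the gossip state:

#include <algorithm>
#include <iostream>
#include <iterator>
#include <map>
#include <set>
#include <string>

using feature_set = std::set<std::string>;

feature_set common_features(const std::map<std::string, feature_set>& per_node) {
    feature_set common;
    if (!per_node.empty()) {
        common = per_node.begin()->second; // seed with any node's set
    }
    for (const auto& [node, features] : per_node) {
        feature_set result;
        std::set_intersection(features.begin(), features.end(),
                              common.begin(), common.end(),
                              std::inserter(result, result.end()));
        common = std::move(result);
    }
    return common;
}

int main() {
    std::map<std::string, feature_set> nodes{
        {"10.0.0.1", {"MATERIALIZED_VIEWS", "COUNTERS", "UDF"}},
        {"10.0.0.2", {"MATERIALIZED_VIEWS", "COUNTERS"}},
        {"10.0.0.3", {"COUNTERS", "UDF"}},
    };
    for (const auto& f : common_features(nodes)) {
        std::cout << f << '\n'; // prints only COUNTERS
    }
}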
void gossiper::check_knows_remote_features(sstring local_features_string) const {
void gossiper::check_knows_remote_features(sstring local_features_string, const std::unordered_map<inet_address, sstring>& loaded_peer_features) const {
std::set<sstring> local_features = to_feature_set(local_features_string);
auto local_endpoint = get_broadcast_address();
auto common_features = get_supported_features();
auto common_features = get_supported_features(loaded_peer_features, ignore_features_of_local_node::yes);
if (boost::range::includes(local_features, common_features)) {
logger.info("Feature check passed. Local node {} features = {}, Remote common_features = {}",
local_endpoint, local_features, common_features);
@@ -2151,44 +2158,19 @@ void gossiper::check_knows_remote_features(sstring local_features_string) const
}
}

void gossiper::check_knows_remote_features(sstring local_features_string, std::unordered_map<inet_address, sstring> peer_features_string) const {
std::set<sstring> local_features = to_feature_set(local_features_string);
auto local_endpoint = get_broadcast_address();
auto common_features = get_supported_features(peer_features_string);
if (boost::range::includes(local_features, common_features)) {
logger.info("Feature check passed. Local node {} features = {}, Remote common_features = {}",
local_endpoint, local_features, common_features);
} else {
throw std::runtime_error(sprint("Feature check failed. This node can not join the cluster because it does not understand the feature. Local node %s features = %s, Remote common_features = %s", local_endpoint, local_features, common_features));
}
feature_service::feature_service() = default;

feature_service::~feature_service() = default;

future<> feature_service::stop() {
return make_ready_future<>();
}

static bool check_features(std::set<sstring> features, std::set<sstring> need_features) {
logger.debug("Checking if need_features {} in features {}", need_features, features);
return boost::range::includes(features, need_features);
void feature_service::register_feature(feature* f) {
_registered_features.emplace(f->name(), std::vector<feature*>()).first->second.emplace_back(f);
}

future<> gossiper::wait_for_feature_on_all_node(std::set<sstring> features) {
return _features_condvar.wait([this, features = std::move(features)] {
return check_features(get_supported_features(), features);
});
}

future<> gossiper::wait_for_feature_on_node(std::set<sstring> features, inet_address endpoint) {
return _features_condvar.wait([this, features = std::move(features), endpoint = std::move(endpoint)] {
return check_features(get_supported_features(endpoint), features);
});
}

void gossiper::register_feature(feature* f) {
if (check_features(get_local_gossiper().get_supported_features(), {f->name()})) {
f->enable();
} else {
_registered_features.emplace(f->name(), std::vector<feature*>()).first->second.emplace_back(f);
}
}

void gossiper::unregister_feature(feature* f) {
void feature_service::unregister_feature(feature* f) {
auto&& fsit = _registered_features.find(f->name());
if (fsit == _registered_features.end()) {
return;
@@ -2200,66 +2182,61 @@ void gossiper::unregister_feature(feature* f) {
}
}

void feature_service::enable(const sstring& name) {
if (auto it = _registered_features.find(name); it != _registered_features.end()) {
for (auto&& f : it->second) {
f->enable();
}
}
}

// Runs inside seastar::async context
void gossiper::maybe_enable_features() {
if (_registered_features.empty()) {
_features_condvar.broadcast();
if (!_gossip_settled) {
return;
}

auto&& features = get_supported_features();
auto loaded_peer_features = db::system_keyspace::load_peer_features().get0();
auto&& features = get_supported_features(loaded_peer_features, ignore_features_of_local_node::no);
container().invoke_on_all([&features] (gossiper& g) {
for (auto it = g._registered_features.begin(); it != g._registered_features.end();) {
if (features.find(it->first) != features.end()) {
for (auto&& f : it->second) {
f->enable();
}
it = g._registered_features.erase(it);
} else {
++it;
}
for (auto&& name : features) {
g._feature_service.enable(name);
}
g._features_condvar.broadcast();
}).get();
}

feature::feature(sstring name, bool enabled)
: _name(name)
feature::feature(feature_service& service, sstring name, bool enabled)
: _service(&service)
, _name(name)
, _enabled(enabled) {
if (!_enabled) {
get_local_gossiper().register_feature(this);
} else {
_service->register_feature(this);
if (_enabled) {
_pr.set_value();
}
}

feature::~feature() {
if (!_enabled) {
auto& gossiper = get_gossiper();
if (gossiper.local_is_initialized()) {
gossiper.local().unregister_feature(this);
}
if (_service) {
_service->unregister_feature(this);
}
}

feature& feature::operator=(feature&& other) {
if (!_enabled) {
get_local_gossiper().unregister_feature(this);
}
_service->unregister_feature(this);
_service = std::exchange(other._service, nullptr);
_name = other._name;
_enabled = other._enabled;
_pr = std::move(other._pr);
if (!_enabled) {
get_local_gossiper().register_feature(this);
}
_service->register_feature(this);
return *this;
}

void feature::enable() {
if (engine().cpu_id() == 0) {
logger.info("Feature {} is enabled", name());
}
if (!_enabled) {
if (engine().cpu_id() == 0) {
logger.info("Feature {} is enabled", name());
}
_enabled = true;
_pr.set_value();
}
@@ -70,6 +70,8 @@ class inet_address;
class i_endpoint_state_change_subscriber;
class i_failure_detector;

class feature_service;

struct bind_messaging_port_tag {};
using bind_messaging_port = bool_class<bind_messaging_port_tag>;

@@ -88,6 +90,7 @@ using bind_messaging_port = bool_class<bind_messaging_port_tag>;
class gossiper : public i_failure_detection_event_listener, public seastar::async_sharded_service<gossiper>, public seastar::peering_sharded_service<gossiper> {
public:
using clk = seastar::lowres_system_clock;
using ignore_features_of_local_node = bool_class<class ignore_features_of_local_node_tag>;
private:
using messaging_verb = netw::messaging_verb;
using messaging_service = netw::messaging_service;
@@ -153,7 +156,9 @@ public:
static constexpr std::chrono::milliseconds INTERVAL{1000};
static constexpr std::chrono::hours A_VERY_LONG_TIME{24 * 3};

/** Maximimum difference in generation and version values we are willing to accept about a peer */
// Maximimum difference between remote generation value and generation
// value this node would get if this node were restarted that we are
// willing to accept about a peer.
static constexpr int64_t MAX_GENERATION_DIFFERENCE = 86400 * 365;
std::chrono::milliseconds fat_client_timeout;

@@ -236,7 +241,7 @@ private:
// The value must be kept alive until completes and not change.
future<> replicate(inet_address, application_state key, const versioned_value& value);
public:
gossiper();
explicit gossiper(feature_service& features);

void set_last_processed_message_at();
void set_last_processed_message_at(clk::time_point tp);
@@ -565,29 +570,20 @@ private:
uint64_t _msg_processing = 0;
bool _ms_registered = false;
bool _gossiped_to_seed = false;
bool _gossip_settled = false;

class msg_proc_guard;
private:
condition_variable _features_condvar;
std::unordered_map<sstring, std::vector<feature*>> _registered_features;
feature_service& _feature_service;
friend class feature;
// Get features supported by a particular node
std::set<sstring> get_supported_features(inet_address endpoint) const;
// Get features supported by all the nodes this node knows about
std::set<sstring> get_supported_features() const;
// Get features supported by all the nodes listed in the address/feature map
static std::set<sstring> get_supported_features(std::unordered_map<gms::inet_address, sstring> peer_features_string);
// Wait for features are available on all nodes this node knows about
future<> wait_for_feature_on_all_node(std::set<sstring> features);
// Wait for features are available on a particular node
future<> wait_for_feature_on_node(std::set<sstring> features, inet_address endpoint);
std::set<sstring> get_supported_features(const std::unordered_map<gms::inet_address, sstring>& loaded_peer_features, ignore_features_of_local_node ignore_local_node) const;
public:
void check_knows_remote_features(sstring local_features_string) const;
void check_knows_remote_features(sstring local_features_string, std::unordered_map<inet_address, sstring> peer_features_string) const;
void check_knows_remote_features(sstring local_features_string, const std::unordered_map<inet_address, sstring>& loaded_peer_features) const;
void maybe_enable_features();
private:
void register_feature(feature* f);
void unregister_feature(feature* f);
private:
seastar::metrics::metric_groups _metrics;
};
@@ -26,6 +26,6 @@ class partition {

class reconcilable_result {
uint32_t row_count();
std::vector<partition> partitions();
utils::chunked_vector<partition> partitions();
query::short_read is_short_read() [[version 1.6]] = query::short_read::no;
};
@@ -51,4 +51,10 @@ enum class stream_reason : uint8_t {
repair,
};

enum class stream_mutation_fragments_cmd : uint8_t {
error,
mutation_fragment_data,
end_of_stream,
};

}
@@ -134,6 +134,11 @@ view_ptr secondary_index_manager::create_view_for_index(const index_metadata& im
}
builder.with_column(col.name(), col.type, column_kind::clustering_key);
}
if (index_target->is_primary_key()) {
for (auto& def : schema->regular_columns()) {
db::view::create_virtual_column(builder, def.name(), def.type);
}
}
const sstring where_clause = sprint("%s IS NOT NULL", cql3::util::maybe_quote(index_target_name));
builder.with_view_info(*schema, false, where_clause);
return view_ptr{builder.build()};
19  init.cc
@@ -26,6 +26,8 @@
#include "service/storage_service.hh"
#include "to_string.hh"
#include "gms/inet_address.hh"
#include "gms/feature_service.hh"
#include "seastarx.hh"

logging::logger startlog("init");

@@ -34,13 +36,16 @@ logging::logger startlog("init");
// duplicated in cql_test_env.cc
// until proper shutdown is done.

void init_storage_service(distributed<database>& db, sharded<auth::service>& auth_service, sharded<db::system_distributed_keyspace>& sys_dist_ks) {
service::init_storage_service(db, auth_service, sys_dist_ks).get();
void init_storage_service(distributed<database>& db, sharded<auth::service>& auth_service, sharded<db::system_distributed_keyspace>& sys_dist_ks,
sharded<gms::feature_service>& feature_service) {
service::init_storage_service(db, auth_service, sys_dist_ks, feature_service).get();
// #293 - do not stop anything
//engine().at_exit([] { return service::deinit_storage_service(); });
}

void init_ms_fd_gossiper(sstring listen_address_in
void init_ms_fd_gossiper(sharded<gms::feature_service>& features
, db::config& cfg
, sstring listen_address_in
, uint16_t storage_port
, uint16_t ssl_storage_port
, bool tcp_nodelay_inter_dc
@@ -100,6 +105,8 @@ void init_ms_fd_gossiper(sstring listen_address_in
creds->set_x509_trust_file(ms_trust_store, x509_crt_format::PEM).get();
}

creds->set_priority_string(db::config::default_tls_priority);

if (!ms_tls_prio.empty()) {
creds->set_priority_string(ms_tls_prio);
}
@@ -150,7 +157,11 @@ void init_ms_fd_gossiper(sstring listen_address_in
to_string(seeds), listen_address_in, broadcast_address);
throw std::runtime_error("Use broadcast_address for seeds list");
}
gms::get_gossiper().start().get();
if ((!cfg.replace_address_first_boot().empty() || !cfg.replace_address().empty()) && seeds.count(broadcast_address)) {
startlog.error("Bad configuration: replace-address and replace-address-first-boot are not allowed for seed nodes");
throw bad_configuration_error();
}
gms::get_gossiper().start(std::ref(features)).get();
auto& gossiper = gms::get_local_gossiper();
gossiper.set_seeds(seeds);
// #293 - do not stop anything
11  init.hh
@@ -28,16 +28,21 @@
#include "db/system_distributed_keyspace.hh"
#include "database.hh"
#include "log.hh"
#include "seastarx.hh"

namespace db {
class extensions;
}

namespace gms {
class feature_service;
}

extern logging::logger startlog;

class bad_configuration_error : public std::exception {};

void init_storage_service(distributed<database>& db, sharded<auth::service>&, sharded<db::system_distributed_keyspace>&);
void init_storage_service(distributed<database>& db, sharded<auth::service>&, sharded<db::system_distributed_keyspace>&, sharded<gms::feature_service>&);

struct init_scheduling_config {
scheduling_group streaming;
@@ -45,7 +50,9 @@ struct init_scheduling_config {
scheduling_group gossip;
};

void init_ms_fd_gossiper(sstring listen_address
void init_ms_fd_gossiper(sharded<gms::feature_service>& features
, db::config& config
, sstring listen_address
, uint16_t storage_port
, uint16_t ssl_storage_port
, bool tcp_nodelay_inter_dc
41  main.cc
@@ -64,6 +64,7 @@
#include "sstables/compaction_manager.hh"
#include "sstables/sstables.hh"
#include <db/view/view_update_from_staging_generator.hh>
#include "gms/feature_service.hh"

seastar::metrics::metric_groups app_metrics;

@@ -301,15 +302,7 @@ int main(int ac, char** av) {
auto cfg = make_lw_shared<db::config>(ext);
auto init = app.get_options_description().add_options();

// If --version is requested, print it out and exit immediately to avoid
// Seastar-specific warnings that may occur when running the app
init("version", bpo::bool_switch(), "print version number and exit");
bpo::variables_map vm;
bpo::store(bpo::command_line_parser(ac, av).options(app.get_options_description()).allow_unregistered().run(), vm);
if (vm["version"].as<bool>()) {
print("%s\n", scylla_version());
return 0;
}

bpo::options_description deprecated("Deprecated options - ignored");
deprecated.add_options()
@@ -323,6 +316,15 @@ int main(int ac, char** av) {
configurable::append_all(*cfg, init);
cfg->add_options(init);

// If --version is requested, print it out and exit immediately to avoid
// Seastar-specific warnings that may occur when running the app
bpo::variables_map vm;
bpo::store(bpo::command_line_parser(ac, av).options(app.get_options_description()).allow_unregistered().run(), vm);
if (vm["version"].as<bool>()) {
print("%s\n", scylla_version());
return 0;
}

distributed<database> db;
seastar::sharded<service::cache_hitrate_calculator> cf_cache_hitrate_calculator;
debug::db = &db;
@@ -333,6 +335,7 @@ int main(int ac, char** av) {
httpd::http_server_control prometheus_server;
prometheus::config pctx;
directories dirs;
sharded<gms::feature_service> feature_service;

return app.run_deprecated(ac, av, [&] {

@@ -360,7 +363,8 @@ int main(int ac, char** av) {

tcp_syncookies_sanity();

return seastar::async([cfg, ext, &db, &qp, &proxy, &mm, &ctx, &opts, &dirs, &pctx, &prometheus_server, &return_value, &cf_cache_hitrate_calculator] {
return seastar::async([cfg, ext, &db, &qp, &proxy, &mm, &ctx, &opts, &dirs, &pctx, &prometheus_server, &return_value, &cf_cache_hitrate_calculator,
&feature_service] {
read_config(opts, *cfg).get();
configurable::init_all(opts, *cfg, *ext).get();

@@ -380,6 +384,8 @@ int main(int ac, char** av) {
throw bad_configuration_error();
}
}
feature_service.start().get();
// FIXME: feature_service.stop(), when we fix up shutdown
dht::set_global_partitioner(cfg->partitioner(), cfg->murmur3_partitioner_ignore_msb_bits());
auto make_sched_group = [&] (sstring name, unsigned shares) {
if (cfg->cpu_scheduler()) {
@@ -478,6 +484,9 @@ int main(int ac, char** av) {
if (opts.count("developer-mode")) {
smp::invoke_on_all([] { engine().set_strict_dma(false); }).get();
}

set_abort_on_internal_error(cfg->abort_on_internal_error());

supervisor::notify("creating tracing");
tracing::tracing::create_tracing("trace_keyspace_helper").get();
supervisor::notify("creating snitch");
@@ -503,7 +512,7 @@ int main(int ac, char** av) {
static sharded<auth::service> auth_service;
static sharded<db::system_distributed_keyspace> sys_dist_ks;
supervisor::notify("initializing storage service");
init_storage_service(db, auth_service, sys_dist_ks);
init_storage_service(db, auth_service, sys_dist_ks, feature_service);
supervisor::notify("starting per-shard database core");

// Note: changed from using a move here, because we want the config object intact.
@@ -599,7 +608,9 @@ int main(int ac, char** av) {
scfg.statement = dbcfg.statement_scheduling_group;
scfg.streaming = dbcfg.streaming_scheduling_group;
scfg.gossip = scheduling_group();
init_ms_fd_gossiper(listen_address
init_ms_fd_gossiper(feature_service
, *cfg
, listen_address
, storage_port
, ssl_storage_port
, tcp_nodelay_inter_dc
@@ -780,6 +791,7 @@ int main(int ac, char** av) {
});

api::set_server_cache(ctx);
startlog.info("Waiting for gossip to settle before accepting client requests...");
gms::get_local_gossiper().wait_for_gossip_to_settle().get();
api::set_server_gossip_settle(ctx).get();

@@ -839,8 +851,11 @@ int main(int ac, char** av) {
return service::get_local_storage_service().drain_on_shutdown();
});

engine().at_exit([] {
return view_builder.stop();
engine().at_exit([cfg] {
if (cfg->view_building()) {
return view_builder.stop();
}
return make_ready_future<>();
});

engine().at_exit([&db] {
32  memtable.cc
@@ -24,7 +24,6 @@
#include "frozen_mutation.hh"
#include "stdx.hh"
#include "partition_snapshot_reader.hh"
#include "schema_upgrader.hh"
#include "partition_builder.hh"

memtable::memtable(schema_ptr schema, dirty_memory_manager& dmm, memtable_list* memtable_list,
@@ -343,11 +342,8 @@ public:
bool digest_requested = _slice.options.contains<query::partition_slice::option::with_digest>();
auto mpsr = make_partition_snapshot_flat_reader(snp_schema, std::move(key_and_snp->first), std::move(cr),
std::move(key_and_snp->second), digest_requested, region(), read_section(), mtbl(), streamed_mutation::forwarding::no);
if (snp_schema->version() != schema()->version()) {
_delegate = transform(std::move(mpsr), schema_upgrader(schema()));
} else {
_delegate = std::move(mpsr);
}
mpsr.upgrade_schema(schema());
_delegate = std::move(mpsr);
} else {
_end_of_stream = true;
}
@@ -502,11 +498,8 @@ private:
auto snp_schema = key_and_snp->second->schema();
auto mpsr = make_partition_snapshot_flat_reader<partition_snapshot_accounter>(snp_schema, std::move(key_and_snp->first), std::move(cr),
std::move(key_and_snp->second), false, region(), read_section(), mtbl(), streamed_mutation::forwarding::no, *snp_schema, _flushed_memory);
if (snp_schema->version() != schema()->version()) {
_partition_reader = transform(std::move(mpsr), schema_upgrader(schema()));
} else {
_partition_reader = std::move(mpsr);
}
mpsr.upgrade_schema(schema());
_partition_reader = std::move(mpsr);
}
}
public:
@@ -582,11 +575,8 @@ memtable::make_flat_reader(schema_ptr s,
bool digest_requested = slice.options.contains<query::partition_slice::option::with_digest>();
auto rd = make_partition_snapshot_flat_reader(snp_schema, std::move(dk), std::move(cr), std::move(snp), digest_requested,
*this, _read_section, shared_from_this(), fwd);
if (snp_schema->version() != s->version()) {
return transform(std::move(rd), schema_upgrader(s));
} else {
return rd;
}
rd.upgrade_schema(s);
return rd;
} else {
auto res = make_flat_mutation_reader<scanning_reader>(std::move(s), shared_from_this(), range, slice, pc, fwd_mr);
if (fwd == streamed_mutation::forwarding::yes) {
@@ -701,13 +691,19 @@ bool memtable::is_flushed() const {
return bool(_underlying);
}

void memtable_entry::upgrade_schema(const schema_ptr& s, mutation_cleaner& cleaner) {
if (_schema != s) {
partition().upgrade(_schema, s, cleaner, no_cache_tracker);
_schema = s;
}
}

void memtable::upgrade_entry(memtable_entry& e) {
if (e._schema != _schema) {
assert(!reclaiming_enabled());
with_allocator(allocator(), [this, &e] {
with_linearized_managed_bytes([&] {
e.partition().upgrade(e._schema, _schema, cleaner(), no_cache_tracker);
e._schema = _schema;
e.upgrade_schema(_schema, cleaner());
});
});
}

@@ -68,6 +68,10 @@ public:
schema_ptr& schema() { return _schema; }
partition_snapshot_ptr snapshot(memtable& mtbl);

// Makes the entry conform to given schema.
// Must be called under allocating section of the region which owns the entry.
void upgrade_schema(const schema_ptr&, mutation_cleaner&);

size_t external_memory_usage_without_rows() const {
return _key.key().external_memory_usage();
}
@@ -87,6 +87,7 @@
#include "frozen_mutation.hh"
#include "flat_mutation_reader.hh"
#include "streaming/stream_manager.hh"
#include "streaming/stream_mutation_fragments_cmd.hh"

namespace netw {

@@ -281,25 +282,26 @@ void messaging_service::start_listen() {
if (_compress_what != compress_what::none) {
so.compressor_factory = &compressor_factory;
}
so.streaming_domain = rpc::streaming_domain_type(0x55AA);
// FIXME: we don't set so.tcp_nodelay, because we can't tell at this point whether the connection will come from a
// local or remote datacenter, and whether or not the connection will be used for gossip. We can fix
// the first by wrapping its server_socket, but not the second.
auto limits = rpc_resource_limits(_mcfg.rpc_memory_limit);
if (!_server[0]) {
auto listen = [&] (const gms::inet_address& a) {
auto listen = [&] (const gms::inet_address& a, rpc::streaming_domain_type sdomain) {
so.streaming_domain = sdomain;
auto addr = ipv4_addr{a.raw_addr(), _port};
return std::unique_ptr<rpc_protocol_server_wrapper>(new rpc_protocol_server_wrapper(*_rpc,
so, addr, limits));
};
_server[0] = listen(_listen_address);
_server[0] = listen(_listen_address, rpc::streaming_domain_type(0x55AA));
if (listen_to_bc) {
_server[1] = listen(utils::fb_utilities::get_broadcast_address());
_server[1] = listen(utils::fb_utilities::get_broadcast_address(), rpc::streaming_domain_type(0x66BB));
}
}

if (!_server_tls[0]) {
auto listen = [&] (const gms::inet_address& a) {
auto listen = [&] (const gms::inet_address& a, rpc::streaming_domain_type sdomain) {
so.streaming_domain = sdomain;
return std::unique_ptr<rpc_protocol_server_wrapper>(
[this, &so, &a, limits] () -> std::unique_ptr<rpc_protocol_server_wrapper>{
if (_encrypt_what == encrypt_what::none) {
@@ -312,9 +314,9 @@ void messaging_service::start_listen() {
so, seastar::tls::listen(_credentials, addr, lo), limits);
}());
};
_server_tls[0] = listen(_listen_address);
_server_tls[0] = listen(_listen_address, rpc::streaming_domain_type(0x77CC));
if (listen_to_bc) {
_server_tls[1] = listen(utils::fb_utilities::get_broadcast_address());
_server_tls[1] = listen(utils::fb_utilities::get_broadcast_address(), rpc::streaming_domain_type(0x88DD));
}
}
// Do this on just cpu 0, to avoid duplicate logs.
@@ -592,6 +594,7 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
opts.compressor_factory = &compressor_factory;
}
opts.tcp_nodelay = must_tcp_nodelay;
opts.reuseaddr = true;

auto client = must_encrypt ?
::make_shared<rpc_protocol_client_wrapper>(*_rpc, std::move(opts),
@@ -651,23 +654,27 @@ std::unique_ptr<messaging_service::rpc_protocol_wrapper>& messaging_service::rpc
return _rpc;
}

rpc::sink<int32_t> messaging_service::make_sink_for_stream_mutation_fragments(rpc::source<frozen_mutation_fragment>& source) {
rpc::sink<int32_t> messaging_service::make_sink_for_stream_mutation_fragments(rpc::source<frozen_mutation_fragment, rpc::optional<streaming::stream_mutation_fragments_cmd>>& source) {
return source.make_sink<netw::serializer, int32_t>();
}

future<rpc::sink<frozen_mutation_fragment>, rpc::source<int32_t>>
future<rpc::sink<frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd>, rpc::source<int32_t>>
messaging_service::make_sink_and_source_for_stream_mutation_fragments(utils::UUID schema_id, utils::UUID plan_id, utils::UUID cf_id, uint64_t estimated_partitions, streaming::stream_reason reason, msg_addr id) {
auto wrapper = get_rpc_client(messaging_verb::STREAM_MUTATION_FRAGMENTS, id);
rpc_protocol::client& rpc_client = *wrapper;
return wrapper->make_stream_sink<netw::serializer, frozen_mutation_fragment>().then([this, plan_id, schema_id, cf_id, estimated_partitions, reason, &rpc_client] (rpc::sink<frozen_mutation_fragment> sink) mutable {
auto rpc_handler = rpc()->make_client<rpc::source<int32_t> (utils::UUID, utils::UUID, utils::UUID, uint64_t, streaming::stream_reason, rpc::sink<frozen_mutation_fragment>)>(messaging_verb::STREAM_MUTATION_FRAGMENTS);
return rpc_handler(rpc_client , plan_id, schema_id, cf_id, estimated_partitions, reason, sink).then([sink] (rpc::source<int32_t> source) mutable {
return make_ready_future<rpc::sink<frozen_mutation_fragment>, rpc::source<int32_t>>(std::move(sink), std::move(source));
if (is_stopping()) {
return make_exception_future<rpc::sink<frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd>, rpc::source<int32_t>>(rpc::closed_error());
}
auto rpc_client = get_rpc_client(messaging_verb::STREAM_MUTATION_FRAGMENTS, id);
return rpc_client->make_stream_sink<netw::serializer, frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd>().then([this, plan_id, schema_id, cf_id, estimated_partitions, reason, rpc_client] (rpc::sink<frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd> sink) mutable {
auto rpc_handler = rpc()->make_client<rpc::source<int32_t> (utils::UUID, utils::UUID, utils::UUID, uint64_t, streaming::stream_reason, rpc::sink<frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd>)>(messaging_verb::STREAM_MUTATION_FRAGMENTS);
return rpc_handler(*rpc_client , plan_id, schema_id, cf_id, estimated_partitions, reason, sink).then_wrapped([sink, rpc_client] (future<rpc::source<int32_t>> source) mutable {
return (source.failed() ? sink.close() : make_ready_future<>()).then([sink = std::move(sink), source = std::move(source)] () mutable {
return make_ready_future<rpc::sink<frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd>, rpc::source<int32_t>>(std::move(sink), std::move(source.get0()));
});
});
});
}

void messaging_service::register_stream_mutation_fragments(std::function<future<rpc::sink<int32_t>> (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::optional<streaming::stream_reason>, rpc::source<frozen_mutation_fragment> source)>&& func) {
void messaging_service::register_stream_mutation_fragments(std::function<future<rpc::sink<int32_t>> (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::optional<streaming::stream_reason>, rpc::source<frozen_mutation_fragment, rpc::optional<streaming::stream_mutation_fragments_cmd>> source)>&& func) {
register_handler(this, messaging_verb::STREAM_MUTATION_FRAGMENTS, std::move(func));
}
@@ -36,6 +36,7 @@
#include "tracing/tracing.hh"
#include "digest_algorithm.hh"
#include "streaming/stream_reason.hh"
#include "streaming/stream_mutation_fragments_cmd.hh"

#include <seastar/net/tls.hh>

@@ -256,9 +257,9 @@ public:

// Wrapper for STREAM_MUTATION_FRAGMENTS
// The receiver of STREAM_MUTATION_FRAGMENTS sends status code to the sender to notify any error on the receiver side. The status code is of type int32_t. 0 means successful, -1 means error, other status code value are reserved for future use.
void register_stream_mutation_fragments(std::function<future<rpc::sink<int32_t>> (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::optional<streaming::stream_reason> reason_opt, rpc::source<frozen_mutation_fragment> source)>&& func);
rpc::sink<int32_t> make_sink_for_stream_mutation_fragments(rpc::source<frozen_mutation_fragment>& source);
future<rpc::sink<frozen_mutation_fragment>, rpc::source<int32_t>> make_sink_and_source_for_stream_mutation_fragments(utils::UUID schema_id, utils::UUID plan_id, utils::UUID cf_id, uint64_t estimated_partitions, streaming::stream_reason reason, msg_addr id);
void register_stream_mutation_fragments(std::function<future<rpc::sink<int32_t>> (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::optional<streaming::stream_reason> reason_opt, rpc::source<frozen_mutation_fragment, rpc::optional<streaming::stream_mutation_fragments_cmd>> source)>&& func);
rpc::sink<int32_t> make_sink_for_stream_mutation_fragments(rpc::source<frozen_mutation_fragment, rpc::optional<streaming::stream_mutation_fragments_cmd>>& source);
future<rpc::sink<frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd>, rpc::source<int32_t>> make_sink_and_source_for_stream_mutation_fragments(utils::UUID schema_id, utils::UUID plan_id, utils::UUID cf_id, uint64_t estimated_partitions, streaming::stream_reason reason, msg_addr id);

void register_stream_mutation_done(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id)>&& func);
future<> send_stream_mutation_done(msg_addr id, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id);
@@ -1162,6 +1162,7 @@ row::apply_monotonically(const column_definition& column, atomic_cell_or_collect
void
row::append_cell(column_id id, atomic_cell_or_collection value) {
if (_type == storage_type::vector && id < max_vector_size) {
assert(_storage.vector.v.size() <= id);
_storage.vector.v.resize(id);
_storage.vector.v.emplace_back(cell_and_hash{std::move(value), cell_hash_opt()});
_storage.vector.present.set(id);
@@ -387,7 +387,7 @@ public:
if (is_missing() || _ttl == dead) {
return false;
}
if (_ttl != no_ttl && _expiry < now) {
if (_ttl != no_ttl && _expiry <= now) {
return false;
}
return _timestamp > t.timestamp;
@@ -397,7 +397,7 @@ public:
if (_ttl == dead) {
return true;
}
return _ttl != no_ttl && _expiry < now;
return _ttl != no_ttl && _expiry <= now;
}
// Can be called only when is_live().
bool is_expiring() const {
@@ -435,7 +435,7 @@ public:
_timestamp = api::missing_timestamp;
return false;
}
if (_ttl > no_ttl && _expiry < now) {
if (_ttl > no_ttl && _expiry <= now) {
_expiry -= _ttl;
_ttl = dead;
}
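The hunks above only move the expiry boundary from `<` to `<=`: a cell whose TTL runs out exactly at the query time is now already dead rather than live for one more instant. A standalone check of that boundary, with plain std::chrono types standing in for gc_clock:

#include <chrono>
#include <iostream>

using clock_type = std::chrono::system_clock;

bool is_expired_old(clock_type::time_point expiry, clock_type::time_point now) {
    return expiry < now;   // pre-change behaviour
}

bool is_expired_new(clock_type::time_point expiry, clock_type::time_point now) {
    return expiry <= now;  // post-change behaviour
}

int main() {
    const auto now = clock_type::now();
    const auto expiry = now; // TTL runs out exactly at the query time
    std::cout << std::boolalpha
              << "old: " << is_expired_old(expiry, now) << '\n'   // false
              << "new: " << is_expired_new(expiry, now) << '\n';  // true
}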
@@ -31,7 +31,7 @@ reconcilable_result::reconcilable_result()
: _row_count(0)
{ }

reconcilable_result::reconcilable_result(uint32_t row_count, std::vector<partition> p, query::short_read short_read,
reconcilable_result::reconcilable_result(uint32_t row_count, utils::chunked_vector<partition> p, query::short_read short_read,
query::result_memory_tracker memory_tracker)
: _row_count(row_count)
, _short_read(short_read)
@@ -39,11 +39,11 @@ reconcilable_result::reconcilable_result(uint32_t row_count, std::vector<partiti
, _partitions(std::move(p))
{ }

const std::vector<partition>& reconcilable_result::partitions() const {
const utils::chunked_vector<partition>& reconcilable_result::partitions() const {
return _partitions;
}

std::vector<partition>& reconcilable_result::partitions() {
utils::chunked_vector<partition>& reconcilable_result::partitions() {
return _partitions;
}
@@ -27,6 +27,7 @@
#include "frozen_mutation.hh"
#include "db/timeout_clock.hh"
#include "querier.hh"
#include "utils/chunked_vector.hh"
#include <seastar/core/execution_stage.hh>

class reconcilable_result;
@@ -72,17 +73,17 @@ class reconcilable_result {
uint32_t _row_count;
query::short_read _short_read;
query::result_memory_tracker _memory_tracker;
std::vector<partition> _partitions;
utils::chunked_vector<partition> _partitions;
public:
~reconcilable_result();
reconcilable_result();
reconcilable_result(reconcilable_result&&) = default;
reconcilable_result& operator=(reconcilable_result&&) = default;
reconcilable_result(uint32_t row_count, std::vector<partition> partitions, query::short_read short_read,
reconcilable_result(uint32_t row_count, utils::chunked_vector<partition> partitions, query::short_read short_read,
query::result_memory_tracker memory_tracker = { });

const std::vector<partition>& partitions() const;
std::vector<partition>& partitions();
const utils::chunked_vector<partition>& partitions() const;
utils::chunked_vector<partition>& partitions();

uint32_t row_count() const {
return _row_count;
@@ -112,7 +113,7 @@ class reconcilable_result_builder {
const schema& _schema;
const query::partition_slice& _slice;

std::vector<partition> _result;
utils::chunked_vector<partition> _result;
uint32_t _live_rows{};

bool _has_ck_selector{};
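A plausible motivation for swapping std::vector for utils::chunked_vector in these result types is allocation behaviour: a chunked container grows in fixed-size pieces, so a large result never needs one huge contiguous allocation or a reallocate-and-copy. A heavily simplified standalone sketch of that idea (the real utils::chunked_vector has a much richer interface):

#include <cstddef>
#include <iostream>
#include <memory>
#include <vector>

template <typename T, std::size_t ChunkSize = 128>
class toy_chunked_vector {
    // Each chunk is a separately allocated, bounded-size vector.
    std::vector<std::unique_ptr<std::vector<T>>> _chunks;
    std::size_t _size = 0;
public:
    void push_back(T value) {
        if (_size % ChunkSize == 0) {
            auto chunk = std::make_unique<std::vector<T>>();
            chunk->reserve(ChunkSize);
            _chunks.push_back(std::move(chunk));
        }
        _chunks.back()->push_back(std::move(value));
        ++_size;
    }
    T& operator[](std::size_t i) { return (*_chunks[i / ChunkSize])[i % ChunkSize]; }
    std::size_t size() const { return _size; }
};

int main() {
    toy_chunked_vector<int> v;
    for (int i = 0; i < 1000; ++i) {
        v.push_back(i);
    }
    std::cout << v.size() << ' ' << v[999] << '\n'; // 1000 999
}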
@@ -764,6 +764,8 @@ class foreign_reader : public flat_mutation_reader::impl {
}

void update_buffer_with(foreign_unique_ptr<fragment_buffer> buffer, bool end_of_steam);

static future<> ensure_buffer_contains_all_fragments_for_last_pos(flat_mutation_reader& reader, fragment_buffer& buffer);
public:
foreign_reader(schema_ptr schema,
foreign_unique_ptr<flat_mutation_reader> reader,
@@ -799,6 +801,39 @@ void foreign_reader::update_buffer_with(foreign_unique_ptr<fragment_buffer> buff
}
}

future<> foreign_reader::ensure_buffer_contains_all_fragments_for_last_pos(flat_mutation_reader& reader, fragment_buffer& buffer) {
if (buffer.empty() || !buffer.back().is_range_tombstone()) {
return make_ready_future<>();
}

auto stop = [&reader, &buffer] {
if (reader.is_buffer_empty()) {
return reader.is_end_of_stream();
}
if (!buffer.back().is_range_tombstone()) {
return true;
}
const auto next_pos = reader.peek_buffer().position();
const auto& last_key = buffer.back().key();

// Ending the buffer on a non-full prefix key position is
// problematic because when recreating the reader we continue
// from *after* the last key we saw. If this is a prefix this
// would exclude all clustering positions that fall into the
// prefix. Fixing this is non-trivial and has little gain over
// just making sure we don't end the buffer on a prefix.
return last_key.is_full(*reader.schema()) && !next_pos.key().equal(*reader.schema(), last_key);
};

return do_until(stop, [&reader, &buffer] {
if (reader.is_buffer_empty()) {
return reader.fill_buffer(db::no_timeout);
}
buffer.emplace_back(reader.pop_mutation_fragment());
return make_ready_future<>();
});
}

foreign_reader::foreign_reader(schema_ptr schema,
foreign_unique_ptr<flat_mutation_reader> reader,
streamed_mutation::forwarding fwd_sm)
@@ -896,9 +931,29 @@ future<foreign_ptr<std::unique_ptr<flat_mutation_reader>>> foreign_reader::pause
if (pending_next_partition) {
reader->next_partition();
}
return make_ready_future<foreign_unique_ptr<fragment_buffer>, bool>(
std::make_unique<fragment_buffer>(reader->detach_buffer()),
reader->is_end_of_stream());
auto buffer = reader->detach_buffer();
if (buffer.empty() || !buffer.back().is_range_tombstone()) {
return make_ready_future<foreign_unique_ptr<fragment_buffer>, bool>(
std::make_unique<fragment_buffer>(std::move(buffer)),
reader->is_end_of_stream());
}
// When the reader is recreated (after having been evicted) we
// recreate it such that it starts reading from *after* the last
// seen fragment's position. If the last seen fragment is a range
// tombstone it is *not* guaranteed that the next fragments in the
// data stream have positions strictly greater than the range
// tombstone's. If the reader is evicted and has to be recreated,
// these fragments would be then skipped as the read would continue
// after their position.
// To avoid this ensure that the buffer contains *all* fragments for
// the last seen position.
return do_with(std::move(buffer), [reader] (fragment_buffer& buffer) mutable {
return ensure_buffer_contains_all_fragments_for_last_pos(*reader, buffer).then([reader, &buffer] () mutable {
return make_ready_future<foreign_unique_ptr<fragment_buffer>, bool>(
std::make_unique<fragment_buffer>(std::move(buffer)),
reader->is_end_of_stream() && reader->is_buffer_empty());
});
});
});
}).then([this] (foreign_unique_ptr<fragment_buffer>&& buffer, bool end_of_stream) mutable {
update_buffer_with(std::move(buffer), end_of_stream);
|
||||
|
||||
@@ -172,6 +172,9 @@ tombstone partition_entry::partition_tombstone() const {
|
||||
|
||||
partition_snapshot::~partition_snapshot() {
|
||||
with_allocator(region().allocator(), [this] {
|
||||
if (_locked) {
|
||||
touch();
|
||||
}
|
||||
if (_version && _version.is_unique_owner()) {
|
||||
auto v = &*_version;
|
||||
_version = {};
|
||||
@@ -268,6 +271,7 @@ partition_entry::~partition_entry() {
|
||||
return;
|
||||
}
|
||||
if (_snapshot) {
|
||||
assert(!_snapshot->is_locked());
|
||||
_snapshot->_version = std::move(_version);
|
||||
_snapshot->_version.mark_as_unique_owner();
|
||||
_snapshot->_entry = nullptr;
|
||||
@@ -284,6 +288,7 @@ stop_iteration partition_entry::clear_gently(cache_tracker* tracker) noexcept {
|
||||
}
|
||||
|
||||
if (_snapshot) {
|
||||
assert(!_snapshot->is_locked());
|
||||
_snapshot->_version = std::move(_version);
|
||||
_snapshot->_version.mark_as_unique_owner();
|
||||
_snapshot->_entry = nullptr;
|
||||
@@ -311,6 +316,7 @@ stop_iteration partition_entry::clear_gently(cache_tracker* tracker) noexcept {
|
||||
void partition_entry::set_version(partition_version* new_version)
|
||||
{
|
||||
if (_snapshot) {
|
||||
assert(!_snapshot->is_locked());
|
||||
_snapshot->_version = std::move(_version);
|
||||
_snapshot->_entry = nullptr;
|
||||
}
|
||||
@@ -459,7 +465,6 @@ public:
|
||||
|
||||
coroutine partition_entry::apply_to_incomplete(const schema& s,
|
||||
partition_entry&& pe,
|
||||
const schema& pe_schema,
|
||||
mutation_cleaner& pe_cleaner,
|
||||
logalloc::allocating_section& alloc,
|
||||
logalloc::region& reg,
|
||||
@@ -479,10 +484,6 @@ coroutine partition_entry::apply_to_incomplete(const schema& s,
|
||||
// partitions where I saw 40% slow down.
|
||||
const bool preemptible = s.clustering_key_size() > 0;
|
||||
|
||||
if (s.version() != pe_schema.version()) {
|
||||
pe.upgrade(pe_schema.shared_from_this(), s.shared_from_this(), pe_cleaner, no_cache_tracker);
|
||||
}
|
||||
|
||||
// When preemptible, later memtable reads could start using the snapshot before
|
||||
// snapshot's writes are made visible in cache, which would cause them to miss those writes.
|
||||
// So we cannot allow erasing when preemptible.
|
||||
@@ -496,6 +497,7 @@ coroutine partition_entry::apply_to_incomplete(const schema& s,
|
||||
prev_snp = read(reg, tracker.cleaner(), s.shared_from_this(), &tracker, phase - 1);
|
||||
}
|
||||
auto dst_snp = read(reg, tracker.cleaner(), s.shared_from_this(), &tracker, phase);
|
||||
dst_snp->lock();
|
||||
|
||||
// Once we start updating the partition, we must keep all snapshots until the update completes,
|
||||
// otherwise partial writes would be published. So the scope of snapshots must enclose the scope
|
||||
@@ -570,6 +572,7 @@ coroutine partition_entry::apply_to_incomplete(const schema& s,
|
||||
auto has_next = src_cur.erase_and_advance();
|
||||
acc.unpin_memory(size);
|
||||
if (!has_next) {
|
||||
dst_snp->unlock();
|
||||
return stop_iteration::yes;
|
||||
}
|
||||
} while (!preemptible || !need_preempt());
|
||||
@@ -661,6 +664,18 @@ partition_snapshot::range_tombstones()
|
||||
position_in_partition_view::after_all_clustered_rows());
|
||||
}
|
||||
|
||||
void partition_snapshot::touch() noexcept {
|
||||
// Eviction assumes that older versions are evicted before newer so only the latest snapshot
|
||||
// can be touched.
|
||||
if (_tracker && at_latest_version()) {
|
||||
auto&& rows = version()->partition().clustered_rows();
|
||||
assert(!rows.empty());
|
||||
rows_entry& last_dummy = *rows.rbegin();
|
||||
assert(last_dummy.is_last_dummy());
|
||||
_tracker->touch(last_dummy);
|
||||
}
|
||||
}
|
||||
|
||||
std::ostream& operator<<(std::ostream& out, const partition_entry& e) {
|
||||
out << "{";
|
||||
bool first = true;
|
||||
@@ -687,6 +702,7 @@ void partition_entry::evict(mutation_cleaner& cleaner) noexcept {
|
||||
return;
|
||||
}
|
||||
if (_snapshot) {
|
||||
assert(!_snapshot->is_locked());
|
||||
_snapshot->_version = std::move(_version);
|
||||
_snapshot->_version.mark_as_unique_owner();
|
||||
_snapshot->_entry = nullptr;
|
||||
@@ -706,3 +722,18 @@ partition_snapshot_ptr::~partition_snapshot_ptr() {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void partition_snapshot::lock() noexcept {
|
||||
// partition_entry::is_locked() assumes that if there is a locked snapshot,
|
||||
// it can be found attached directly to it.
|
||||
assert(at_latest_version());
|
||||
_locked = true;
|
||||
}
|
||||
|
||||
void partition_snapshot::unlock() noexcept {
|
||||
// Locked snapshots must always be latest, is_locked() assumes that.
|
||||
// Also, touch() is only effective when this snapshot is latest.
|
||||
assert(at_latest_version());
|
||||
_locked = false;
|
||||
touch(); // Make the entry evictable again in case it was fully unlinked by eviction attempt.
|
||||
}
|
||||
|
||||
@@ -303,6 +303,7 @@ private:
|
||||
mutation_cleaner* _cleaner;
|
||||
cache_tracker* _tracker;
|
||||
boost::intrusive::slist_member_hook<> _cleaner_hook;
|
||||
bool _locked = false;
|
||||
friend class partition_entry;
|
||||
friend class mutation_cleaner_impl;
|
||||
public:
|
||||
@@ -318,6 +319,22 @@ public:
|
||||
partition_snapshot& operator=(const partition_snapshot&) = delete;
|
||||
partition_snapshot& operator=(partition_snapshot&&) = delete;
|
||||
|
||||
// Makes the snapshot locked.
|
||||
// See is_locked() for meaning.
|
||||
// Can be called only when at_lastest_version(). The snapshot must remain latest as long as it's locked.
|
||||
void lock() noexcept;
|
||||
|
||||
// Makes the snapshot no longer locked.
|
||||
// See is_locked() for meaning.
|
||||
void unlock() noexcept;
|
||||
|
||||
// Tells whether the snapshot is locked.
|
||||
// Locking the snapshot prevents it from getting detached from the partition entry.
|
||||
// It also prevents the partition entry from being evicted.
|
||||
bool is_locked() const {
|
||||
return _locked;
|
||||
}
|
||||
|
||||
static partition_snapshot& container_of(partition_version_ref* ref) {
|
||||
return *boost::intrusive::get_parent_from_member(ref, &partition_snapshot::_version);
|
||||
}
|
||||
@@ -344,6 +361,9 @@ public:
|
||||
// to the latest version.
|
||||
stop_iteration slide_to_oldest() noexcept;
|
||||
|
||||
// Brings the snapshot to the front of the LRU.
|
||||
void touch() noexcept;
|
||||
|
||||
// Must be called after snapshot's original region is merged into a different region
|
||||
// before the original region is destroyed, unless the snapshot is destroyed earlier.
|
||||
void migrate(logalloc::region* region, mutation_cleaner* cleaner) noexcept {
|
||||
@@ -503,9 +523,18 @@ public:
|
||||
return _version->all_elements_reversed();
|
||||
}
|
||||
|
||||
// Tells whether this entry is locked.
|
||||
// Locked entries are undergoing an update and should not have their snapshots
|
||||
// detached from the entry.
|
||||
// Certain methods can only be called when !is_locked().
|
||||
bool is_locked() const {
|
||||
return _snapshot && _snapshot->is_locked();
|
||||
}
|
||||
|
||||
// Strong exception guarantees.
|
||||
// Assumes this instance and mp are fully continuous.
|
||||
// Use only on non-evictable entries.
|
||||
// Must not be called when is_locked().
|
||||
void apply(const schema& s, const mutation_partition& mp, const schema& mp_schema);
|
||||
void apply(const schema& s, mutation_partition&& mp, const schema& mp_schema);
|
||||
|
||||
@@ -526,11 +555,14 @@ public:
|
||||
// such that if the operation is retried (possibly many times) and eventually
|
||||
// succeeds the result will be as if the first attempt didn't fail.
|
||||
//
|
||||
// The schema of pe must conform to s.
|
||||
//
|
||||
// Returns a coroutine object representing the operation.
|
||||
// The coroutine must be resumed with the region being unlocked.
|
||||
//
|
||||
// The coroutine cannot run concurrently with other apply() calls.
|
||||
coroutine apply_to_incomplete(const schema& s,
|
||||
partition_entry&& pe,
|
||||
const schema& pe_schema,
|
||||
mutation_cleaner& pe_cleaner,
|
||||
logalloc::allocating_section&,
|
||||
logalloc::region&,
|
||||
@@ -539,6 +571,7 @@ public:
|
||||
real_dirty_memory_accounter&);
|
||||
|
||||
// If this entry is evictable, cache_tracker must be provided.
|
||||
// Must not be called when is_locked().
|
||||
partition_version& add_version(const schema& s, cache_tracker*);
|
||||
|
||||
// Returns a reference to existing version with an active snapshot of given phase
|
||||
@@ -568,9 +601,11 @@ public:
|
||||
tombstone partition_tombstone() const;
|
||||
|
||||
// needs to be called with reclaiming disabled
|
||||
// Must not be called when is_locked().
|
||||
void upgrade(schema_ptr from, schema_ptr to, mutation_cleaner&, cache_tracker*);
|
||||
|
||||
// Snapshots with different values of phase will point to different partition_version objects.
|
||||
// When is_locked(), read() can only be called with a phase which is <= the phase of the current snapshot.
|
||||
partition_snapshot_ptr read(logalloc::region& region,
|
||||
mutation_cleaner&,
|
||||
schema_ptr entry_schema,
|
||||
|
||||
@@ -151,6 +151,7 @@ public:
|
||||
return {partition_region::clustered, 1, &ck};
|
||||
}
|
||||
|
||||
partition_region region() const { return _type; }
|
||||
bool is_partition_start() const { return _type == partition_region::partition_start; }
|
||||
bool is_partition_end() const { return _type == partition_region::partition_end; }
|
||||
bool is_static_row() const { return _type == partition_region::static_row; }
|
||||
|
||||
@@ -288,11 +288,11 @@ static void insert_querier(

auto& e = entries.emplace_back(key, std::move(q), expires);
e.set_pos(--entries.end());
++stats.population;

if (auto irh = sem.register_inactive_read(std::make_unique<querier_inactive_read>(entries, e.pos(), stats))) {
e.set_inactive_handle(irh);
index.insert(e);
++stats.population;
}
}

@@ -38,7 +38,7 @@ class autoupdating_underlying_reader final {
row_cache& _cache;
read_context& _read_context;
stdx::optional<flat_mutation_reader> _reader;
utils::phased_barrier::phase_type _reader_creation_phase;
utils::phased_barrier::phase_type _reader_creation_phase = 0;
dht::partition_range _range = { };
stdx::optional<dht::decorated_key> _last_key;
stdx::optional<dht::decorated_key> _new_last_key;
@@ -105,7 +105,6 @@ public:
|
||||
return make_ready_future<>();
|
||||
}
|
||||
utils::phased_barrier::phase_type creation_phase() const {
|
||||
assert(_reader);
|
||||
return _reader_creation_phase;
|
||||
}
|
||||
const dht::partition_range& range() const {
|
||||
@@ -192,7 +191,7 @@ public:
|
||||
const dht::decorated_key& key() const { return *_key; }
|
||||
void on_underlying_created() { ++_underlying_created; }
|
||||
bool digest_requested() const { return _slice.options.contains<query::partition_slice::option::with_digest>(); }
|
||||
private:
|
||||
public:
|
||||
future<> ensure_underlying(db::timeout_clock::time_point timeout) {
|
||||
if (_underlying_snapshot) {
|
||||
return create_underlying(true, timeout);
|
||||
@@ -211,18 +210,6 @@ public:
|
||||
_underlying_snapshot = {};
|
||||
_key = dk;
|
||||
}
|
||||
// Fast forwards the underlying streamed_mutation to given range.
|
||||
future<> fast_forward_to(position_range range, db::timeout_clock::time_point timeout) {
|
||||
return ensure_underlying(timeout).then([this, range = std::move(range), timeout] {
|
||||
return _underlying.underlying().fast_forward_to(std::move(range), timeout);
|
||||
});
|
||||
}
|
||||
// Gets the next fragment from the underlying reader
|
||||
future<mutation_fragment_opt> get_next_fragment(db::timeout_clock::time_point timeout) {
|
||||
return ensure_underlying(timeout).then([this, timeout] {
|
||||
return _underlying.underlying()(timeout);
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@@ -814,8 +814,10 @@ static future<> repair_cf_range(repair_info& ri,
// still do our best to repair available replicas.
std::vector<gms::inet_address> live_neighbors;
std::vector<partition_checksum> live_neighbors_checksum;
bool local_checksum_failed = false;
for (unsigned i = 0; i < checksums.size(); i++) {
if (checksums[i].failed()) {
local_checksum_failed |= (i == 0);
rlogger.warn(
"Checksum of range {} on {} failed: {}",
range,
@@ -831,7 +833,7 @@ static future<> repair_cf_range(repair_info& ri,
live_neighbors_checksum.push_back(checksums[i].get0());
}
}
if (checksums[0].failed() || live_neighbors.empty()) {
if (local_checksum_failed || live_neighbors.empty()) {
return make_ready_future<>();
}
// If one of the available checksums is different, repair

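The hunk above records a local checksum failure in a flag during the loop instead of re-checking checksums[0] afterwards. A compact sketch of the resulting control flow, written with hypothetical checksum and replica types rather than the repair_info API:

```cpp
#include <cstdint>
#include <iostream>
#include <optional>
#include <vector>

// One entry per replica: index 0 is the local node, the rest are neighbors.
// nullopt models a replica whose checksum computation failed.
using checksum_result = std::optional<uint64_t>;

// Decide whether the range needs repair: skip when the local checksum failed or
// no neighbor is available; otherwise repair if any live neighbor disagrees.
bool range_needs_repair(const std::vector<checksum_result>& checksums) {
    bool local_checksum_failed = false;
    std::vector<uint64_t> live_neighbors_checksum;
    for (unsigned i = 0; i < checksums.size(); i++) {
        if (!checksums[i]) {
            local_checksum_failed |= (i == 0);   // remember a local failure instead of re-checking later
            continue;
        }
        if (i > 0) {
            live_neighbors_checksum.push_back(*checksums[i]);
        }
    }
    if (local_checksum_failed || live_neighbors_checksum.empty()) {
        return false;                            // nothing useful to compare against
    }
    for (auto c : live_neighbors_checksum) {
        if (c != *checksums[0]) {
            return true;                         // an available checksum differs: repair
        }
    }
    return false;
}

int main() {
    std::cout << range_needs_repair({1234, 1234, 5678}) << '\n';   // 1: one neighbor differs
    std::cout << range_needs_repair({std::nullopt, 1234}) << '\n'; // 0: local checksum failed
}
```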
row_cache.cc
@@ -32,7 +32,6 @@
|
||||
#include <sys/sdt.h>
|
||||
#include "stdx.hh"
|
||||
#include "read_context.hh"
|
||||
#include "schema_upgrader.hh"
|
||||
#include "dirty_memory_manager.hh"
|
||||
#include "cache_flat_mutation_reader.hh"
|
||||
#include "real_dirty_memory_accounter.hh"
|
||||
@@ -350,13 +349,11 @@ future<> read_context::create_underlying(bool skip_first_fragment, db::timeout_c
|
||||
|
||||
static flat_mutation_reader read_directly_from_underlying(read_context& reader) {
|
||||
flat_mutation_reader res = make_delegating_reader(reader.underlying().underlying());
|
||||
if (reader.schema()->version() != reader.underlying().underlying().schema()->version()) {
|
||||
res = transform(std::move(res), schema_upgrader(reader.schema()));
|
||||
}
|
||||
if (reader.fwd() == streamed_mutation::forwarding::no) {
|
||||
res = make_nonforwardable(std::move(res), true);
|
||||
}
|
||||
return std::move(res);
|
||||
res.upgrade_schema(reader.schema());
|
||||
return res;
|
||||
}
|
||||
|
||||
// Reader which populates the cache using data from the delegate.
|
||||
@@ -947,7 +944,6 @@ future<> row_cache::do_update(external_updater eu, memtable& m, Updater updater)
|
||||
});
|
||||
|
||||
return seastar::async([this, &m, updater = std::move(updater), real_dirty_acc = std::move(real_dirty_acc)] () mutable {
|
||||
coroutine update;
|
||||
size_t size_entry;
|
||||
// In case updater fails, we must bring the cache to consistency without deferring.
|
||||
auto cleanup = defer([&m, this] {
|
||||
@@ -955,6 +951,7 @@ future<> row_cache::do_update(external_updater eu, memtable& m, Updater updater)
|
||||
_prev_snapshot_pos = {};
|
||||
_prev_snapshot = {};
|
||||
});
|
||||
coroutine update; // Destroy before cleanup to release snapshots before invalidating.
|
||||
partition_presence_checker is_present = _prev_snapshot->make_partition_presence_checker();
|
||||
while (!m.partitions.empty()) {
|
||||
with_allocator(_tracker.allocator(), [&] () {
|
||||
@@ -1026,8 +1023,10 @@ future<> row_cache::update(external_updater eu, memtable& m) {
|
||||
if (cache_i != partitions_end() && cache_i->key().equal(*_schema, mem_e.key())) {
|
||||
cache_entry& entry = *cache_i;
|
||||
upgrade_entry(entry);
|
||||
assert(entry._schema == _schema);
|
||||
_tracker.on_partition_merge();
|
||||
return entry.partition().apply_to_incomplete(*_schema, std::move(mem_e.partition()), *mem_e.schema(), _tracker.memtable_cleaner(),
|
||||
mem_e.upgrade_schema(_schema, _tracker.memtable_cleaner());
|
||||
return entry.partition().apply_to_incomplete(*_schema, std::move(mem_e.partition()), _tracker.memtable_cleaner(),
|
||||
alloc, _tracker.region(), _tracker, _underlying_phase, acc);
|
||||
} else if (cache_i->continuous()
|
||||
|| with_allocator(standard_allocator(), [&] { return is_present(mem_e.key()); })
|
||||
@@ -1039,7 +1038,8 @@ future<> row_cache::update(external_updater eu, memtable& m) {
|
||||
entry->set_continuous(cache_i->continuous());
|
||||
_tracker.insert(*entry);
|
||||
_partitions.insert_before(cache_i, *entry);
|
||||
return entry->partition().apply_to_incomplete(*_schema, std::move(mem_e.partition()), *mem_e.schema(), _tracker.memtable_cleaner(),
|
||||
mem_e.upgrade_schema(_schema, _tracker.memtable_cleaner());
|
||||
return entry->partition().apply_to_incomplete(*_schema, std::move(mem_e.partition()), _tracker.memtable_cleaner(),
|
||||
alloc, _tracker.region(), _tracker, _underlying_phase, acc);
|
||||
} else {
|
||||
return make_empty_coroutine();
|
||||
@@ -1136,8 +1136,8 @@ future<> row_cache::invalidate(external_updater eu, dht::partition_range_vector&
});
}

void row_cache::evict(const dht::partition_range& range) {
invalidate_unwrapped(range);
void row_cache::evict() {
while (_tracker.region().evict_some() == memory::reclaiming_result::reclaimed_something) {}
}

void row_cache::invalidate_unwrapped(const dht::partition_range& range) {
@@ -1224,8 +1224,11 @@ void rows_entry::on_evicted(cache_tracker& tracker) noexcept {
|
||||
partition_version& pv = partition_version::container_of(mutation_partition::container_of(
|
||||
mutation_partition::rows_type::container_of_only_member(*it)));
|
||||
if (pv.is_referenced_from_entry()) {
|
||||
cache_entry& ce = cache_entry::container_of(partition_entry::container_of(pv));
|
||||
ce.on_evicted(tracker);
|
||||
partition_entry& pe = partition_entry::container_of(pv);
|
||||
if (!pe.is_locked()) {
|
||||
cache_entry& ce = cache_entry::container_of(pe);
|
||||
ce.on_evicted(tracker);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1246,13 +1249,12 @@ flat_mutation_reader cache_entry::do_read(row_cache& rc, read_context& reader) {
|
||||
auto snp = _pe.read(rc._tracker.region(), rc._tracker.cleaner(), _schema, &rc._tracker, reader.phase());
|
||||
auto ckr = query::clustering_key_filter_ranges::get_ranges(*_schema, reader.slice(), _key.key());
|
||||
auto r = make_cache_flat_mutation_reader(_schema, _key, std::move(ckr), rc, reader.shared_from_this(), std::move(snp));
|
||||
if (reader.schema()->version() != _schema->version()) {
|
||||
r = transform(std::move(r), schema_upgrader(reader.schema()));
|
||||
}
|
||||
if (reader.fwd() == streamed_mutation::forwarding::yes) {
|
||||
r = make_forwardable(std::move(r));
|
||||
}
|
||||
return std::move(r);
|
||||
r.upgrade_schema(rc.schema());
|
||||
r.upgrade_schema(reader.schema());
|
||||
return r;
|
||||
}
|
||||
|
||||
const schema_ptr& row_cache::schema() const {
|
||||
@@ -1260,7 +1262,7 @@ const schema_ptr& row_cache::schema() const {
|
||||
}
|
||||
|
||||
void row_cache::upgrade_entry(cache_entry& e) {
|
||||
if (e._schema != _schema) {
|
||||
if (e._schema != _schema && !e.partition().is_locked()) {
|
||||
auto& r = _tracker.region();
|
||||
assert(!r.reclaiming_enabled());
|
||||
with_allocator(r.allocator(), [this, &e] {
|
||||
|
||||
@@ -549,12 +549,12 @@ public:
future<> invalidate(external_updater, const dht::partition_range& = query::full_partition_range);
future<> invalidate(external_updater, dht::partition_range_vector&&);

// Evicts entries from given range in cache.
// Evicts entries from cache.
//
// Note that this does not synchronize with the underlying source,
// it is assumed that the underlying source didn't change.
// If it did, use invalidate() instead.
void evict(const dht::partition_range& = query::full_partition_range);
void evict();

size_t partitions() const {
return _partitions.size();

@@ -59,7 +59,7 @@ def sh_command(*args):

def get_json_from_url(path):
    data = sh_command("curl", "-s", "-X", "GET", path)
    return json.loads(data)
    return json.loads(data.decode('utf-8'))

def get_api(path):
    return get_json_from_url("http://" + api_address + path)

Submodule seastar updated: 522627756d...083dc0875e
@@ -28,11 +28,25 @@
|
||||
namespace service {
|
||||
|
||||
class cache_hitrate_calculator : public seastar::async_sharded_service<cache_hitrate_calculator> {
|
||||
struct stat {
|
||||
float h = 0;
|
||||
float m = 0;
|
||||
stat& operator+=(stat& o) {
|
||||
h += o.h;
|
||||
m += o.m;
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
seastar::sharded<database>& _db;
|
||||
seastar::sharded<cache_hitrate_calculator>& _me;
|
||||
timer<lowres_clock> _timer;
|
||||
bool _stopped = false;
|
||||
float _diff = 0;
|
||||
std::unordered_map<utils::UUID, stat> _rates;
|
||||
size_t _slen = 0;
|
||||
std::string _gstate;
|
||||
future<> _done = make_ready_future();
|
||||
|
||||
future<lowres_clock::duration> recalculate_hitrates();
|
||||
void recalculate_timer();
|
||||
|
||||
@@ -181,7 +181,7 @@ future<> service::client_state::has_access(const sstring& ks, auth::permission p
|
||||
for (auto cf : { db::system_keyspace::LOCAL, db::system_keyspace::PEERS }) {
|
||||
tmp.insert(auth::make_data_resource(db::system_keyspace::NAME, cf));
|
||||
}
|
||||
for (auto cf : db::schema_tables::ALL) {
|
||||
for (auto cf : db::schema_tables::all_table_names()) {
|
||||
tmp.insert(auth::make_data_resource(db::schema_tables::NAME, cf));
|
||||
}
|
||||
return tmp;
|
||||
|
||||
@@ -204,6 +204,10 @@ future<> migration_manager::maybe_schedule_schema_pull(const utils::UUID& their_
|
||||
return make_ready_future<>();
|
||||
}
|
||||
const auto* value = ep_state->get_application_state_ptr(gms::application_state::SCHEMA);
|
||||
if (!value) {
|
||||
mlogger.debug("application_state::SCHEMA does not exist for {}, not submitting migration task", endpoint);
|
||||
return make_ready_future<>();
|
||||
}
|
||||
utils::UUID current_version{value->value};
|
||||
auto& db = proxy.get_db().local();
|
||||
if (db.get_version() == current_version) {
|
||||
@@ -529,6 +533,10 @@ future<> migration_manager::announce_new_column_family(schema_ptr cfm, api::time
|
||||
if (db.has_schema(cfm->ks_name(), cfm->cf_name())) {
|
||||
throw exceptions::already_exists_exception(cfm->ks_name(), cfm->cf_name());
|
||||
}
|
||||
if (db.column_family_exists(cfm->id())) {
|
||||
throw exceptions::invalid_request_exception(sprint("Table with ID %s already exists: %s", cfm->id(), db.find_schema(cfm->id())));
|
||||
}
|
||||
|
||||
mlogger.info("Create new ColumnFamily: {}", cfm);
|
||||
return db::schema_tables::make_create_table_mutations(keyspace.metadata(), cfm, timestamp)
|
||||
.then([announce_locally, this] (auto&& mutations) {
|
||||
|
||||
@@ -92,7 +92,7 @@ cache_hitrate_calculator::cache_hitrate_calculator(seastar::sharded<database>& d
|
||||
{}
|
||||
|
||||
void cache_hitrate_calculator::recalculate_timer() {
|
||||
recalculate_hitrates().then_wrapped([p = shared_from_this()] (future<lowres_clock::duration> f) {
|
||||
_done = recalculate_hitrates().then_wrapped([p = shared_from_this()] (future<lowres_clock::duration> f) {
|
||||
lowres_clock::duration d;
|
||||
if (f.failed()) {
|
||||
d = std::chrono::milliseconds(2000);
|
||||
@@ -112,21 +112,11 @@ void cache_hitrate_calculator::run_on(size_t master, lowres_clock::duration d) {
|
||||
}
|
||||
|
||||
future<lowres_clock::duration> cache_hitrate_calculator::recalculate_hitrates() {
|
||||
struct stat {
|
||||
float h = 0;
|
||||
float m = 0;
|
||||
stat& operator+=(stat& o) {
|
||||
h += o.h;
|
||||
m += o.m;
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
static auto non_system_filter = [&] (const std::pair<utils::UUID, lw_shared_ptr<column_family>>& cf) {
|
||||
auto non_system_filter = [&] (const std::pair<utils::UUID, lw_shared_ptr<column_family>>& cf) {
|
||||
return _db.local().find_keyspace(cf.second->schema()->ks_name()).get_replication_strategy().get_type() != locator::replication_strategy_type::local;
|
||||
};
|
||||
|
||||
auto cf_to_cache_hit_stats = [] (database& db) {
|
||||
auto cf_to_cache_hit_stats = [non_system_filter] (database& db) {
|
||||
return boost::copy_range<std::unordered_map<utils::UUID, stat>>(db.get_column_families() | boost::adaptors::filtered(non_system_filter) |
|
||||
boost::adaptors::transformed([] (const std::pair<utils::UUID, lw_shared_ptr<column_family>>& cf) {
|
||||
auto& stats = cf.second->get_row_cache().stats();
|
||||
@@ -141,17 +131,20 @@ future<lowres_clock::duration> cache_hitrate_calculator::recalculate_hitrates()
|
||||
return std::move(a);
|
||||
};
|
||||
|
||||
return _db.map_reduce0(cf_to_cache_hit_stats, std::unordered_map<utils::UUID, stat>(), sum_stats_per_cf).then([this] (std::unordered_map<utils::UUID, stat> rates) mutable {
|
||||
return _db.map_reduce0(cf_to_cache_hit_stats, std::unordered_map<utils::UUID, stat>(), sum_stats_per_cf).then([this, non_system_filter] (std::unordered_map<utils::UUID, stat> rates) mutable {
|
||||
_diff = 0;
|
||||
_gstate.reserve(_slen); // assume length did not change from previous iteration
|
||||
_slen = 0;
|
||||
_rates = std::move(rates);
|
||||
// set calculated rates on all shards
|
||||
return _db.invoke_on_all([this, rates = std::move(rates), cpuid = engine().cpu_id()] (database& db) {
|
||||
sstring gstate;
|
||||
for (auto& cf : db.get_column_families() | boost::adaptors::filtered(non_system_filter)) {
|
||||
auto it = rates.find(cf.first);
|
||||
if (it == rates.end()) { // a table may be added before map/reduce compltes and this code runs
|
||||
continue;
|
||||
return _db.invoke_on_all([this, cpuid = engine().cpu_id(), non_system_filter] (database& db) {
|
||||
return do_for_each(_rates, [this, cpuid, &db] (auto&& r) mutable {
|
||||
auto it = db.get_column_families().find(r.first);
|
||||
if (it == db.get_column_families().end()) { // a table may be added before map/reduce completes and this code runs
|
||||
return;
|
||||
}
|
||||
stat s = it->second;
|
||||
auto& cf = *it;
|
||||
stat& s = r.second;
|
||||
float rate = 0;
|
||||
if (s.h) {
|
||||
rate = s.h / (s.h + s.m);
|
||||
@@ -159,31 +152,33 @@ future<lowres_clock::duration> cache_hitrate_calculator::recalculate_hitrates()
|
||||
if (engine().cpu_id() == cpuid) {
|
||||
// calculate max difference between old rate and new one for all cfs
|
||||
_diff = std::max(_diff, std::abs(float(cf.second->get_global_cache_hit_rate()) - rate));
|
||||
gstate += sprint("%s.%s:%f;", cf.second->schema()->ks_name(), cf.second->schema()->cf_name(), rate);
|
||||
_gstate += sprint("%s.%s:%.6f;", cf.second->schema()->ks_name(), cf.second->schema()->cf_name(), rate);
|
||||
}
|
||||
cf.second->set_global_cache_hit_rate(cache_temperature(rate));
|
||||
}
|
||||
if (gstate.size()) {
|
||||
auto& g = gms::get_local_gossiper();
|
||||
auto& ss = get_local_storage_service();
|
||||
return g.add_local_application_state(gms::application_state::CACHE_HITRATES, ss.value_factory.cache_hitrates(std::move(gstate)));
|
||||
}
|
||||
return make_ready_future<>();
|
||||
});
|
||||
});
|
||||
}).then([this] {
|
||||
// if max difference during this round is big schedule next recalculate earlier
|
||||
if (_diff < 0.01) {
|
||||
return std::chrono::milliseconds(2000);
|
||||
} else {
|
||||
return std::chrono::milliseconds(500);
|
||||
}
|
||||
auto& g = gms::get_local_gossiper();
|
||||
auto& ss = get_local_storage_service();
|
||||
_slen = _gstate.size();
|
||||
return g.add_local_application_state(gms::application_state::CACHE_HITRATES, ss.value_factory.cache_hitrates(_gstate)).then([this] {
|
||||
// if max difference during this round is big schedule next recalculate earlier
|
||||
if (_diff < 0.01) {
|
||||
return std::chrono::milliseconds(2000);
|
||||
} else {
|
||||
return std::chrono::milliseconds(500);
|
||||
}
|
||||
});
|
||||
}).finally([this] {
|
||||
_gstate = std::string(); // free memory, do not trust clear() to do that for string
|
||||
_rates.clear();
|
||||
});
|
||||
}
|
||||
|
||||
future<> cache_hitrate_calculator::stop() {
|
||||
_timer.cancel();
|
||||
_stopped = true;
|
||||
return make_ready_future<>();
|
||||
return std::move(_done);
|
||||
}
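In the calculator hunks above, the per-table rate is hits / (hits + misses), and the next recalculation is scheduled sooner (500 ms) when any table's rate moved by at least 0.01 since the previous round, otherwise in 2000 ms. A small numeric sketch of that rate and interval choice, with hypothetical sample values:

```cpp
#include <chrono>
#include <cmath>
#include <iostream>

// Per-table cache hit rate and the adaptive recalculation interval:
// rate = hits / (hits + misses); if the largest change since the previous round
// is below 0.01 the next round runs in 2000 ms, otherwise in 500 ms.
float hit_rate(float hits, float misses) {
    return hits ? hits / (hits + misses) : 0.0f;
}

std::chrono::milliseconds next_interval(float max_diff) {
    return max_diff < 0.01f ? std::chrono::milliseconds(2000)
                            : std::chrono::milliseconds(500);
}

int main() {
    float previous = 0.90f;                    // hypothetical rate from the last round
    float current = hit_rate(80.0f, 20.0f);    // 0.80
    float diff = std::abs(current - previous); // 0.10 -> recalculate sooner
    std::cout << current << ' ' << next_interval(diff).count() << "ms\n";  // 0.8 500ms
}
```

The stop() change serves the same goal as keeping _done around: instead of returning a ready future while a recalculation may still be in flight, stop() now returns the saved _done future so shutdown waits for the last round to finish.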
@@ -382,7 +382,7 @@ public:
std::chrono::microseconds calculate_delay(db::view::update_backlog backlog) {
constexpr auto delay_limit_us = 1000000;
auto adjust = [] (float x) { return x * x * x; };
auto budget = std::min(std::chrono::microseconds(0), std::chrono::microseconds(_expire_timer.get_timeout() - storage_proxy::clock_type::now()));
auto budget = std::max(std::chrono::microseconds(0), std::chrono::microseconds(_expire_timer.get_timeout() - storage_proxy::clock_type::now()));
return std::min(
budget,
std::chrono::microseconds(uint32_t(adjust(backlog.relative_size()) * delay_limit_us)));
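The hunk above fixes the throttling budget in calculate_delay(): taking std::min against zero means the budget can never be positive, so the computed delay was always clamped to nothing; std::max keeps whatever time remains until the request would expire. A standalone sketch of the same shape of calculation, using plain chrono and a hypothetical relative backlog value rather than the storage_proxy types:

```cpp
#include <algorithm>
#include <chrono>
#include <cstdint>
#include <iostream>

// Delay a view-update-generating write in proportion to the (cubed) relative
// backlog, but never past the time budget left before the request times out.
std::chrono::microseconds calculate_delay_sketch(float relative_backlog,
                                                 std::chrono::microseconds time_left) {
    constexpr auto delay_limit_us = 1000000;
    auto adjust = [] (float x) { return x * x * x; };
    // max(0, time_left): a negative remaining budget means no delay is allowed;
    // using min() here, as the old code did, always produces a zero budget.
    auto budget = std::max(std::chrono::microseconds(0), time_left);
    return std::min(budget,
            std::chrono::microseconds(uint32_t(adjust(relative_backlog) * delay_limit_us)));
}

int main() {
    using namespace std::chrono;
    // Half-full backlog, 300 ms of budget left: min(300000, 0.5^3 * 1000000) = 125000 us.
    std::cout << calculate_delay_sketch(0.5f, microseconds(300000)).count() << '\n';
}
```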
@@ -616,9 +616,7 @@ void storage_proxy::maybe_update_view_backlog_of(gms::inet_address replica, stdx
|
||||
}
|
||||
|
||||
db::view::update_backlog storage_proxy::get_view_update_backlog() const {
|
||||
auto memory_backlog = get_db().local().get_view_update_backlog();
|
||||
auto hints_backlog = db::view::update_backlog{_hints_for_views_manager.backlog_size(), _hints_for_views_manager.max_backlog_size()};
|
||||
return _max_view_update_backlog.add_fetch(engine().cpu_id(), std::max(memory_backlog, hints_backlog));
|
||||
return _max_view_update_backlog.add_fetch(engine().cpu_id(), get_db().local().get_view_update_backlog());
|
||||
}
|
||||
|
||||
db::view::update_backlog storage_proxy::get_backlog_of(gms::inet_address ep) const {
|
||||
@@ -1449,6 +1447,22 @@ future<> storage_proxy::mutate_begin(std::vector<unique_response_handler> ids, d
|
||||
stdx::optional<clock_type::time_point> timeout_opt) {
|
||||
return parallel_for_each(ids, [this, cl, timeout_opt] (unique_response_handler& protected_response) {
|
||||
auto response_id = protected_response.id;
|
||||
// This function, mutate_begin(), is called after a preemption point
|
||||
// so it's possible that other code besides our caller just ran. In
|
||||
// particular, Scylla may have noticed that a remote node went down,
|
||||
// called storage_proxy::on_down(), and removed some of the ongoing
|
||||
// handlers, including this id. If this happens, we need to ignore
|
||||
// this id - not try to look it up or start a send.
|
||||
if (_response_handlers.find(response_id) == _response_handlers.end()) {
|
||||
protected_response.release(); // Don't try to remove this id again
|
||||
// Requests that time-out normally below after response_wait()
|
||||
// result in an exception (see ~abstract_write_response_handler())
|
||||
// However, here we no longer have the handler or its information
|
||||
// to put in the exception. The exception is not needed for
|
||||
// correctness (e.g., hints are written by timeout_cb(), not
|
||||
// because of an exception here).
|
||||
return make_exception_future<>(std::runtime_error("unstarted write cancelled"));
|
||||
}
|
||||
// it is better to send first and hint afterwards to reduce latency
|
||||
// but request may complete before hint_to_dead_endpoints() is called and
|
||||
// response_id handler will be removed, so we will have to do hint with separate
|
||||
@@ -2737,8 +2751,8 @@ public:
|
||||
|
||||
// build reconcilable_result from reconciled data
|
||||
// traverse backwards since large keys are at the start
|
||||
std::vector<partition> vec;
|
||||
auto r = boost::accumulate(reconciled_partitions | boost::adaptors::reversed, std::ref(vec), [] (std::vector<partition>& a, const mutation_and_live_row_count& m_a_rc) {
|
||||
utils::chunked_vector<partition> vec;
|
||||
auto r = boost::accumulate(reconciled_partitions | boost::adaptors::reversed, std::ref(vec), [] (utils::chunked_vector<partition>& a, const mutation_and_live_row_count& m_a_rc) {
|
||||
a.emplace_back(partition(m_a_rc.live_row_count, freeze(m_a_rc.mut)));
|
||||
return std::ref(a);
|
||||
});
|
||||
|
||||
@@ -101,6 +101,7 @@ static const sstring ROLES_FEATURE = "ROLES";
|
||||
static const sstring LA_SSTABLE_FEATURE = "LA_SSTABLE_FORMAT";
|
||||
static const sstring STREAM_WITH_RPC_STREAM = "STREAM_WITH_RPC_STREAM";
|
||||
static const sstring MC_SSTABLE_FEATURE = "MC_SSTABLE_FORMAT";
|
||||
static const sstring CORRECT_STATIC_COMPACT_IN_MC = "CORRECT_STATIC_COMPACT_IN_MC";
|
||||
|
||||
distributed<storage_service> _the_storage_service;
|
||||
|
||||
@@ -124,9 +125,27 @@ int get_generation_number() {
|
||||
return generation_number;
|
||||
}
|
||||
|
||||
storage_service::storage_service(distributed<database>& db, sharded<auth::service>& auth_service, sharded<db::system_distributed_keyspace>& sys_dist_ks)
|
||||
: _db(db)
|
||||
storage_service::storage_service(distributed<database>& db, sharded<auth::service>& auth_service, sharded<db::system_distributed_keyspace>& sys_dist_ks,
|
||||
gms::feature_service& feature_service)
|
||||
: _feature_service(feature_service)
|
||||
, _db(db)
|
||||
, _auth_service(auth_service)
|
||||
, _range_tombstones_feature(_feature_service, RANGE_TOMBSTONES_FEATURE)
|
||||
, _large_partitions_feature(_feature_service, LARGE_PARTITIONS_FEATURE)
|
||||
, _materialized_views_feature(_feature_service, MATERIALIZED_VIEWS_FEATURE)
|
||||
, _counters_feature(_feature_service, COUNTERS_FEATURE)
|
||||
, _indexes_feature(_feature_service, INDEXES_FEATURE)
|
||||
, _digest_multipartition_read_feature(_feature_service, DIGEST_MULTIPARTITION_READ_FEATURE)
|
||||
, _correct_counter_order_feature(_feature_service, CORRECT_COUNTER_ORDER_FEATURE)
|
||||
, _schema_tables_v3(_feature_service, SCHEMA_TABLES_V3)
|
||||
, _correct_non_compound_range_tombstones(_feature_service, CORRECT_NON_COMPOUND_RANGE_TOMBSTONES)
|
||||
, _write_failure_reply_feature(_feature_service, WRITE_FAILURE_REPLY_FEATURE)
|
||||
, _xxhash_feature(_feature_service, XXHASH_FEATURE)
|
||||
, _roles_feature(_feature_service, ROLES_FEATURE)
|
||||
, _la_sstable_feature(_feature_service, LA_SSTABLE_FEATURE)
|
||||
, _stream_with_rpc_stream_feature(_feature_service, STREAM_WITH_RPC_STREAM)
|
||||
, _mc_sstable_feature(_feature_service, MC_SSTABLE_FEATURE)
|
||||
, _correct_static_compact_in_mc(_feature_service, CORRECT_STATIC_COMPACT_IN_MC)
|
||||
, _replicate_action([this] { return do_replicate_to_all_cores(); })
|
||||
, _update_pending_ranges_action([this] { return do_update_pending_ranges(); })
|
||||
, _sys_dist_ks(sys_dist_ks) {
|
||||
@@ -137,6 +156,25 @@ storage_service::storage_service(distributed<database>& db, sharded<auth::servic
|
||||
commit_error.connect([this] { isolate_on_commit_error(); });
|
||||
}
|
||||
|
||||
void storage_service::enable_all_features() {
|
||||
_range_tombstones_feature.enable();
|
||||
_large_partitions_feature.enable();
|
||||
_materialized_views_feature.enable();
|
||||
_counters_feature.enable();
|
||||
_indexes_feature.enable();
|
||||
_digest_multipartition_read_feature.enable();
|
||||
_correct_counter_order_feature.enable();
|
||||
_schema_tables_v3.enable();
|
||||
_correct_non_compound_range_tombstones.enable();
|
||||
_write_failure_reply_feature.enable();
|
||||
_xxhash_feature.enable();
|
||||
_roles_feature.enable();
|
||||
_la_sstable_feature.enable();
|
||||
_stream_with_rpc_stream_feature.enable();
|
||||
_mc_sstable_feature.enable();
|
||||
_correct_static_compact_in_mc.enable();
|
||||
}
|
||||
|
||||
enum class node_external_status {
|
||||
UNKNOWN = 0,
|
||||
STARTING = 1,
|
||||
@@ -210,7 +248,8 @@ sstring storage_service::get_config_supported_features() {
|
||||
LA_SSTABLE_FEATURE,
|
||||
STREAM_WITH_RPC_STREAM,
|
||||
MATERIALIZED_VIEWS_FEATURE,
|
||||
INDEXES_FEATURE
|
||||
INDEXES_FEATURE,
|
||||
CORRECT_STATIC_COMPACT_IN_MC,
|
||||
};
|
||||
auto& config = service::get_local_storage_service()._db.local().get_config();
|
||||
if (config.enable_sstables_mc_format()) {
|
||||
@@ -276,7 +315,7 @@ bool storage_service::should_bootstrap() {
|
||||
}
|
||||
|
||||
// Runs inside seastar::async context
|
||||
void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints, bind_messaging_port do_bind) {
|
||||
void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints, const std::unordered_map<gms::inet_address, sstring>& loaded_peer_features, bind_messaging_port do_bind) {
|
||||
if (_joined) {
|
||||
return;
|
||||
}
|
||||
@@ -306,25 +345,20 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
|
||||
if (!is_auto_bootstrap()) {
|
||||
throw std::runtime_error("Trying to replace_address with auto_bootstrap disabled will not work, check your configuration");
|
||||
}
|
||||
_bootstrap_tokens = prepare_replacement_info().get0();
|
||||
_bootstrap_tokens = prepare_replacement_info(loaded_peer_features).get0();
|
||||
app_states.emplace(gms::application_state::TOKENS, value_factory.tokens(_bootstrap_tokens));
|
||||
app_states.emplace(gms::application_state::STATUS, value_factory.hibernate(true));
|
||||
} else if (should_bootstrap()) {
|
||||
check_for_endpoint_collision().get();
|
||||
check_for_endpoint_collision(loaded_peer_features).get();
|
||||
} else {
|
||||
auto& gossiper = gms::get_local_gossiper();
|
||||
auto seeds = gms::get_local_gossiper().get_seeds();
|
||||
auto my_ep = get_broadcast_address();
|
||||
auto peer_features = db::system_keyspace::load_peer_features().get0();
|
||||
slogger.info("load_peer_features: peer_features size={}", peer_features.size());
|
||||
for (auto& x : peer_features) {
|
||||
slogger.info("load_peer_features: peer={}, supported_features={}", x.first, x.second);
|
||||
}
|
||||
auto local_features = get_config_supported_features();
|
||||
|
||||
if (seeds.count(my_ep)) {
|
||||
// This node is a seed node
|
||||
if (peer_features.empty()) {
|
||||
if (loaded_peer_features.empty()) {
|
||||
// This is a competely new seed node, skip the check
|
||||
slogger.info("Checking remote features skipped, since this node is a new seed node which knows nothing about the cluster");
|
||||
} else {
|
||||
@@ -332,7 +366,7 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
|
||||
if (seeds.size() == 1) {
|
||||
// This node is the only seed node, check features with system table
|
||||
slogger.info("Checking remote features with system table, since this node is the only seed node");
|
||||
gossiper.check_knows_remote_features(local_features, peer_features);
|
||||
gossiper.check_knows_remote_features(local_features, loaded_peer_features);
|
||||
} else {
|
||||
// More than one seed node in the seed list, do shadow round with other seed nodes
|
||||
bool ok;
|
||||
@@ -347,11 +381,11 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
|
||||
}
|
||||
|
||||
if (ok) {
|
||||
gossiper.check_knows_remote_features(local_features);
|
||||
gossiper.check_knows_remote_features(local_features, loaded_peer_features);
|
||||
} else {
|
||||
// Check features with system table
|
||||
slogger.info("Checking remote features with gossip failed, fallback to check with system table");
|
||||
gossiper.check_knows_remote_features(local_features, peer_features);
|
||||
gossiper.check_knows_remote_features(local_features, loaded_peer_features);
|
||||
}
|
||||
|
||||
gossiper.reset_endpoint_state_map().get();
|
||||
@@ -367,7 +401,7 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
|
||||
// (missing features) to join the cluser.
|
||||
slogger.info("Checking remote features with gossip");
|
||||
gossiper.do_shadow_round().get();
|
||||
gossiper.check_knows_remote_features(local_features);
|
||||
gossiper.check_knows_remote_features(local_features, loaded_peer_features);
|
||||
gossiper.reset_endpoint_state_map().get();
|
||||
for (auto ep : loaded_endpoints) {
|
||||
gossiper.add_saved_endpoint(ep);
|
||||
@@ -375,6 +409,14 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
|
||||
}
|
||||
}
|
||||
|
||||
// If this is a restarting node, we should update tokens before gossip starts
|
||||
auto my_tokens = db::system_keyspace::get_saved_tokens().get0();
|
||||
bool restarting_normal_node = db::system_keyspace::bootstrap_complete() && !db().local().is_replacing() && !my_tokens.empty();
|
||||
if (restarting_normal_node) {
|
||||
slogger.info("Restarting a node in NORMAL status");
|
||||
_token_metadata.update_normal_tokens(my_tokens, get_broadcast_address());
|
||||
}
|
||||
|
||||
// have to start the gossip service before we can see any info on other nodes. this is necessary
|
||||
// for bootstrap to get the load info it needs.
|
||||
// (we won't be part of the storage ring though until we add a counterId to our state, below.)
|
||||
@@ -385,6 +427,12 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
|
||||
}).get();
|
||||
auto features = get_config_supported_features();
|
||||
_token_metadata.update_host_id(local_host_id, get_broadcast_address());
|
||||
|
||||
// Replicate the tokens early because once gossip runs other nodes
|
||||
// might send reads/writes to this node. Replicate it early to make
|
||||
// sure the tokens are valid on all the shards.
|
||||
replicate_to_all_cores().get();
|
||||
|
||||
auto broadcast_rpc_address = utils::fb_utilities::get_broadcast_rpc_address();
|
||||
app_states.emplace(gms::application_state::NET_VERSION, value_factory.network_version());
|
||||
app_states.emplace(gms::application_state::HOST_ID, value_factory.host_id(local_host_id));
|
||||
@@ -395,6 +443,10 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
|
||||
app_states.emplace(gms::application_state::SCHEMA_TABLES_VERSION, versioned_value(db::schema_tables::version));
|
||||
app_states.emplace(gms::application_state::RPC_READY, value_factory.cql_ready(false));
|
||||
app_states.emplace(gms::application_state::VIEW_BACKLOG, versioned_value(""));
|
||||
if (restarting_normal_node) {
|
||||
app_states.emplace(gms::application_state::TOKENS, value_factory.tokens(my_tokens));
|
||||
app_states.emplace(gms::application_state::STATUS, value_factory.normal(my_tokens));
|
||||
}
|
||||
slogger.info("Starting up server gossip");
|
||||
|
||||
auto& gossiper = gms::get_local_gossiper();
|
||||
@@ -408,9 +460,6 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
|
||||
auto& proxy = service::get_storage_proxy();
|
||||
// gossip Schema.emptyVersion forcing immediate check for schema updates (see MigrationManager#maybeScheduleSchemaPull)
|
||||
update_schema_version_and_announce(proxy).get();// Ensure we know our own actual Schema UUID in preparation for updates
|
||||
get_storage_service().invoke_on_all([] (auto& ss) {
|
||||
ss.register_features();
|
||||
}).get();
|
||||
#if 0
|
||||
if (!MessagingService.instance().isListening())
|
||||
MessagingService.instance().listen(FBUtilities.getLocalAddress());
|
||||
@@ -419,24 +468,10 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
|
||||
HintedHandOffManager.instance.start();
|
||||
BatchlogManager.instance.start();
|
||||
#endif
|
||||
}
|
||||
|
||||
void storage_service::register_features() {
|
||||
_range_tombstones_feature = gms::feature(RANGE_TOMBSTONES_FEATURE);
|
||||
_large_partitions_feature = gms::feature(LARGE_PARTITIONS_FEATURE);
|
||||
_counters_feature = gms::feature(COUNTERS_FEATURE);
|
||||
_digest_multipartition_read_feature = gms::feature(DIGEST_MULTIPARTITION_READ_FEATURE);
|
||||
_correct_counter_order_feature = gms::feature(CORRECT_COUNTER_ORDER_FEATURE);
|
||||
_schema_tables_v3 = gms::feature(SCHEMA_TABLES_V3);
|
||||
_correct_non_compound_range_tombstones = gms::feature(CORRECT_NON_COMPOUND_RANGE_TOMBSTONES);
|
||||
_write_failure_reply_feature = gms::feature(WRITE_FAILURE_REPLY_FEATURE);
|
||||
_xxhash_feature = gms::feature(XXHASH_FEATURE);
|
||||
_roles_feature = gms::feature(ROLES_FEATURE);
|
||||
_la_sstable_feature = gms::feature(LA_SSTABLE_FEATURE);
|
||||
_stream_with_rpc_stream_feature = gms::feature(STREAM_WITH_RPC_STREAM);
|
||||
_mc_sstable_feature = gms::feature(MC_SSTABLE_FEATURE);
|
||||
_materialized_views_feature = gms::feature(MATERIALIZED_VIEWS_FEATURE);
|
||||
_indexes_feature = gms::feature(INDEXES_FEATURE);
|
||||
// Wait for gossip to settle so that the fetures will be enabled
|
||||
if (do_bind) {
|
||||
gms::get_local_gossiper().wait_for_gossip_to_settle().get();
|
||||
}
|
||||
}
|
||||
|
||||
// Runs inside seastar::async context
|
||||
@@ -478,13 +513,9 @@ void storage_service::join_token_ring(int delay) {
|
||||
db::system_keyspace::set_bootstrap_state(db::system_keyspace::bootstrap_state::IN_PROGRESS).get();
|
||||
}
|
||||
set_mode(mode::JOINING, "waiting for ring information", true);
|
||||
// first sleep the delay to make sure we see all our peers
|
||||
for (int i = 0; i < delay; i += 1000) {
|
||||
// if we see schema, we can proceed to the next check directly
|
||||
if (_db.local().get_version() != database::empty_version) {
|
||||
slogger.debug("got schema: {}", _db.local().get_version());
|
||||
break;
|
||||
}
|
||||
auto& gossiper = gms::get_gossiper().local();
|
||||
// first sleep the delay to make sure we see *at least* one other node
|
||||
for (int i = 0; i < delay && gossiper.get_live_members().size() < 2; i += 1000) {
|
||||
sleep(std::chrono::seconds(1)).get();
|
||||
}
|
||||
// if our schema hasn't matched yet, keep sleeping until it does
|
||||
@@ -541,7 +572,6 @@ void storage_service::join_token_ring(int delay) {
|
||||
for (auto token : _bootstrap_tokens) {
|
||||
auto existing = _token_metadata.get_endpoint(token);
|
||||
if (existing) {
|
||||
auto& gossiper = gms::get_local_gossiper();
|
||||
auto* eps = gossiper.get_endpoint_state_for_endpoint_ptr(*existing);
|
||||
if (eps && eps->get_update_timestamp() > gms::gossiper::clk::now() - std::chrono::milliseconds(delay)) {
|
||||
throw std::runtime_error("Cannot replace a live node...");
|
||||
@@ -685,6 +715,7 @@ void storage_service::bootstrap(std::unordered_set<token> tokens) {
|
||||
} else {
|
||||
// Dont set any state for the node which is bootstrapping the existing token...
|
||||
_token_metadata.update_normal_tokens(tokens, get_broadcast_address());
|
||||
replicate_to_all_cores().get();
|
||||
auto replace_addr = db().local().get_replace_address();
|
||||
if (replace_addr) {
|
||||
slogger.debug("Removing replaced endpoint {} from system.peers", *replace_addr);
|
||||
@@ -1441,7 +1472,13 @@ future<> storage_service::init_server(int delay, bind_messaging_port do_bind) {
|
||||
}
|
||||
}
|
||||
|
||||
prepare_to_join(std::move(loaded_endpoints), do_bind);
|
||||
auto loaded_peer_features = db::system_keyspace::load_peer_features().get0();
|
||||
slogger.info("loaded_peer_features: peer_features size={}", loaded_peer_features.size());
|
||||
for (auto& x : loaded_peer_features) {
|
||||
slogger.info("loaded_peer_features: peer={}, supported_features={}", x.first, x.second);
|
||||
}
|
||||
|
||||
prepare_to_join(std::move(loaded_endpoints), loaded_peer_features, do_bind);
|
||||
#if 0
|
||||
// Has to be called after the host id has potentially changed in prepareToJoin().
|
||||
for (ColumnFamilyStore cfs : ColumnFamilyStore.all())
|
||||
@@ -1455,6 +1492,7 @@ future<> storage_service::init_server(int delay, bind_messaging_port do_bind) {
|
||||
auto tokens = db::system_keyspace::get_saved_tokens().get0();
|
||||
if (!tokens.empty()) {
|
||||
_token_metadata.update_normal_tokens(tokens, get_broadcast_address());
|
||||
replicate_to_all_cores().get();
|
||||
// order is important here, the gossiper can fire in between adding these two states. It's ok to send TOKENS without STATUS, but *not* vice versa.
|
||||
gossiper.add_local_application_state({
|
||||
{ gms::application_state::TOKENS, value_factory.tokens(tokens) },
|
||||
@@ -1518,20 +1556,21 @@ future<> storage_service::stop() {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
future<> storage_service::check_for_endpoint_collision() {
|
||||
future<> storage_service::check_for_endpoint_collision(const std::unordered_map<gms::inet_address, sstring>& loaded_peer_features) {
|
||||
slogger.debug("Starting shadow gossip round to check for endpoint collision");
|
||||
#if 0
|
||||
if (!MessagingService.instance().isListening())
|
||||
MessagingService.instance().listen(FBUtilities.getLocalAddress());
|
||||
#endif
|
||||
return seastar::async([this] {
|
||||
return seastar::async([this, loaded_peer_features] {
|
||||
auto& gossiper = gms::get_local_gossiper();
|
||||
auto t = gms::gossiper::clk::now();
|
||||
bool found_bootstrapping_node = false;
|
||||
auto local_features = get_config_supported_features();
|
||||
do {
|
||||
slogger.info("Checking remote features with gossip");
|
||||
gossiper.do_shadow_round().get();
|
||||
gossiper.check_knows_remote_features(get_config_supported_features());
|
||||
gossiper.check_knows_remote_features(local_features, loaded_peer_features);
|
||||
auto addr = get_broadcast_address();
|
||||
if (!gossiper.is_safe_for_bootstrap(addr)) {
|
||||
throw std::runtime_error(sprint("A node with address %s already exists, cancelling join. "
|
||||
@@ -1583,7 +1622,7 @@ void storage_service::remove_endpoint(inet_address endpoint) {
|
||||
}).get();
|
||||
}
|
||||
|
||||
future<std::unordered_set<token>> storage_service::prepare_replacement_info() {
|
||||
future<std::unordered_set<token>> storage_service::prepare_replacement_info(const std::unordered_map<gms::inet_address, sstring>& loaded_peer_features) {
|
||||
if (!db().local().get_replace_address()) {
|
||||
throw std::runtime_error(sprint("replace_address is empty"));
|
||||
}
|
||||
@@ -1599,9 +1638,10 @@ future<std::unordered_set<token>> storage_service::prepare_replacement_info() {
|
||||
|
||||
// make magic happen
|
||||
slogger.info("Checking remote features with gossip");
|
||||
return gms::get_local_gossiper().do_shadow_round().then([this, replace_address] {
|
||||
return gms::get_local_gossiper().do_shadow_round().then([this, loaded_peer_features, replace_address] {
|
||||
auto& gossiper = gms::get_local_gossiper();
|
||||
gossiper.check_knows_remote_features(get_config_supported_features());
|
||||
auto local_features = get_config_supported_features();
|
||||
gossiper.check_knows_remote_features(local_features, loaded_peer_features);
|
||||
// now that we've gossiped at least once, we should be able to find the node we're replacing
|
||||
auto* state = gossiper.get_endpoint_state_for_endpoint_ptr(replace_address);
|
||||
if (!state) {
|
||||
@@ -2106,6 +2146,7 @@ future<> storage_service::start_native_transport() {
|
||||
auto cred = std::make_shared<seastar::tls::credentials_builder>();
|
||||
|
||||
cred->set_dh_level(seastar::tls::dh_params::level::MEDIUM);
|
||||
cred->set_priority_string(db::config::default_tls_priority);
|
||||
|
||||
if (ceo.count("priority_string")) {
|
||||
cred->set_priority_string(ceo.at("priority_string"));
|
||||
@@ -2466,15 +2507,17 @@ future<> storage_service::rebuild(sstring source_dc) {
|
||||
if (source_dc != "") {
|
||||
streamer->add_source_filter(std::make_unique<dht::range_streamer::single_datacenter_filter>(source_dc));
|
||||
}
|
||||
for (const auto& keyspace_name : ss._db.local().get_non_system_keyspaces()) {
|
||||
streamer->add_ranges(keyspace_name, ss.get_local_ranges(keyspace_name));
|
||||
}
|
||||
return streamer->stream_async().then([streamer] {
|
||||
slogger.info("Streaming for rebuild successful");
|
||||
}).handle_exception([] (auto ep) {
|
||||
// This is used exclusively through JMX, so log the full trace but only throw a simple RTE
|
||||
slogger.warn("Error while rebuilding node: {}", std::current_exception());
|
||||
return make_exception_future<>(std::move(ep));
|
||||
auto keyspaces = make_lw_shared<std::vector<sstring>>(ss._db.local().get_non_system_keyspaces());
|
||||
return do_for_each(*keyspaces, [keyspaces, streamer, &ss] (sstring& keyspace_name) {
|
||||
return streamer->add_ranges(keyspace_name, ss.get_local_ranges(keyspace_name));
|
||||
}).then([streamer] {
|
||||
return streamer->stream_async().then([streamer] {
|
||||
slogger.info("Streaming for rebuild successful");
|
||||
}).handle_exception([] (auto ep) {
|
||||
// This is used exclusively through JMX, so log the full trace but only throw a simple RTE
|
||||
slogger.warn("Error while rebuilding node: {}", std::current_exception());
|
||||
return make_exception_future<>(std::move(ep));
|
||||
});
|
||||
});
|
||||
});
|
||||
}
|
||||
@@ -3306,5 +3349,14 @@ void storage_service::notify_cql_change(inet_address endpoint, bool ready)
|
||||
}
|
||||
}
|
||||
|
||||
future<> init_storage_service(distributed<database>& db, sharded<auth::service>& auth_service, sharded<db::system_distributed_keyspace>& sys_dist_ks,
|
||||
sharded<gms::feature_service>& feature_service) {
|
||||
return service::get_storage_service().start(std::ref(db), std::ref(auth_service), std::ref(sys_dist_ks), std::ref(feature_service));
|
||||
}
|
||||
|
||||
future<> deinit_storage_service() {
|
||||
return service::get_storage_service().stop();
|
||||
}
|
||||
|
||||
} // namespace service
|
||||
|
||||
|
||||
@@ -71,6 +71,10 @@ namespace dht {
|
||||
class boot_strapper;
|
||||
}
|
||||
|
||||
namespace gms {
|
||||
class feature_service;
|
||||
};
|
||||
|
||||
namespace service {
|
||||
|
||||
class load_broadcaster;
|
||||
@@ -120,6 +124,7 @@ private:
|
||||
/* JMX notification serial number counter */
|
||||
private final AtomicLong notificationSerialNumber = new AtomicLong();
|
||||
#endif
|
||||
gms::feature_service& _feature_service;
|
||||
distributed<database>& _db;
|
||||
sharded<auth::service>& _auth_service;
|
||||
int _update_jobs{0};
|
||||
@@ -139,7 +144,7 @@ private:
|
||||
bool _stream_manager_stopped = false;
|
||||
seastar::metrics::metric_groups _metrics;
|
||||
public:
|
||||
storage_service(distributed<database>& db, sharded<auth::service>&, sharded<db::system_distributed_keyspace>&);
|
||||
storage_service(distributed<database>& db, sharded<auth::service>&, sharded<db::system_distributed_keyspace>&, gms::feature_service& feature_service);
|
||||
void isolate_on_error();
|
||||
void isolate_on_commit_error();
|
||||
|
||||
@@ -290,24 +295,9 @@ private:
|
||||
gms::feature _la_sstable_feature;
|
||||
gms::feature _stream_with_rpc_stream_feature;
|
||||
gms::feature _mc_sstable_feature;
|
||||
gms::feature _correct_static_compact_in_mc;
|
||||
public:
|
||||
void enable_all_features() {
|
||||
_range_tombstones_feature.enable();
|
||||
_large_partitions_feature.enable();
|
||||
_materialized_views_feature.enable();
|
||||
_counters_feature.enable();
|
||||
_indexes_feature.enable();
|
||||
_digest_multipartition_read_feature.enable();
|
||||
_correct_counter_order_feature.enable();
|
||||
_schema_tables_v3.enable();
|
||||
_correct_non_compound_range_tombstones.enable();
|
||||
_write_failure_reply_feature.enable();
|
||||
_xxhash_feature.enable();
|
||||
_roles_feature.enable();
|
||||
_la_sstable_feature.enable();
|
||||
_stream_with_rpc_stream_feature.enable();
|
||||
_mc_sstable_feature.enable();
|
||||
}
|
||||
void enable_all_features();
|
||||
|
||||
void finish_bootstrapping() {
|
||||
_is_bootstrap_mode = false;
|
||||
@@ -400,9 +390,9 @@ public:
|
||||
}
|
||||
#endif
|
||||
public:
|
||||
future<std::unordered_set<token>> prepare_replacement_info();
|
||||
future<std::unordered_set<token>> prepare_replacement_info(const std::unordered_map<gms::inet_address, sstring>& loaded_peer_features);
|
||||
|
||||
future<> check_for_endpoint_collision();
|
||||
future<> check_for_endpoint_collision(const std::unordered_map<gms::inet_address, sstring>& loaded_peer_features);
|
||||
#if 0
|
||||
|
||||
// for testing only
|
||||
@@ -464,8 +454,7 @@ public:
|
||||
#endif
|
||||
private:
|
||||
bool should_bootstrap();
|
||||
void prepare_to_join(std::vector<inet_address> loaded_endpoints, bind_messaging_port do_bind = bind_messaging_port::yes);
|
||||
void register_features();
|
||||
void prepare_to_join(std::vector<inet_address> loaded_endpoints, const std::unordered_map<gms::inet_address, sstring>& loaded_peer_features, bind_messaging_port do_bind = bind_messaging_port::yes);
|
||||
void join_token_ring(int delay);
|
||||
public:
|
||||
future<> join_ring();
|
||||
@@ -2293,6 +2282,10 @@ public:
|
||||
bool cluster_supports_mc_sstable() const {
|
||||
return bool(_mc_sstable_feature);
|
||||
}
|
||||
|
||||
const gms::feature& cluster_supports_correct_static_compact_in_mc() const {
|
||||
return _correct_static_compact_in_mc;
|
||||
}
|
||||
private:
|
||||
future<> set_cql_ready(bool ready);
|
||||
private:
|
||||
@@ -2303,12 +2296,8 @@ private:
|
||||
void notify_cql_change(inet_address endpoint, bool ready);
|
||||
};
|
||||
|
||||
inline future<> init_storage_service(distributed<database>& db, sharded<auth::service>& auth_service, sharded<db::system_distributed_keyspace>& sys_dist_ks) {
|
||||
return service::get_storage_service().start(std::ref(db), std::ref(auth_service), std::ref(sys_dist_ks));
|
||||
}
|
||||
|
||||
inline future<> deinit_storage_service() {
|
||||
return service::get_storage_service().stop();
|
||||
}
|
||||
future<> init_storage_service(distributed<database>& db, sharded<auth::service>& auth_service, sharded<db::system_distributed_keyspace>& sys_dist_ks,
|
||||
sharded<gms::feature_service>& feature_service);
|
||||
future<> deinit_storage_service();
|
||||
|
||||
}
|
||||
|
||||
@@ -179,6 +179,8 @@ struct compaction_read_monitor_generator final : public read_monitor_generator {
|
||||
void remove_sstable(bool is_tracking) {
|
||||
if (is_tracking) {
|
||||
_cf.get_compaction_strategy().get_backlog_tracker().remove_sstable(_sst);
|
||||
} else if (_sst) {
|
||||
_cf.get_compaction_strategy().get_backlog_tracker().revert_charges(_sst);
|
||||
}
|
||||
_sst = {};
|
||||
}
|
||||
@@ -303,6 +305,7 @@ public:
|
||||
class compaction {
|
||||
protected:
|
||||
column_family& _cf;
|
||||
schema_ptr _schema;
|
||||
std::vector<shared_sstable> _sstables;
|
||||
uint64_t _max_sstable_size;
|
||||
uint32_t _sstable_level;
|
||||
@@ -313,6 +316,7 @@ protected:
|
||||
protected:
|
||||
compaction(column_family& cf, std::vector<shared_sstable> sstables, uint64_t max_sstable_size, uint32_t sstable_level)
|
||||
: _cf(cf)
|
||||
, _schema(cf.schema())
|
||||
, _sstables(std::move(sstables))
|
||||
, _max_sstable_size(max_sstable_size)
|
||||
, _sstable_level(sstable_level)
|
||||
@@ -361,10 +365,9 @@ private:
|
||||
virtual flat_mutation_reader make_sstable_reader(lw_shared_ptr<sstables::sstable_set> ssts) const = 0;
|
||||
|
||||
flat_mutation_reader setup() {
|
||||
auto ssts = make_lw_shared<sstables::sstable_set>(_cf.get_compaction_strategy().make_sstable_set(_cf.schema()));
|
||||
auto schema = _cf.schema();
|
||||
auto ssts = make_lw_shared<sstables::sstable_set>(_cf.get_compaction_strategy().make_sstable_set(_schema));
|
||||
sstring formatted_msg = "[";
|
||||
auto fully_expired = get_fully_expired_sstables(_cf, _sstables, gc_clock::now() - schema->gc_grace_seconds());
|
||||
auto fully_expired = get_fully_expired_sstables(_cf, _sstables, gc_clock::now() - _schema->gc_grace_seconds());
|
||||
|
||||
for (auto& sst : _sstables) {
|
||||
// Compacted sstable keeps track of its ancestors.
|
||||
@@ -396,8 +399,8 @@ private:
|
||||
}
|
||||
formatted_msg += "]";
|
||||
_info->sstables = _sstables.size();
|
||||
_info->ks = schema->ks_name();
|
||||
_info->cf = schema->cf_name();
|
||||
_info->ks = _schema->ks_name();
|
||||
_info->cf = _schema->cf_name();
|
||||
report_start(formatted_msg);
|
||||
|
||||
return make_sstable_reader(std::move(ssts));
|
||||
@@ -462,7 +465,7 @@ private:
|
||||
}
|
||||
|
||||
const schema_ptr& schema() const {
|
||||
return _cf.schema();
|
||||
return _schema;
|
||||
}
|
||||
public:
|
||||
static future<compaction_info> run(std::unique_ptr<compaction> c);
|
||||
@@ -518,10 +521,10 @@ public:
|
||||
}
|
||||
|
||||
flat_mutation_reader make_sstable_reader(lw_shared_ptr<sstables::sstable_set> ssts) const override {
|
||||
return ::make_local_shard_sstable_reader(_cf.schema(),
|
||||
return ::make_local_shard_sstable_reader(_schema,
|
||||
std::move(ssts),
|
||||
query::full_partition_range,
|
||||
_cf.schema()->full_slice(),
|
||||
_schema->full_slice(),
|
||||
service::get_local_compaction_priority(),
|
||||
no_resource_tracking(),
|
||||
nullptr,
|
||||
@@ -570,7 +573,7 @@ public:
|
||||
cfg.monitor = &_active_write_monitors.back();
|
||||
cfg.large_partition_handler = _cf.get_large_partition_handler();
|
||||
// TODO: calculate encoding_stats based on statistics of compacted sstables
|
||||
_writer.emplace(_sst->get_writer(*_cf.schema(), partitions_per_sstable(), cfg, encoding_stats{}, priority));
|
||||
_writer.emplace(_sst->get_writer(*_schema, partitions_per_sstable(), cfg, encoding_stats{}, priority));
|
||||
}
|
||||
return &*_writer;
|
||||
}
|
||||
@@ -610,7 +613,7 @@ public:
|
||||
}
|
||||
|
||||
std::function<bool(const dht::decorated_key&)> filter_func() const override {
|
||||
dht::token_range_vector owned_ranges = service::get_local_storage_service().get_local_ranges(_cf.schema()->ks_name());
|
||||
dht::token_range_vector owned_ranges = service::get_local_storage_service().get_local_ranges(_schema->ks_name());
|
||||
|
||||
return [this, owned_ranges = std::move(owned_ranges)] (const dht::decorated_key& dk) {
|
||||
if (dht::shard_of(dk.token()) != engine().cpu_id()) {
|
||||
@@ -684,10 +687,10 @@ public:
|
||||
|
||||
// Use reader that makes sure no non-local mutation will not be filtered out.
|
||||
flat_mutation_reader make_sstable_reader(lw_shared_ptr<sstables::sstable_set> ssts) const override {
|
||||
return ::make_range_sstable_reader(_cf.schema(),
|
||||
return ::make_range_sstable_reader(_schema,
|
||||
std::move(ssts),
|
||||
query::full_partition_range,
|
||||
_cf.schema()->full_slice(),
|
||||
_schema->full_slice(),
|
||||
service::get_local_compaction_priority(),
|
||||
no_resource_tracking(),
|
||||
nullptr,
|
||||
@@ -719,7 +722,7 @@ public:
|
||||
cfg.large_partition_handler = _cf.get_large_partition_handler();
|
||||
auto&& priority = service::get_local_compaction_priority();
|
||||
// TODO: calculate encoding_stats based on statistics of compacted sstables
|
||||
writer.emplace(sst->get_writer(*_cf.schema(), partitions_per_sstable(_shard), cfg, encoding_stats{}, priority, _shard));
|
||||
writer.emplace(sst->get_writer(*_schema, partitions_per_sstable(_shard), cfg, encoding_stats{}, priority, _shard));
|
||||
}
|
||||
return &*writer;
|
||||
}
|
||||
|
||||
@@ -66,6 +66,14 @@ public:
|
||||
_cm->deregister_compacting_sstables(_compacting);
|
||||
}
|
||||
}
|
||||
|
||||
// Explicitly release compacting sstables
|
||||
void release_compacting(const std::vector<sstables::shared_sstable>& sstables) {
|
||||
_cm->deregister_compacting_sstables(sstables);
|
||||
for (auto& sst : sstables) {
|
||||
_compacting.erase(boost::remove(_compacting, sst), _compacting.end());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
compaction_weight_registration::compaction_weight_registration(compaction_manager* cm, int weight)
|
||||
@@ -564,18 +572,24 @@ future<> compaction_manager::perform_cleanup(column_family* cf) {
|
||||
return make_ready_future<stop_iteration>(stop_iteration::yes);
|
||||
}
|
||||
column_family& cf = *task->compacting_cf;
|
||||
sstables::compaction_descriptor descriptor = sstables::compaction_descriptor(get_candidates(cf));
|
||||
auto compacting = compacting_sstable_registration(this, descriptor.sstables);
|
||||
auto sstables = get_candidates(cf);
|
||||
auto compacting = make_lw_shared<compacting_sstable_registration>(this, sstables);
|
||||
|
||||
_stats.pending_tasks--;
|
||||
_stats.active_tasks++;
|
||||
task->compaction_running = true;
|
||||
compaction_backlog_tracker user_initiated(std::make_unique<user_initiated_backlog_tracker>(_compaction_controller.backlog_of_shares(200), _available_memory));
|
||||
return do_with(std::move(user_initiated), [this, &cf, descriptor = std::move(descriptor)] (compaction_backlog_tracker& bt) mutable {
|
||||
return with_scheduling_group(_scheduling_group, [this, &cf, descriptor = std::move(descriptor)] () mutable {
|
||||
return cf.cleanup_sstables(std::move(descriptor));
|
||||
return do_with(std::move(user_initiated), std::move(sstables), [this, &cf, compacting] (compaction_backlog_tracker& bt,
|
||||
std::vector<sstables::shared_sstable>& sstables) mutable {
|
||||
return with_scheduling_group(_scheduling_group, [this, &cf, &sstables, compacting] () mutable {
|
||||
return do_for_each(sstables, [this, &cf, compacting] (auto& sst) {
|
||||
return cf.cleanup_sstables(sstables::compaction_descriptor({sst})).then([&sst, compacting] {
|
||||
// Releases reference to cleaned sstable such that respective used disk space can be freed.
|
||||
compacting->release_compacting({std::move(sst)});
|
||||
});
|
||||
});
|
||||
});
|
||||
}).then_wrapped([this, task, compacting = std::move(compacting)] (future<> f) mutable {
|
||||
}).then_wrapped([this, task, compacting] (future<> f) mutable {
|
||||
task->compaction_running = false;
|
||||
_stats.active_tasks--;
|
||||
if (!can_proceed(task)) {
|
||||
|
||||
@@ -170,7 +170,10 @@ public:
        _sstables.push_back(std::move(sst));
    }
    virtual void erase(shared_sstable sst) override {
        _sstables.erase(boost::range::find(_sstables, sst));
        auto it = boost::range::find(_sstables, sst);
        if (it != _sstables.end()){
            _sstables.erase(it);
        }
    }
    virtual std::unique_ptr<incremental_selector_impl> make_incremental_selector() const override;
    class incremental_selector;
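The guarded form above matters because std::vector::erase requires a valid, dereferenceable iterator: when the sstable is not present, boost::range::find returns the end iterator and erasing it is undefined behaviour. A minimal standalone illustration of the same guard (plain std::find, illustrative only, not part of the change):

// Illustrative sketch, not part of the diff: erase an element only if it is present.
#include <algorithm>
#include <vector>

void remove_one(std::vector<int>& v, int x) {
    auto it = std::find(v.begin(), v.end(), x);
    if (it != v.end()) {        // erasing v.end() would be undefined behaviour
        v.erase(it);
    }
}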
@@ -420,11 +423,6 @@ public:
|
||||
auto itw = writes_per_window.find(bound);
|
||||
if (itw != writes_per_window.end()) {
|
||||
ow_this_window = &itw->second;
|
||||
// We will erase here so we can keep track of which
|
||||
// writes belong to existing windows. Writes that don't belong to any window
|
||||
// are writes in progress to new windows and will be accounted in the final
|
||||
// loop before we return
|
||||
writes_per_window.erase(itw);
|
||||
}
|
||||
auto* oc_this_window = &no_oc;
|
||||
auto itc = compactions_per_window.find(bound);
|
||||
@@ -432,6 +430,13 @@ public:
|
||||
oc_this_window = &itc->second;
|
||||
}
|
||||
b += windows.second.backlog(*ow_this_window, *oc_this_window);
|
||||
if (itw != writes_per_window.end()) {
|
||||
// We will erase here so we can keep track of which
|
||||
// writes belong to existing windows. Writes that don't belong to any window
|
||||
// are writes in progress to new windows and will be accounted in the final
|
||||
// loop before we return
|
||||
writes_per_window.erase(itw);
|
||||
}
|
||||
}
|
||||
|
||||
// Partial writes that don't belong to any window are accounted here.
|
||||
|
||||
@@ -390,9 +390,17 @@ private:
|
||||
}
|
||||
|
||||
return do_with(std::make_unique<reader>(_sstable, _pc, position, end, quantity), [this, summary_idx] (auto& entries_reader) {
|
||||
return entries_reader->_context.consume_input().then([this, summary_idx, &entries_reader] {
|
||||
return entries_reader->_context.consume_input().then_wrapped([this, summary_idx, &entries_reader] (future<> f) {
|
||||
std::exception_ptr ex;
|
||||
if (f.failed()) {
|
||||
ex = f.get_exception();
|
||||
sstlog.error("failed reading index for {}: {}", _sstable->get_filename(), ex);
|
||||
}
|
||||
auto indexes = std::move(entries_reader->_consumer.indexes);
|
||||
return entries_reader->_context.close().then([indexes = std::move(indexes)] () mutable {
|
||||
return entries_reader->_context.close().then([indexes = std::move(indexes), ex = std::move(ex)] () mutable {
|
||||
if (ex) {
|
||||
std::rethrow_exception(std::move(ex));
|
||||
}
|
||||
return std::move(indexes);
|
||||
});
|
||||
|
||||
|
||||
@@ -72,8 +72,11 @@ inline gc_clock::duration parse_ttl(int32_t value) {

inline gc_clock::duration parse_ttl(const serialization_header& header,
        uint64_t delta) {
    int32_t _delta = static_cast<int32_t>(delta);
    return parse_ttl(header.get_min_ttl() + _delta);
    // sign-extend min_ttl back to 64 bits and
    // add the delta using unsigned arithmetic
    // to prevent signed integer overflow
    uint64_t min_ttl = static_cast<uint64_t>(static_cast<int64_t>(header.get_min_ttl()));
    return parse_ttl(static_cast<int32_t>(min_ttl + delta));
}

inline gc_clock::time_point parse_expiry(int32_t value) {
@@ -85,8 +88,11 @@ inline gc_clock::time_point parse_expiry(int32_t value) {

inline gc_clock::time_point parse_expiry(const serialization_header& header,
        uint64_t delta) {
    int32_t _delta = static_cast<int32_t>(delta);
    return parse_expiry(header.get_min_local_deletion_time() + _delta);
    // sign-extend min_local_deletion_time back to 64 bits and
    // add the delta using unsigned arithmetic
    // to prevent signed integer overflow
    uint64_t min_local_deletion_time = static_cast<uint64_t>(static_cast<int64_t>(header.get_min_local_deletion_time()));
    return parse_expiry(static_cast<int32_t>(min_local_deletion_time + delta));
}

}; // namespace sstables
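The rewritten parse_ttl/parse_expiry form the minimum-plus-delta sum in unsigned 64-bit arithmetic and only truncate to int32_t at the end; casting the 64-bit delta to int32_t first and adding it to a signed minimum, as the old lines did, can overflow a signed integer, which is undefined behaviour. A self-contained sketch of the decoding step with made-up numbers (illustrative only, not Scylla code):

// Illustrative sketch, not part of the diff.
// The sum is formed in uint64_t (well-defined modular arithmetic) and only the
// final result is truncated back to int32_t, so no signed overflow can occur.
#include <cstdint>
#include <iostream>

int32_t decode_delta_encoded(int32_t min_value, uint64_t delta) {
    uint64_t base = static_cast<uint64_t>(static_cast<int64_t>(min_value)); // sign-extend to 64 bits
    return static_cast<int32_t>(base + delta);
}

int main() {
    int32_t min_ttl = -2000000000;      // hypothetical very small minimum
    uint64_t delta  = 3000000000;       // hypothetical delta, does not fit in int32_t
    // Adding these as 32-bit signed values would overflow; the unsigned round trip
    // yields the intended result.
    std::cout << decode_delta_encoded(min_ttl, delta) << "\n";  // prints 1000000000
    return 0;
}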
@@ -29,6 +29,7 @@
#include "sstables/mc/types.hh"
#include "db/config.hh"
#include "atomic_cell.hh"
#include "utils/exceptions.hh"

#include <functional>
#include <boost/iterator/iterator_facade.hpp>
@@ -308,9 +309,11 @@ void write_missing_columns(W& out, const indexed_columns& columns, const row& ro
template <typename T, typename W>
GCC6_CONCEPT(requires Writer<W>())
void write_unsigned_delta_vint(W& out, T value, T base) {
    using unsigned_type = std::make_unsigned_t<T>;
    unsigned_type unsigned_delta = static_cast<unsigned_type>(value) - static_cast<unsigned_type>(base);
    // sign-extend to 64-bits
    using signed_type = std::make_signed_t<T>;
    int64_t delta = static_cast<signed_type>(value) - static_cast<signed_type>(base);
    int64_t delta = static_cast<int64_t>(static_cast<signed_type>(unsigned_delta));
    // write as unsigned 64-bit varint
    write_vint(out, static_cast<uint64_t>(delta));
}
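In the new write_unsigned_delta_vint the subtraction happens in the unsigned counterpart of T, so an out-of-range difference wraps instead of triggering signed overflow, and the wrapped value is then reinterpreted as signed and widened to 64 bits for the varint encoder. A standalone round-trip sketch of the same idea (the decode helper is hypothetical, added only to show why the wrap-around is harmless):

// Illustrative sketch, not part of the diff.
#include <cstdint>
#include <iostream>
#include <limits>
#include <type_traits>

// Delta computed in the unsigned counterpart of T, then reinterpreted as signed
// and widened to 64 bits -- mirrors the approach in the hunk above.
template <typename T>
int64_t delta_for_encoding(T value, T base) {
    using unsigned_type = std::make_unsigned_t<T>;
    using signed_type   = std::make_signed_t<T>;
    unsigned_type d = static_cast<unsigned_type>(value) - static_cast<unsigned_type>(base);
    return static_cast<int64_t>(static_cast<signed_type>(d));
}

// Hypothetical decode helper: add the delta back in unsigned arithmetic.
template <typename T>
T value_from_delta(T base, int64_t delta) {
    using unsigned_type = std::make_unsigned_t<T>;
    return static_cast<T>(static_cast<unsigned_type>(base) + static_cast<unsigned_type>(delta));
}

int main() {
    int64_t base  = std::numeric_limits<int64_t>::min();
    int64_t value = std::numeric_limits<int64_t>::max();
    // 'value - base' computed directly in int64_t would be signed overflow (UB);
    // the unsigned round trip still recovers the original value exactly.
    int64_t d = delta_for_encoding(value, base);
    std::cout << (value_from_delta(base, d) == value) << "\n";  // prints 1
    return 0;
}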
@@ -370,12 +373,21 @@ static sstring pk_type_to_string(const schema& s) {
|
||||
}
|
||||
}
|
||||
|
||||
serialization_header make_serialization_header(const schema& s, const encoding_stats& enc_stats) {
|
||||
struct sstable_schema {
|
||||
serialization_header header;
|
||||
indexed_columns regular_columns;
|
||||
indexed_columns static_columns;
|
||||
};
|
||||
|
||||
sstable_schema make_sstable_schema(const schema& s, const encoding_stats& enc_stats, const sstable_writer_config& cfg) {
|
||||
sstable_schema sst_sch;
|
||||
serialization_header& header = sst_sch.header;
|
||||
// mc serialization header minimum values are delta-encoded based on the default timestamp epoch times
|
||||
header.min_timestamp_base.value = static_cast<uint64_t>(enc_stats.min_timestamp - encoding_stats::timestamp_epoch);
|
||||
header.min_local_deletion_time_base.value = static_cast<uint64_t>(enc_stats.min_local_deletion_time - encoding_stats::deletion_time_epoch);
|
||||
header.min_ttl_base.value = static_cast<uint64_t>(enc_stats.min_ttl - encoding_stats::ttl_epoch);
|
||||
// Note: We rely on implicit conversion to uint64_t when subtracting the signed epoch values below
|
||||
// for preventing signed integer overflow.
|
||||
header.min_timestamp_base.value = static_cast<uint64_t>(enc_stats.min_timestamp) - encoding_stats::timestamp_epoch;
|
||||
header.min_local_deletion_time_base.value = static_cast<uint64_t>(enc_stats.min_local_deletion_time) - encoding_stats::deletion_time_epoch;
|
||||
header.min_ttl_base.value = static_cast<uint64_t>(enc_stats.min_ttl) - encoding_stats::ttl_epoch;
|
||||
|
||||
header.pk_type_name = to_bytes_array_vint_size(pk_type_to_string(s));
|
||||
|
||||
@@ -385,23 +397,36 @@ serialization_header make_serialization_header(const schema& s, const encoding_s
|
||||
header.clustering_key_types_names.elements.push_back(std::move(ck_type_name));
|
||||
}
|
||||
|
||||
header.static_columns.elements.reserve(s.static_columns_count());
|
||||
for (const auto& static_column : s.static_columns()) {
|
||||
auto add_column = [&] (const column_definition& column) {
|
||||
serialization_header::column_desc cd;
|
||||
cd.name = to_bytes_array_vint_size(static_column.name());
|
||||
cd.type_name = to_bytes_array_vint_size(type_name_with_udt_frozen(static_column.type));
|
||||
header.static_columns.elements.push_back(std::move(cd));
|
||||
cd.name = to_bytes_array_vint_size(column.name());
|
||||
cd.type_name = to_bytes_array_vint_size(type_name_with_udt_frozen(column.type));
|
||||
if (column.is_static()) {
|
||||
header.static_columns.elements.push_back(std::move(cd));
|
||||
sst_sch.static_columns.push_back(column);
|
||||
} else if (column.is_regular()) {
|
||||
header.regular_columns.elements.push_back(std::move(cd));
|
||||
sst_sch.regular_columns.push_back(column);
|
||||
}
|
||||
};
|
||||
|
||||
if (cfg.correctly_serialize_static_compact_in_mc) {
|
||||
for (const auto& column : s.v3().all_columns()) {
|
||||
add_column(column);
|
||||
}
|
||||
} else {
|
||||
for (const auto& column : s.all_columns()) {
|
||||
add_column(column);
|
||||
}
|
||||
}
|
||||
|
||||
header.regular_columns.elements.reserve(s.regular_columns_count());
|
||||
for (const auto& regular_column : s.regular_columns()) {
|
||||
serialization_header::column_desc cd;
|
||||
cd.name = to_bytes_array_vint_size(regular_column.name());
|
||||
cd.type_name = to_bytes_array_vint_size(type_name_with_udt_frozen(regular_column.type));
|
||||
header.regular_columns.elements.push_back(std::move(cd));
|
||||
}
|
||||
// For static and regular columns, we write all simple columns first followed by collections
|
||||
// These containers have columns partitioned by atomicity
|
||||
auto pred = [] (const std::reference_wrapper<const column_definition>& column) { return column.get().is_atomic(); };
|
||||
boost::range::stable_partition(sst_sch.regular_columns, pred);
|
||||
boost::range::stable_partition(sst_sch.static_columns, pred);
|
||||
|
||||
return header;
|
||||
return sst_sch;
|
||||
}
|
||||
|
||||
enum class cell_flags : uint8_t {
|
||||
@@ -507,18 +532,6 @@ GCC6_CONCEPT(
|
||||
};
|
||||
)
|
||||
|
||||
static indexed_columns get_indexed_columns_partitioned_by_atomicity(schema::const_iterator_range_type columns) {
|
||||
indexed_columns result;
|
||||
result.reserve(columns.size());
|
||||
for (const auto& col: columns) {
|
||||
result.emplace_back(col);
|
||||
}
|
||||
boost::range::stable_partition(
|
||||
result,
|
||||
[](const std::reference_wrapper<const column_definition>& column) { return column.get().is_atomic();});
|
||||
return result;
|
||||
}
|
||||
|
||||
// Used for writing SSTables in 'mc' format.
|
||||
class writer : public sstable_writer::writer_impl {
|
||||
private:
|
||||
@@ -526,7 +539,7 @@ private:
|
||||
shard_id _shard; // Specifies which shard the new SStable will belong to.
|
||||
bool _compression_enabled = false;
|
||||
std::unique_ptr<file_writer> _data_writer;
|
||||
std::optional<file_writer> _index_writer;
|
||||
std::unique_ptr<file_writer> _index_writer;
|
||||
bool _tombstone_written = false;
|
||||
bool _static_row_written = false;
|
||||
// The length of partition header (partition key, partition deletion and static row, if present)
|
||||
@@ -540,10 +553,7 @@ private:
|
||||
range_tombstone_stream _range_tombstones;
|
||||
bytes_ostream _tmp_bufs;
|
||||
|
||||
// For static and regular columns, we write all simple columns first followed by collections
|
||||
// These containers have columns partitioned by atomicity
|
||||
const indexed_columns _static_columns;
|
||||
const indexed_columns _regular_columns;
|
||||
const sstable_schema _sst_schema;
|
||||
|
||||
struct cdef_and_collection {
|
||||
const column_definition* cdef;
|
||||
@@ -571,7 +581,11 @@ private:
|
||||
struct {
|
||||
// Unfortunately we cannot output the promoted index directly to the
|
||||
// index file because it needs to be prepended by its size.
|
||||
seastar::circular_buffer<pi_block> promoted_index;
|
||||
// first_entry is used for deferring serialization into blocks for small partitions.
|
||||
std::optional<pi_block> first_entry;
|
||||
bytes_ostream blocks; // Serialized pi_blocks.
|
||||
bytes_ostream offsets; // Serialized block offsets (uint32_t) relative to the start of "blocks".
|
||||
uint64_t promoted_index_size = 0; // Number of pi_blocks inside blocks and first_entry;
|
||||
tombstone tomb;
|
||||
uint64_t block_start_offset;
|
||||
uint64_t block_next_start_offset;
|
||||
@@ -580,8 +594,13 @@ private:
|
||||
size_t desired_block_size;
|
||||
} _pi_write_m;
|
||||
column_stats _c_stats;
|
||||
bool _write_regular_as_static; // See #4139
|
||||
|
||||
void init_file_writers();
|
||||
|
||||
// Returns the closed writer
|
||||
std::unique_ptr<file_writer> close_writer(std::unique_ptr<file_writer>& w);
|
||||
|
||||
void close_data_writer();
|
||||
void ensure_tombstone_is_written() {
|
||||
if (!_tombstone_written) {
|
||||
@@ -590,7 +609,7 @@ private:
|
||||
}
|
||||
|
||||
void ensure_static_row_is_written_if_needed() {
|
||||
if (!_static_columns.empty() && !_static_row_written) {
|
||||
if (!_sst_schema.static_columns.empty() && !_static_row_written) {
|
||||
consume(static_row{});
|
||||
}
|
||||
}
|
||||
@@ -606,6 +625,7 @@ private:
|
||||
void maybe_set_pi_first_clustering(const clustering_info& info);
|
||||
void maybe_add_pi_block();
|
||||
void add_pi_block();
|
||||
void write_pi_block(const pi_block&);
|
||||
|
||||
void update_deletion_time_stats(deletion_time dt) {
|
||||
_c_stats.update_timestamp(dt.marked_for_delete_at);
|
||||
@@ -643,7 +663,7 @@ private:
|
||||
|
||||
// Writes single atomic cell
|
||||
void write_cell(bytes_ostream& writer, atomic_cell_view cell, const column_definition& cdef,
|
||||
const row_time_properties& properties, bytes_view cell_path = {});
|
||||
const row_time_properties& properties, std::optional<bytes_view> cell_path = {});
|
||||
|
||||
// Writes information about row liveness (formerly 'row marker')
|
||||
void write_liveness_info(bytes_ostream& writer, const row_marker& marker);
|
||||
@@ -654,7 +674,7 @@ private:
|
||||
|
||||
void write_cells(bytes_ostream& writer, column_kind kind, const row& row_body, const row_time_properties& properties, bool has_complex_deletion);
|
||||
void write_row_body(bytes_ostream& writer, const clustering_row& row, bool has_complex_deletion);
|
||||
void write_static_row(const row& static_row);
|
||||
void write_static_row(const row&, column_kind);
|
||||
|
||||
// Clustered is a term used to denote an entity that has a clustering key prefix
|
||||
// and constitutes an entry of a partition.
|
||||
@@ -675,15 +695,19 @@ private:
|
||||
_prev_row_start = pos;
|
||||
maybe_add_pi_block();
|
||||
}
|
||||
void write_promoted_index(file_writer& writer);
|
||||
void write_promoted_index();
|
||||
void consume(rt_marker&& marker);
|
||||
|
||||
void flush_tmp_bufs() {
|
||||
void flush_tmp_bufs(file_writer& writer) {
|
||||
for (auto&& buf : _tmp_bufs) {
|
||||
_data_writer->write(buf);
|
||||
writer.write(buf);
|
||||
}
|
||||
_tmp_bufs.clear();
|
||||
}
|
||||
|
||||
void flush_tmp_bufs() {
|
||||
flush_tmp_bufs(*_data_writer);
|
||||
}
|
||||
public:
|
||||
|
||||
writer(sstable& sst, const schema& s, uint64_t estimated_partitions,
|
||||
@@ -694,8 +718,8 @@ public:
|
||||
, _shard(shard)
|
||||
, _range_tombstones(_schema)
|
||||
, _tmp_bufs(_sst.sstable_buffer_size)
|
||||
, _static_columns(get_indexed_columns_partitioned_by_atomicity(s.static_columns()))
|
||||
, _regular_columns(get_indexed_columns_partitioned_by_atomicity(s.regular_columns()))
|
||||
, _sst_schema(make_sstable_schema(s, _enc_stats, _cfg))
|
||||
, _write_regular_as_static(cfg.correctly_serialize_static_compact_in_mc && s.is_static_compact_table())
|
||||
{
|
||||
_sst.generate_toc(_schema.get_compressor_params().get_compressor(), _schema.bloom_filter_fp_chance());
|
||||
_sst.write_toc(_pc);
|
||||
@@ -760,12 +784,25 @@ static deletion_time to_deletion_time(tombstone t) {
}

void writer::add_pi_block() {
    _pi_write_m.promoted_index.push_back({
    auto block = pi_block{
        *_pi_write_m.first_clustering,
        *_pi_write_m.last_clustering,
        _pi_write_m.block_start_offset - _c_stats.start_offset,
        _data_writer->offset() - _pi_write_m.block_start_offset,
        (_end_open_marker ? std::make_optional(_end_open_marker->tomb) : std::optional<tombstone>{})});
        (_end_open_marker ? std::make_optional(_end_open_marker->tomb) : std::optional<tombstone>{})};

    if (_pi_write_m.blocks.empty()) {
        if (!_pi_write_m.first_entry) {
            _pi_write_m.first_entry.emplace(std::move(block));
            ++_pi_write_m.promoted_index_size;
            return;
        } else {
            write_pi_block(*_pi_write_m.first_entry);
        }
    }

    write_pi_block(block);
    ++_pi_write_m.promoted_index_size;
}
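add_pi_block now keeps the very first block in first_entry instead of serializing it right away, and only flushes it once a second block arrives; together with the rewritten write_promoted_index further down, which emits a zero-length promoted index whenever promoted_index_size < 2, single-block partitions skip the serialization work entirely. A stripped-down sketch of this defer-the-first-element pattern (names and types are illustrative, not the writer's real interface):

// Illustrative sketch, not part of the diff.
#include <cstdint>
#include <optional>
#include <string>
#include <vector>

struct deferred_block_sink {
    std::optional<std::string> first_entry;   // first block, kept unserialized
    std::vector<std::string>   serialized;    // blocks actually written out
    uint64_t                   count = 0;     // total number of blocks seen

    void add(std::string block) {
        if (serialized.empty() && !first_entry) {
            first_entry = std::move(block);   // defer: maybe no second block ever arrives
            ++count;
            return;
        }
        if (first_entry) {
            serialized.push_back(std::move(*first_entry));  // flush the deferred block
            first_entry.reset();
        }
        serialized.push_back(std::move(block));
        ++count;
    }

    // Mirrors the "promoted_index_size < 2 => write nothing" rule.
    bool worth_writing() const { return count >= 2; }
};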
|
||||
void writer::maybe_add_pi_block() {
|
||||
@@ -793,13 +830,17 @@ void writer::init_file_writers() {
|
||||
&_sst._components->compression,
|
||||
_schema.get_compressor_params()));
|
||||
}
|
||||
_index_writer.emplace(std::move(_sst._index_file), options);
|
||||
_index_writer = std::make_unique<file_writer>(std::move(_sst._index_file), options);
|
||||
}
|
||||
|
||||
std::unique_ptr<file_writer> writer::close_writer(std::unique_ptr<file_writer>& w) {
|
||||
auto writer = std::move(w);
|
||||
writer->close();
|
||||
return writer;
|
||||
}
|
||||
|
||||
void writer::close_data_writer() {
|
||||
auto writer = std::move(_data_writer);
|
||||
writer->close();
|
||||
|
||||
auto writer = close_writer(_data_writer);
|
||||
if (!_compression_enabled) {
|
||||
auto chksum_wr = static_cast<crc32_checksummed_file_writer*>(writer.get());
|
||||
_sst.write_digest(chksum_wr->full_checksum());
|
||||
@@ -900,7 +941,10 @@ void writer::consume_new_partition(const dht::decorated_key& dk) {
|
||||
write(_sst.get_version(), *_index_writer, p_key);
|
||||
write_vint(*_index_writer, _data_writer->offset());
|
||||
|
||||
_pi_write_m.promoted_index = {};
|
||||
_pi_write_m.first_entry.reset();
|
||||
_pi_write_m.blocks.clear();
|
||||
_pi_write_m.offsets.clear();
|
||||
_pi_write_m.promoted_index_size = 0;
|
||||
_pi_write_m.tomb = {};
|
||||
_pi_write_m.first_clustering.reset();
|
||||
_pi_write_m.last_clustering.reset();
|
||||
@@ -926,7 +970,7 @@ void writer::consume(tombstone t) {
|
||||
}
|
||||
|
||||
void writer::write_cell(bytes_ostream& writer, atomic_cell_view cell, const column_definition& cdef,
|
||||
const row_time_properties& properties, bytes_view cell_path) {
|
||||
const row_time_properties& properties, std::optional<bytes_view> cell_path) {
|
||||
|
||||
bool is_deleted = !cell.is_live();
|
||||
bool has_value = !is_deleted && !cell.value().empty();
|
||||
@@ -938,7 +982,7 @@ void writer::write_cell(bytes_ostream& writer, atomic_cell_view cell, const colu
|
||||
properties.local_deletion_time == cell.deletion_time().time_since_epoch().count();
|
||||
|
||||
cell_flags flags = cell_flags::none;
|
||||
if (!has_value) {
|
||||
if ((!has_value && !cdef.is_counter()) || is_deleted) {
|
||||
flags |= cell_flags::has_empty_value_mask;
|
||||
}
|
||||
if (is_deleted) {
|
||||
@@ -967,20 +1011,22 @@ void writer::write_cell(bytes_ostream& writer, atomic_cell_view cell, const colu
|
||||
}
|
||||
}
|
||||
|
||||
if (!cell_path.empty()) {
|
||||
write_vint(writer, cell_path.size());
|
||||
write(_sst.get_version(), writer, cell_path);
|
||||
if (bool(cell_path)) {
|
||||
write_vint(writer, cell_path->size());
|
||||
write(_sst.get_version(), writer, *cell_path);
|
||||
}
|
||||
|
||||
if (has_value) {
|
||||
if (cdef.is_counter()) {
|
||||
if (cdef.is_counter()) {
|
||||
if (!is_deleted) {
|
||||
assert(!cell.is_counter_update());
|
||||
counter_cell_view::with_linearized(cell, [&] (counter_cell_view ccv) {
|
||||
write_counter_value(ccv, writer, sstable_version_types::mc, [] (bytes_ostream& out, uint32_t value) {
|
||||
return write_vint(out, value);
|
||||
});
|
||||
});
|
||||
} else {
|
||||
}
|
||||
} else {
|
||||
if (has_value) {
|
||||
write_cell_value(writer, *cdef.type, cell.value());
|
||||
}
|
||||
}
|
||||
@@ -1061,7 +1107,7 @@ void writer::write_cells(bytes_ostream& writer, column_kind kind, const row& row
|
||||
// This differs from Origin where all updated columns are tracked and the set of filled columns of a row
|
||||
// is compared with the set of all columns filled in the memtable. So our encoding may be less optimal in some cases
|
||||
// but still valid.
|
||||
write_missing_columns(writer, kind == column_kind::static_column ? _static_columns : _regular_columns, row_body);
|
||||
write_missing_columns(writer, kind == column_kind::static_column ? _sst_schema.static_columns : _sst_schema.regular_columns, row_body);
|
||||
row_body.for_each_cell([this, &writer, kind, &properties, has_complex_deletion] (column_id id, const atomic_cell_or_collection& c) {
|
||||
auto&& column_definition = _schema.column_at(kind, id);
|
||||
if (!column_definition.is_atomic()) {
|
||||
@@ -1105,18 +1151,6 @@ void writer::write_row_body(bytes_ostream& writer, const clustering_row& row, bo
|
||||
return write_cells(writer, column_kind::regular_column, row.cells(), properties, has_complex_deletion);
|
||||
}
|
||||
|
||||
template<typename Func>
|
||||
uint64_t calculate_write_size(Func&& func) {
|
||||
uint64_t written_size = 0;
|
||||
{
|
||||
auto counting_writer = file_writer(make_sizing_output_stream(written_size));
|
||||
func(counting_writer);
|
||||
counting_writer.flush();
|
||||
counting_writer.close();
|
||||
}
|
||||
return written_size;
|
||||
}
|
||||
|
||||
// Find if any collection in the row contains a collection-wide tombstone
|
||||
static bool row_has_complex_deletion(const schema& s, const row& r, column_kind kind) {
|
||||
bool result = false;
|
||||
@@ -1138,16 +1172,14 @@ static bool row_has_complex_deletion(const schema& s, const row& r, column_kind
|
||||
return result;
|
||||
}
|
||||
|
||||
void writer::write_static_row(const row& static_row) {
|
||||
assert(_schema.is_compound());
|
||||
|
||||
void writer::write_static_row(const row& static_row, column_kind kind) {
|
||||
uint64_t current_pos = _data_writer->offset();
|
||||
// Static row flag is stored in extended flags so extension_flag is always set for static rows
|
||||
row_flags flags = row_flags::extension_flag;
|
||||
if (static_row.size() == _schema.static_columns_count()) {
|
||||
if (static_row.size() == _sst_schema.static_columns.size()) {
|
||||
flags |= row_flags::has_all_columns;
|
||||
}
|
||||
bool has_complex_deletion = row_has_complex_deletion(_schema, static_row, column_kind::static_column);
|
||||
bool has_complex_deletion = row_has_complex_deletion(_schema, static_row, kind);
|
||||
if (has_complex_deletion) {
|
||||
flags |= row_flags::has_complex_deletion;
|
||||
}
|
||||
@@ -1161,14 +1193,13 @@ void writer::write_static_row(const row& static_row) {
|
||||
|
||||
_partition_header_length += (_data_writer->offset() - current_pos);
|
||||
|
||||
// Collect statistics
|
||||
++_c_stats.rows_count;
|
||||
_static_row_written = true;
|
||||
}
|
||||
|
||||
stop_iteration writer::consume(static_row&& sr) {
|
||||
ensure_tombstone_is_written();
|
||||
write_static_row(sr.cells());
|
||||
_static_row_written = true;
|
||||
write_static_row(sr.cells(), column_kind::static_column);
|
||||
return stop_iteration::no;
|
||||
}
|
||||
|
||||
@@ -1191,7 +1222,7 @@ void writer::write_clustered(const clustering_row& clustered_row, uint64_t prev_
|
||||
ext_flags = row_extended_flags::has_shadowable_deletion_scylla;
|
||||
}
|
||||
|
||||
if (clustered_row.cells().size() == _schema.regular_columns_count()) {
|
||||
if (clustered_row.cells().size() == _sst_schema.regular_columns.size()) {
|
||||
flags |= row_flags::has_all_columns;
|
||||
}
|
||||
bool has_complex_deletion = row_has_complex_deletion(_schema, clustered_row.cells(), column_kind::regular_column);
|
||||
@@ -1221,6 +1252,11 @@ void writer::write_clustered(const clustering_row& clustered_row, uint64_t prev_
|
||||
}
|
||||
|
||||
stop_iteration writer::consume(clustering_row&& cr) {
|
||||
if (_write_regular_as_static) {
|
||||
ensure_tombstone_is_written();
|
||||
write_static_row(cr.cells(), column_kind::regular_column);
|
||||
return stop_iteration::no;
|
||||
}
|
||||
drain_tombstones(position_in_partition_view::after_key(cr.key()));
|
||||
write_clustered(cr);
|
||||
return stop_iteration::no;
|
||||
@@ -1242,28 +1278,33 @@ static void write_clustering_prefix(W& writer, bound_kind_m kind,
|
||||
write_clustering_prefix(writer, s, clustering, is_ephemerally_full);
|
||||
}
|
||||
|
||||
void writer::write_promoted_index(file_writer& writer) {
|
||||
static constexpr size_t width_base = 65536;
|
||||
write_vint(writer, _partition_header_length);
|
||||
write(_sst.get_version(), writer, to_deletion_time(_pi_write_m.tomb));
|
||||
write_vint(writer, _pi_write_m.promoted_index.size());
|
||||
std::vector<uint32_t> offsets;
|
||||
offsets.reserve(_pi_write_m.promoted_index.size());
|
||||
uint64_t start = writer.offset();
|
||||
for (const pi_block& block: _pi_write_m.promoted_index) {
|
||||
offsets.push_back(writer.offset() - start);
|
||||
write_clustering_prefix(writer, block.first.kind, _schema, block.first.clustering);
|
||||
write_clustering_prefix(writer, block.last.kind, _schema, block.last.clustering);
|
||||
write_vint(writer, block.offset);
|
||||
write_signed_vint(writer, block.width - width_base);
|
||||
write(_sst.get_version(), writer, static_cast<std::byte>(block.open_marker ? 1 : 0));
|
||||
if (block.open_marker) {
|
||||
write(sstable_version_types::mc, writer, to_deletion_time(*block.open_marker));
|
||||
}
|
||||
void writer::write_promoted_index() {
|
||||
if (_pi_write_m.promoted_index_size < 2) {
|
||||
write_vint(*_index_writer, uint64_t(0));
|
||||
return;
|
||||
}
|
||||
write_vint(_tmp_bufs, _partition_header_length);
|
||||
write(_sst.get_version(), _tmp_bufs, to_deletion_time(_pi_write_m.tomb));
|
||||
write_vint(_tmp_bufs, _pi_write_m.promoted_index_size);
|
||||
uint64_t pi_size = _tmp_bufs.size() + _pi_write_m.blocks.size() + _pi_write_m.offsets.size();
|
||||
write_vint(*_index_writer, pi_size);
|
||||
flush_tmp_bufs(*_index_writer);
|
||||
write(_sst.get_version(), *_index_writer, _pi_write_m.blocks);
|
||||
write(_sst.get_version(), *_index_writer, _pi_write_m.offsets);
|
||||
}
|
||||
|
||||
for (uint32_t offset: offsets) {
|
||||
write(_sst.get_version(), writer, offset);
|
||||
void writer::write_pi_block(const pi_block& block) {
|
||||
static constexpr size_t width_base = 65536;
|
||||
bytes_ostream& blocks = _pi_write_m.blocks;
|
||||
uint32_t offset = blocks.size();
|
||||
write(_sst.get_version(), _pi_write_m.offsets, offset);
|
||||
write_clustering_prefix(blocks, block.first.kind, _schema, block.first.clustering);
|
||||
write_clustering_prefix(blocks, block.last.kind, _schema, block.last.clustering);
|
||||
write_vint(blocks, block.offset);
|
||||
write_signed_vint(blocks, block.width - width_base);
|
||||
write(_sst.get_version(), blocks, static_cast<std::byte>(block.open_marker ? 1 : 0));
|
||||
if (block.open_marker) {
|
||||
write(sstable_version_types::mc, blocks, to_deletion_time(*block.open_marker));
|
||||
}
|
||||
}
|
||||
|
||||
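The new write_promoted_index/write_pi_block pair serializes each block into the in-memory blocks buffer and records its uint32_t offset as it goes, so the total promoted-index size is simply tmp_bufs + blocks + offsets, and the old two-pass approach (measure with a sizing stream, then serialize again) is no longer needed. A minimal sketch of that single-pass accumulation (illustrative only):

// Illustrative sketch, not part of the diff.
#include <cstdint>
#include <string>
#include <vector>

struct block_accumulator {
    std::string blocks;                  // serialized blocks, appended as they are produced
    std::vector<uint32_t> offsets;       // offset of each block within 'blocks'

    void add(const std::string& serialized_block) {
        offsets.push_back(static_cast<uint32_t>(blocks.size()));
        blocks += serialized_block;
    }

    // The payload size is known without serializing everything a second time.
    size_t payload_size() const {
        return blocks.size() + offsets.size() * sizeof(uint32_t);
    }
};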
@@ -1307,21 +1348,11 @@ stop_iteration writer::consume_end_of_partition() {
|
||||
|
||||
write(_sst.get_version(), *_data_writer, row_flags::end_of_partition);
|
||||
|
||||
if (!_pi_write_m.promoted_index.empty() && _pi_write_m.first_clustering) {
|
||||
if (_pi_write_m.promoted_index_size && _pi_write_m.first_clustering) {
|
||||
add_pi_block();
|
||||
}
|
||||
|
||||
auto write_pi = [this] (file_writer& writer) {
|
||||
return write_promoted_index(writer);
|
||||
};
|
||||
|
||||
if (_pi_write_m.promoted_index.size() < 2) {
|
||||
write_vint(*_index_writer, uint64_t(0));
|
||||
} else {
|
||||
uint64_t pi_size = calculate_write_size(write_pi);
|
||||
write_vint(*_index_writer, pi_size);
|
||||
write_pi(*_index_writer);
|
||||
}
|
||||
write_promoted_index();
|
||||
|
||||
// compute size of the current row.
|
||||
_c_stats.partition_size = _data_writer->offset() - _c_stats.start_offset;
|
||||
@@ -1336,10 +1367,15 @@ stop_iteration writer::consume_end_of_partition() {
|
||||
_first_key = *_partition_key;
|
||||
}
|
||||
_last_key = std::move(*_partition_key);
|
||||
_partition_key = std::nullopt;
|
||||
return get_data_offset() < _cfg.max_sstable_size ? stop_iteration::no : stop_iteration::yes;
|
||||
}
|
||||
|
||||
void writer::consume_end_of_stream() {
|
||||
if (_partition_key) {
|
||||
on_internal_error(sstlog, "Mutation stream ends with unclosed partition during write");
|
||||
}
|
||||
|
||||
_cfg.monitor->on_data_write_completed();
|
||||
|
||||
seal_summary(_sst._components->summary, std::move(_first_key), std::move(_last_key), _index_sampling_state);
|
||||
@@ -1348,9 +1384,10 @@ void writer::consume_end_of_stream() {
|
||||
_sst.get_metadata_collector().add_compression_ratio(_sst._components->compression.compressed_file_length(), _sst._components->compression.uncompressed_file_length());
|
||||
}
|
||||
|
||||
_index_writer->close();
|
||||
_index_writer.reset();
|
||||
close_writer(_index_writer);
|
||||
_sst.set_first_and_last_keys();
|
||||
|
||||
_sst._components->statistics.contents[metadata_type::Serialization] = std::make_unique<serialization_header>(std::move(_sst_schema.header));
|
||||
seal_statistics(_sst.get_version(), _sst._components->statistics, _sst.get_metadata_collector(),
|
||||
dht::global_partitioner().name(), _schema.bloom_filter_fp_chance(),
|
||||
_sst._schema, _sst.get_first_decorated_key(), _sst.get_last_decorated_key(), _enc_stats);
|
||||
@@ -1363,6 +1400,9 @@ void writer::consume_end_of_stream() {
|
||||
if (!_cfg.correctly_serialize_non_compound_range_tombstones) {
|
||||
features.disable(sstable_feature::NonCompoundRangeTombstones);
|
||||
}
|
||||
if (!_cfg.correctly_serialize_static_compact_in_mc) {
|
||||
features.disable(sstable_feature::CorrectStaticCompact);
|
||||
}
|
||||
_sst.write_scylla_metadata(_pc, _shard, std::move(features));
|
||||
_cfg.monitor->on_write_completed();
|
||||
if (!_cfg.leave_unsealed) {
|
||||
|
||||
@@ -36,7 +36,5 @@ std::unique_ptr<sstable_writer::writer_impl> make_writer(sstable& sst,
|
||||
const io_priority_class& pc,
|
||||
shard_id shard);
|
||||
|
||||
serialization_header make_serialization_header(const schema&, const encoding_stats&);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -44,6 +44,14 @@ namespace sstables {
atomic_cell make_counter_cell(api::timestamp_type timestamp, bytes_view value) {
    static constexpr size_t shard_size = 32;

    if (value.empty()) {
        // This will never happen in a correct MC sstable but
        // we had a bug #4363 that caused empty counters
        // to be incorrectly stored inside sstables.
        counter_cell_builder ccb;
        return ccb.build(timestamp);
    }

    data_input in(value);

    auto header_size = in.read<int16_t>();
@@ -53,13 +61,12 @@ atomic_cell make_counter_cell(api::timestamp_type timestamp, bytes_view value) {
            throw marshal_exception("encountered a local shard in a counter cell");
        }
    }
    auto shard_count = value.size() / shard_size;
    auto header_length = (size_t(header_size) + 1) * sizeof(int16_t);
    auto shard_count = (value.size() - header_length) / shard_size;
    if (shard_count != size_t(header_size)) {
        throw marshal_exception("encountered remote shards in a counter cell");
    }

    std::vector<counter_shard> shards;
    shards.reserve(shard_count);
    counter_cell_builder ccb(shard_count);
    for (auto i = 0u; i < shard_count; i++) {
        auto id_hi = in.read<int64_t>();

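The corrected shard_count subtracts the serialized header before dividing: assuming the layout read here (an int16 count followed by header_size int16 entries, then 32-byte shards), the old formula also divides the header bytes and starts disagreeing with header_size once the header is large enough, causing a spurious exception. Worked numbers (illustrative, based on that assumed layout):

// Illustrative arithmetic, not part of the diff (layout assumed from the code above).
//   header_size   = 15
//   header_length = (15 + 1) * sizeof(int16_t) = 32 bytes
//   value.size()  = 32 + 15 * 32               = 512 bytes
//   old:  512 / 32        = 16   -> != header_size, spurious marshal_exception
//   new: (512 - 32) / 32  = 15   -> matches header_size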
@@ -702,9 +702,12 @@ public:
|
||||
// Sets streamed_mutation::_end_of_range when there are no more fragments for the query range.
|
||||
// Returns information whether the parser should continue to parse more
|
||||
// input and produce more fragments or we have collected enough and should yield.
|
||||
// Returns proceed:yes only when all pending fragments have been pushed.
|
||||
proceed push_ready_fragments() {
|
||||
if (_ready) {
|
||||
return push_ready_fragments_with_ready_set();
|
||||
if (push_ready_fragments_with_ready_set() == proceed::no) {
|
||||
return proceed::no;
|
||||
}
|
||||
}
|
||||
|
||||
if (_out_of_range) {
|
||||
@@ -808,6 +811,8 @@ class mp_row_consumer_m : public consumer_m {
|
||||
std::optional<new_mutation> _mutation;
|
||||
bool _is_mutation_end = true;
|
||||
streamed_mutation::forwarding _fwd;
|
||||
// For static-compact tables C* stores the only row in the static row but in our representation they're regular rows.
|
||||
const bool _treat_static_row_as_regular;
|
||||
|
||||
std::optional<clustering_row> _in_progress_row;
|
||||
std::optional<range_tombstone> _stored_tombstone;
|
||||
@@ -949,6 +954,8 @@ public:
|
||||
, _schema(schema)
|
||||
, _slice(slice)
|
||||
, _fwd(fwd)
|
||||
, _treat_static_row_as_regular(_schema->is_static_compact_table()
|
||||
&& (!sst->has_scylla_component() || sst->features().is_enabled(sstable_feature::CorrectStaticCompact))) // See #4139
|
||||
{
|
||||
_cells.reserve(std::max(_schema->static_columns_count(), _schema->regular_columns_count()));
|
||||
}
|
||||
@@ -1123,6 +1130,9 @@ public:
|
||||
|
||||
virtual consumer_m::row_processing_result consume_static_row_start() override {
|
||||
sstlog.trace("mp_row_consumer_m {}: consume_static_row_start()", this);
|
||||
if (_treat_static_row_as_regular) {
|
||||
return consume_row_start({});
|
||||
}
|
||||
_inside_static_row = true;
|
||||
_in_progress_static_row = static_row();
|
||||
return consumer_m::row_processing_result::do_proceed;
|
||||
|
||||
@@ -1023,9 +1023,26 @@ void sstable::write_simple(const T& component, const io_priority_class& pc) {
|
||||
options.buffer_size = sstable_buffer_size;
|
||||
options.io_priority_class = pc;
|
||||
auto w = file_writer(std::move(f), std::move(options));
|
||||
write(_version, w, component);
|
||||
w.flush();
|
||||
w.close();
|
||||
std::exception_ptr eptr;
|
||||
try {
|
||||
write(_version, w, component);
|
||||
w.flush();
|
||||
} catch (...) {
|
||||
eptr = std::current_exception();
|
||||
}
|
||||
try {
|
||||
w.close();
|
||||
} catch (...) {
|
||||
std::exception_ptr close_eptr = std::current_exception();
|
||||
sstlog.warn("failed to close file_writer: {}", close_eptr);
|
||||
// If write succeeded but close failed, we rethrow close's exception.
|
||||
if (!eptr) {
|
||||
eptr = close_eptr;
|
||||
}
|
||||
}
|
||||
if (eptr) {
|
||||
std::rethrow_exception(eptr);
|
||||
}
|
||||
}
|
||||
|
||||
template future<> sstable::read_simple<component_type::Filter>(sstables::filter& f, const io_priority_class& pc);
|
||||
@@ -1816,11 +1833,6 @@ void seal_statistics(sstable_version_types v, statistics& s, metadata_collector&
|
||||
collector.construct_stats(stats);
|
||||
s.contents[metadata_type::Stats] = std::make_unique<stats_metadata>(std::move(stats));
|
||||
|
||||
if (v == sstable_version_types::mc) {
|
||||
auto header = mc::make_serialization_header(*schema, enc_stats);
|
||||
s.contents[metadata_type::Serialization] = std::make_unique<serialization_header>(std::move(header));
|
||||
}
|
||||
|
||||
populate_statistics_offsets(v, s);
|
||||
}
|
||||
|
||||
@@ -2082,11 +2094,15 @@ stop_iteration components_writer::consume_end_of_partition() {
|
||||
_first_key = *_partition_key;
|
||||
}
|
||||
_last_key = std::move(*_partition_key);
|
||||
_partition_key = stdx::nullopt;
|
||||
|
||||
return get_offset() < _max_sstable_size ? stop_iteration::no : stop_iteration::yes;
|
||||
}
|
||||
|
||||
void components_writer::consume_end_of_stream() {
|
||||
if (_partition_key) {
|
||||
on_internal_error(sstlog, "Mutation stream ends with unclosed partition during write");
|
||||
}
|
||||
// what if there is only one partition? what if it is empty?
|
||||
seal_summary(_sst._components->summary, std::move(_first_key), std::move(_last_key), _index_sampling_state);
|
||||
|
||||
@@ -3084,6 +3100,10 @@ bool supports_correct_non_compound_range_tombstones() {
|
||||
return service::get_local_storage_service().cluster_supports_reading_correctly_serialized_range_tombstones();
|
||||
}
|
||||
|
||||
bool supports_correct_static_compact_in_mc() {
|
||||
return bool(service::get_local_storage_service().cluster_supports_correct_static_compact_in_mc());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
std::ostream& operator<<(std::ostream& out, const sstables::component_type& comp_type) {
|
||||
|
||||
@@ -104,6 +104,7 @@ class data_consume_context;
|
||||
class index_reader;
|
||||
|
||||
bool supports_correct_non_compound_range_tombstones();
|
||||
bool supports_correct_static_compact_in_mc();
|
||||
|
||||
struct sstable_writer_config {
|
||||
std::experimental::optional<size_t> promoted_index_block_size;
|
||||
@@ -113,6 +114,7 @@ struct sstable_writer_config {
|
||||
stdx::optional<db::replay_position> replay_position;
|
||||
write_monitor* monitor = &default_write_monitor();
|
||||
bool correctly_serialize_non_compound_range_tombstones = supports_correct_non_compound_range_tombstones();
|
||||
bool correctly_serialize_static_compact_in_mc = supports_correct_static_compact_in_mc();
|
||||
db::large_partition_handler* large_partition_handler;
|
||||
};
|
||||
|
||||
@@ -624,6 +626,13 @@ public:
|
||||
return has_scylla_component() && _components->scylla_metadata->has_feature(sstable_feature::ShadowableTombstones);
|
||||
}
|
||||
|
||||
sstable_enabled_features features() const {
|
||||
if (!has_scylla_component()) {
|
||||
return {};
|
||||
}
|
||||
return _components->scylla_metadata->get_features();
|
||||
}
|
||||
|
||||
bool has_correct_max_deletion_time() const {
|
||||
return (_version == sstable_version_types::mc) || has_scylla_component();
|
||||
}
|
||||
|
||||
@@ -410,16 +410,17 @@ struct serialization_header : public metadata_base<serialization_header> {
|
||||
}
|
||||
|
||||
// mc serialization header minimum values are delta-encoded based on the default timestamp epoch times
|
||||
// Note: following conversions rely on min_*_base.value being unsigned to prevent signed integer overflow
|
||||
api::timestamp_type get_min_timestamp() const {
|
||||
return static_cast<api::timestamp_type>(min_timestamp_base.value + encoding_stats::timestamp_epoch);
|
||||
}
|
||||
|
||||
int32_t get_min_ttl() const {
|
||||
return static_cast<int32_t>(min_ttl_base.value) + encoding_stats::ttl_epoch;
|
||||
return static_cast<int32_t>(min_ttl_base.value + encoding_stats::ttl_epoch);
|
||||
}
|
||||
|
||||
int32_t get_min_local_deletion_time() const {
|
||||
return static_cast<int32_t>(min_local_deletion_time_base.value) + encoding_stats::deletion_time_epoch;
|
||||
return static_cast<int32_t>(min_local_deletion_time_base.value + encoding_stats::deletion_time_epoch);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -455,7 +456,9 @@ enum sstable_feature : uint8_t {
|
||||
NonCompoundPIEntries = 0, // See #2993
|
||||
NonCompoundRangeTombstones = 1, // See #2986
|
||||
ShadowableTombstones = 2, // See #3885
|
||||
End = 4,
|
||||
CorrectStaticCompact = 3, // See #4139
|
||||
CorrectEmptyCounters = 4, // See #4363
|
||||
End = 5,
|
||||
};
|
||||
|
||||
// Scylla-specific features enabled for a particular sstable.
|
||||
@@ -504,9 +507,15 @@ struct scylla_metadata {
|
||||
disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::ExtensionAttributes, extension_attributes>
|
||||
> data;
|
||||
|
||||
bool has_feature(sstable_feature f) const {
|
||||
sstable_enabled_features get_features() const {
|
||||
auto features = data.get<scylla_metadata_type::Features, sstable_enabled_features>();
|
||||
return features && features->is_enabled(f);
|
||||
if (!features) {
|
||||
return sstable_enabled_features{};
|
||||
}
|
||||
return *features;
|
||||
}
|
||||
bool has_feature(sstable_feature f) const {
|
||||
return get_features().is_enabled(f);
|
||||
}
|
||||
const extension_attributes* get_extension_attributes() const {
|
||||
return data.get<scylla_metadata_type::ExtensionAttributes, extension_attributes>();
|
||||
|
||||
@@ -136,6 +136,9 @@ public:
|
||||
, _full_checksum(full_file_checksum)
|
||||
{}
|
||||
|
||||
virtual temporary_buffer<char> allocate_buffer(size_t size) override {
|
||||
return _out.allocate_buffer(size); // preserve alignment requirements
|
||||
}
|
||||
future<> put(net::packet data) { abort(); }
|
||||
virtual future<> put(temporary_buffer<char> buf) override {
|
||||
// bufs will usually be a multiple of chunk size, but this won't be the case for
|
||||
|
||||
@@ -292,7 +292,7 @@ void stream_manager::on_restart(inet_address endpoint, endpoint_state ep_state)
|
||||
}
|
||||
|
||||
void stream_manager::on_dead(inet_address endpoint, endpoint_state ep_state) {
|
||||
if (has_peer(endpoint) && ep_state.is_shutdown()) {
|
||||
if (has_peer(endpoint)) {
|
||||
sslog.info("stream_manager: Close all stream_session with peer = {} in on_dead", endpoint);
|
||||
get_stream_manager().invoke_on_all([endpoint] (auto& sm) {
|
||||
sm.fail_sessions(endpoint);
|
||||
|
||||
33
streaming/stream_mutation_fragments_cmd.hh
Normal file
@@ -0,0 +1,33 @@
/*
 * Copyright (C) 2019 ScyllaDB
 */

/*
 * This file is part of Scylla.
 *
 * Scylla is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Scylla is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Scylla. If not, see <http://www.gnu.org/licenses/>.
 */

#pragma once

namespace streaming {

enum class stream_mutation_fragments_cmd : uint8_t {
    error,
    mutation_fragment_data,
    end_of_stream,
};


}
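The new command enum lets the sender state explicitly how the fragment stream ends instead of relying on the source simply closing; the receiving handler later in this diff keeps consuming on mutation_fragment_data, finishes cleanly on end_of_stream, and fails the stream on error or an unknown value. A minimal standalone sketch of that dispatch (plain function, not the actual RPC plumbing):

// Illustrative sketch, not part of the diff; the enum mirrors the header above.
#include <cstdint>
#include <stdexcept>

enum class stream_mutation_fragments_cmd : uint8_t {
    error,
    mutation_fragment_data,
    end_of_stream,
};

// Returns true if more data should be consumed, false on a clean end of stream,
// and throws if the sender reported an error or sent an unknown command.
bool handle_cmd(stream_mutation_fragments_cmd cmd) {
    switch (cmd) {
    case stream_mutation_fragments_cmd::mutation_fragment_data:
        return true;                      // a fragment payload accompanies this command
    case stream_mutation_fragments_cmd::end_of_stream:
        return false;                     // sender finished cleanly
    case stream_mutation_fragments_cmd::error:
        throw std::runtime_error("sender failed");
    }
    throw std::runtime_error("sender sent wrong cmd");
}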
@@ -63,6 +63,7 @@
|
||||
#include "db/system_keyspace.hh"
|
||||
#include <boost/algorithm/cxx11/any_of.hpp>
|
||||
#include <boost/range/adaptor/map.hpp>
|
||||
#include "streaming/stream_mutation_fragments_cmd.hh"
|
||||
|
||||
namespace streaming {
|
||||
|
||||
@@ -214,22 +215,52 @@ void stream_session::init_messaging_service_handler() {
|
||||
});
});
});
ms().register_stream_mutation_fragments([] (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::optional<stream_reason> reason_opt, rpc::source<frozen_mutation_fragment> source) {
ms().register_stream_mutation_fragments([] (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::optional<stream_reason> reason_opt, rpc::source<frozen_mutation_fragment, rpc::optional<stream_mutation_fragments_cmd>> source) {
auto from = netw::messaging_service::get_source(cinfo);
auto reason = reason_opt ? *reason_opt: stream_reason::unspecified;
sslog.trace("Got stream_mutation_fragments from {} reason {}", from, int(reason));
if (!_sys_dist_ks->local_is_initialized() || !_view_update_generator->local_is_initialized()) {
return make_exception_future<rpc::sink<int>>(std::runtime_error(format("Node {} is not fully initialized for streaming, try again later",
utils::fb_utilities::get_broadcast_address())));
}
return with_scheduling_group(service::get_local_storage_service().db().local().get_streaming_scheduling_group(), [from, estimated_partitions, plan_id, schema_id, cf_id, source, reason] () mutable {
return service::get_schema_for_write(schema_id, from).then([from, estimated_partitions, plan_id, schema_id, cf_id, source, reason] (schema_ptr s) mutable {
auto sink = ms().make_sink_for_stream_mutation_fragments(source);
auto get_next_mutation_fragment = [source, plan_id, from, s] () mutable {
return source().then([plan_id, from, s] (stdx::optional<std::tuple<frozen_mutation_fragment>> fmf_opt) mutable {
if (fmf_opt) {
frozen_mutation_fragment& fmf = std::get<0>(fmf_opt.value());
struct stream_mutation_fragments_cmd_status {
bool got_cmd = false;
bool got_end_of_stream = false;
};
auto cmd_status = make_lw_shared<stream_mutation_fragments_cmd_status>();
auto get_next_mutation_fragment = [source, plan_id, from, s, cmd_status] () mutable {
return source().then([plan_id, from, s, cmd_status] (stdx::optional<std::tuple<frozen_mutation_fragment, rpc::optional<stream_mutation_fragments_cmd>>> opt) mutable {
if (opt) {
auto cmd = std::get<1>(*opt);
if (cmd) {
cmd_status->got_cmd = true;
switch (*cmd) {
case stream_mutation_fragments_cmd::mutation_fragment_data:
break;
case stream_mutation_fragments_cmd::error:
return make_exception_future<mutation_fragment_opt>(std::runtime_error("Sender failed"));
case stream_mutation_fragments_cmd::end_of_stream:
cmd_status->got_end_of_stream = true;
return make_ready_future<mutation_fragment_opt>();
default:
return make_exception_future<mutation_fragment_opt>(std::runtime_error("Sender sent wrong cmd"));
}
}
frozen_mutation_fragment& fmf = std::get<0>(*opt);
auto sz = fmf.representation().size();
auto mf = fmf.unfreeze(*s);
streaming::get_local_stream_manager().update_progress(plan_id, from.addr, progress_info::direction::IN, sz);
return make_ready_future<mutation_fragment_opt>(std::move(mf));
} else {
// If the sender has sent stream_mutation_fragments_cmd, it means it is
// a node that understands the new protocol. It must send end_of_stream
// before closing the stream.
if (cmd_status->got_cmd && !cmd_status->got_end_of_stream) {
return make_exception_future<mutation_fragment_opt>(std::runtime_error("Sender did not send end_of_stream"));
}
return make_ready_future<mutation_fragment_opt>();
}
});
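The receiver-side handler above revolves around the new stream_mutation_fragments_cmd channel: each frame now carries an optional command next to the frozen fragment, and a peer that speaks the new protocol has to announce end_of_stream before the RPC stream closes, otherwise the receiver treats the termination as a failure. A standalone sketch of those validation rules (plain C++17, no Seastar or Scylla types; only the enum names mirror the diff, the rest is illustrative):

```cpp
#include <optional>
#include <stdexcept>

// Command values mirroring the diff; the numeric encoding here is illustrative.
enum class stream_mutation_fragments_cmd { error, mutation_fragment_data, end_of_stream };

// What the receiver has observed so far, like stream_mutation_fragments_cmd_status above.
struct cmd_status {
    bool got_cmd = false;
    bool got_end_of_stream = false;
};

// Returns true when a data fragment should be consumed, false on a clean end of
// stream, and throws when the sender violates the protocol.
bool on_frame(std::optional<stream_mutation_fragments_cmd> cmd, bool stream_closed, cmd_status& st) {
    if (stream_closed) {
        // A sender that used commands must have sent end_of_stream before closing.
        if (st.got_cmd && !st.got_end_of_stream) {
            throw std::runtime_error("sender closed the stream without end_of_stream");
        }
        return false;
    }
    if (cmd) {
        st.got_cmd = true;
        switch (*cmd) {
        case stream_mutation_fragments_cmd::mutation_fragment_data:
            break;                                  // consume the fragment below
        case stream_mutation_fragments_cmd::end_of_stream:
            st.got_end_of_stream = true;
            return false;                           // clean termination
        case stream_mutation_fragments_cmd::error:
            throw std::runtime_error("sender reported a failure");
        }
    }
    return true;                                    // old-protocol frame or data frame
}
```

The diff keeps a default: branch ("Sender sent wrong cmd"), presumably because the command arrives over the wire and a newer sender could transmit a value this node does not recognize.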
@@ -644,8 +675,7 @@ void stream_session::close_session(stream_session_state final_state) {
_stream_result->handle_session_complete(shared_from_this());
}

sslog.debug("[Stream #{}] close_session session={}, state={}, cancel keep_alive timer", plan_id(), this, final_state);
_keep_alive.cancel();
sslog.debug("[Stream #{}] close_session session={}, state={}", plan_id(), this, final_state);
}
}

@@ -672,41 +702,6 @@ bool stream_session::is_initialized() const {

void stream_session::init(shared_ptr<stream_result_future> stream_result_) {
_stream_result = stream_result_;
_keep_alive.set_callback([this] {
auto plan_id = this->plan_id();
auto peer = this->peer;
get_local_stream_manager().get_progress_on_all_shards(plan_id, peer).then([this, peer, plan_id] (stream_bytes sbytes) {
if (this->_is_aborted) {
sslog.info("[Stream #{}] The session {} is closed, keep alive timer will do nothing", plan_id, this);
return;
}
auto now = lowres_clock::now();
sslog.debug("[Stream #{}] keep alive timer callback sbytes old: tx={}, rx={} new: tx={} rx={}",
plan_id, this->_last_stream_bytes.bytes_sent, this->_last_stream_bytes.bytes_received,
sbytes.bytes_sent, sbytes.bytes_received);
if (sbytes.bytes_sent > this->_last_stream_bytes.bytes_sent ||
sbytes.bytes_received > this->_last_stream_bytes.bytes_received) {
sslog.debug("[Stream #{}] The session {} made progress with peer {}", plan_id, this, peer);
// Progress has been made
this->_last_stream_bytes = sbytes;
this->_last_stream_progress = now;
this->start_keep_alive_timer();
} else if (now - this->_last_stream_progress >= this->_keep_alive_timeout) {
// Timeout
sslog.info("[Stream #{}] The session {} is idle for {} seconds, the peer {} is probably gone, close it",
plan_id, this, this->_keep_alive_timeout.count(), peer);
this->on_error();
} else {
// Start the timer to check again
sslog.info("[Stream #{}] The session {} made no progress with peer {}", plan_id, this, peer);
this->start_keep_alive_timer();
}
}).handle_exception([plan_id, peer, session = this->shared_from_this()] (auto ep) {
sslog.info("[Stream #{}] keep alive timer callback fails with peer {}: {}", plan_id, peer, ep);
});
});
_last_stream_progress = lowres_clock::now();
start_keep_alive_timer();
}

utils::UUID stream_session::plan_id() {

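The keep-alive callback shown in this hunk works on byte counters rather than explicit pings: every _keep_alive_interval it samples the per-plan stream_bytes, records progress when either direction moved, and gives up on the peer once nothing has moved for _keep_alive_timeout. A minimal sketch of that decision in plain C++ with std::chrono (the names and steady_clock are stand-ins; the real code runs on Seastar's lowres_clock timer):

```cpp
#include <chrono>

// Stand-in for streaming::stream_bytes.
struct stream_bytes { long long bytes_sent = 0; long long bytes_received = 0; };

enum class keep_alive_action { progress, timeout, retry };

// One tick of the keep-alive check: compare freshly sampled counters with the
// last snapshot and decide whether to record progress, declare the peer gone,
// or simply re-arm the timer and look again later.
keep_alive_action keep_alive_tick(const stream_bytes& sampled,
                                  stream_bytes& last,
                                  std::chrono::steady_clock::time_point& last_progress,
                                  std::chrono::seconds timeout,
                                  std::chrono::steady_clock::time_point now) {
    if (sampled.bytes_sent > last.bytes_sent || sampled.bytes_received > last.bytes_received) {
        last = sampled;
        last_progress = now;
        return keep_alive_action::progress;   // made progress, re-arm the timer
    }
    if (now - last_progress >= timeout) {
        return keep_alive_action::timeout;    // idle too long, treat the peer as gone
    }
    return keep_alive_action::retry;          // no progress yet, check again later
}
```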
@@ -180,14 +180,6 @@ private:
bool _complete_sent = false;
bool _received_failed_complete_message = false;

// If the session is idle for 10 minutes, close the session
std::chrono::seconds _keep_alive_timeout{60 * 10};
// Check every 1 minute
std::chrono::seconds _keep_alive_interval{60};
timer<lowres_clock> _keep_alive;
stream_bytes _last_stream_bytes;
lowres_clock::time_point _last_stream_progress;

session_info _session_info;

stream_reason _reason = stream_reason::unspecified;
@@ -198,9 +190,6 @@ public:
void set_reason(stream_reason reason) {
_reason = reason;
}
void start_keep_alive_timer() {
_keep_alive.rearm(lowres_clock::now() + _keep_alive_interval);
}

void add_bytes_sent(int64_t bytes) {
_bytes_sent += bytes;

@@ -42,6 +42,7 @@
#include "streaming/stream_session.hh"
#include "streaming/stream_manager.hh"
#include "streaming/stream_reason.hh"
#include "streaming/stream_mutation_fragments_cmd.hh"
#include "mutation_reader.hh"
#include "frozen_mutation.hh"
#include "mutation.hh"
@@ -104,6 +105,21 @@ struct send_info {
, prs(to_partition_ranges(ranges))
, reader(cf.make_streaming_reader(cf.schema(), prs)) {
}
future<bool> has_relevant_range_on_this_shard() {
return do_with(false, [this] (bool& found_relevant_range) {
return do_for_each(ranges, [this, &found_relevant_range] (dht::token_range range) {
if (!found_relevant_range) {
auto sharder = dht::selective_token_range_sharder(range, engine().cpu_id());
auto range_shard = sharder.next();
if (range_shard) {
found_relevant_range = true;
}
}
}).then([&found_relevant_range] {
return found_relevant_range;
});
});
}
future<size_t> estimate_partitions() {
return do_with(cf.get_sstables(), size_t(0), [this] (auto& sstables, size_t& partition_count) {
return do_for_each(*sstables, [this, &partition_count] (auto& sst) {
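has_relevant_range_on_this_shard() lets each shard bail out early when none of the requested token ranges intersect the tokens it owns, instead of constructing a streaming reader that would produce nothing. A rough standalone illustration of the idea, with a toy ownership rule standing in for dht::selective_token_range_sharder (which slices a range down to the sub-ranges owned by one shard):

```cpp
#include <cstdint>
#include <vector>

// Toy token range; Scylla's dht::token_range is considerably richer.
struct token_range { int64_t start; int64_t end; };  // inclusive bounds

// Hypothetical ownership check: assume tokens are assigned to shards round-robin.
// A real implementation would ask the sharder for the first owned sub-range.
bool shard_owns_part_of(const token_range& r, unsigned shard, unsigned shard_count) {
    if (r.end - r.start + 1 >= static_cast<int64_t>(shard_count)) {
        return true;  // a range this long touches every shard under round-robin
    }
    for (int64_t t = r.start; t <= r.end; ++t) {
        if (static_cast<unsigned>(((t % shard_count) + shard_count) % shard_count) == shard) {
            return true;
        }
    }
    return false;
}

// Mirrors has_relevant_range_on_this_shard(): stop at the first relevant range.
bool has_relevant_range(const std::vector<token_range>& ranges, unsigned shard, unsigned shard_count) {
    for (const auto& r : ranges) {
        if (shard_owns_part_of(r, shard, shard_count)) {
            return true;
        }
    }
    return false;
}
```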
@@ -160,7 +176,7 @@ future<> send_mutations(lw_shared_ptr<send_info> si) {
future<> send_mutation_fragments(lw_shared_ptr<send_info> si) {
return si->estimate_partitions().then([si] (size_t estimated_partitions) {
sslog.info("[Stream #{}] Start sending ks={}, cf={}, estimated_partitions={}, with new rpc streaming", si->plan_id, si->cf.schema()->ks_name(), si->cf.schema()->cf_name(), estimated_partitions);
return netw::get_local_messaging_service().make_sink_and_source_for_stream_mutation_fragments(si->reader.schema()->version(), si->plan_id, si->cf_id, estimated_partitions, si->reason, si->id).then([si] (rpc::sink<frozen_mutation_fragment> sink, rpc::source<int32_t> source) mutable {
return netw::get_local_messaging_service().make_sink_and_source_for_stream_mutation_fragments(si->reader.schema()->version(), si->plan_id, si->cf_id, estimated_partitions, si->reason, si->id).then([si] (rpc::sink<frozen_mutation_fragment, stream_mutation_fragments_cmd> sink, rpc::source<int32_t> source) mutable {
auto got_error_from_peer = make_lw_shared<bool>(false);

auto source_op = [source, got_error_from_peer, si] () mutable -> future<> {
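got_error_from_peer is the coordination point between the two fibers created here: source_op (whose body is not shown between these two hunks) drains the receiver's reply channel and sets the flag when the peer reports trouble, while the sink loop below consults it before every fragment so a failing receiver stops the transfer quickly instead of after the whole table has been read. A condensed illustration of that hand-off using plain threads (the actual code uses Seastar futures on a single shard, not std::thread; all names here are made up):

```cpp
#include <atomic>
#include <chrono>
#include <iostream>
#include <thread>

int main() {
    // Shared stop flag, playing the role of got_error_from_peer.
    std::atomic<bool> got_error_from_peer{false};

    // "source_op": watches the return channel and flips the flag on a reported error.
    std::thread source_op([&] {
        std::this_thread::sleep_for(std::chrono::milliseconds(5));
        got_error_from_peer = true;              // pretend the peer signalled a failure
    });

    // "sink_op": keeps sending fragments until the data runs out or the flag is set.
    long fragments_sent = 0;
    while (!got_error_from_peer && fragments_sent < 1000000) {
        ++fragments_sent;                        // stand-in for sink(fmf, ...)
        std::this_thread::sleep_for(std::chrono::microseconds(1));
    }
    source_op.join();
    std::cout << "stopped after " << fragments_sent << " fragments\n";
    return 0;
}
```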
@@ -183,18 +199,25 @@ future<> send_mutation_fragments(lw_shared_ptr<send_info> si) {
}();

auto sink_op = [sink, si, got_error_from_peer] () mutable -> future<> {
return do_with(std::move(sink), [si, got_error_from_peer] (rpc::sink<frozen_mutation_fragment>& sink) {
return do_with(std::move(sink), [si, got_error_from_peer] (rpc::sink<frozen_mutation_fragment, stream_mutation_fragments_cmd>& sink) {
return repeat([&sink, si, got_error_from_peer] () mutable {
return si->reader(db::no_timeout).then([&sink, si, s = si->reader.schema(), got_error_from_peer] (mutation_fragment_opt mf) mutable {
if (mf && !(*got_error_from_peer)) {
frozen_mutation_fragment fmf = freeze(*s, *mf);
auto size = fmf.representation().size();
streaming::get_local_stream_manager().update_progress(si->plan_id, si->id.addr, streaming::progress_info::direction::OUT, size);
return sink(fmf).then([] { return stop_iteration::no; });
return sink(fmf, stream_mutation_fragments_cmd::mutation_fragment_data).then([] { return stop_iteration::no; });
} else {
return make_ready_future<stop_iteration>(stop_iteration::yes);
}
});
}).then([&sink] () mutable {
return sink(frozen_mutation_fragment(bytes_ostream()), stream_mutation_fragments_cmd::end_of_stream);
}).handle_exception([&sink] (std::exception_ptr ep) mutable {
// Notify the receiver the sender has failed
return sink(frozen_mutation_fragment(bytes_ostream()), stream_mutation_fragments_cmd::error).then([ep = std::move(ep)] () mutable {
return make_exception_future<>(std::move(ep));
});
}).finally([&sink] () mutable {
return sink.close();
});
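The sender loop above fixes a simple framing contract: every data frame is tagged mutation_fragment_data, a successful run ends with exactly one end_of_stream frame, and an exception while reading is turned into an error frame before the exception is re-thrown and the sink is closed, so the receiver can tell a clean finish from a crash. A compact sketch of that ordering with a callback standing in for the rpc::sink (illustrative only, not the Seastar API):

```cpp
#include <functional>
#include <optional>
#include <string>
#include <utility>

enum class stream_mutation_fragments_cmd { error, mutation_fragment_data, end_of_stream };

// A frame is (payload, command); control frames carry an empty payload, just as
// the diff sends frozen_mutation_fragment(bytes_ostream()) alongside the command.
using frame = std::pair<std::string, stream_mutation_fragments_cmd>;

// Same ordering as sink_op: data frames while fragments remain, then end_of_stream
// on success, or a single error frame if producing a fragment throws.
void send_all(const std::function<std::optional<std::string>()>& next_fragment,
              const std::function<void(const frame&)>& sink) {
    try {
        while (auto payload = next_fragment()) {
            sink({*payload, stream_mutation_fragments_cmd::mutation_fragment_data});
        }
        sink({"", stream_mutation_fragments_cmd::end_of_stream});
    } catch (...) {
        sink({"", stream_mutation_fragments_cmd::error});   // tell the receiver we failed
        throw;                                              // then propagate, like the diff
    }
}
```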
@@ -221,11 +244,18 @@ future<> stream_transfer_task::execute() {
auto reason = session->get_reason();
return session->get_db().invoke_on_all([plan_id, cf_id, id, dst_cpu_id, ranges=this->_ranges, streaming_with_rpc_stream, reason] (database& db) {
auto si = make_lw_shared<send_info>(db, plan_id, cf_id, std::move(ranges), id, dst_cpu_id, reason);
if (streaming_with_rpc_stream) {
return send_mutation_fragments(std::move(si));
} else {
return send_mutations(std::move(si));
}
return si->has_relevant_range_on_this_shard().then([si, plan_id, cf_id, streaming_with_rpc_stream] (bool has_relevant_range_on_this_shard) {
if (!has_relevant_range_on_this_shard) {
sslog.debug("[Stream #{}] stream_transfer_task: cf_id={}: ignore ranges on shard={}",
plan_id, cf_id, engine().cpu_id());
return make_ready_future<>();
}
if (streaming_with_rpc_stream) {
return send_mutation_fragments(std::move(si));
} else {
return send_mutations(std::move(si));
}
});
}).then([this, plan_id, cf_id, id, streaming_with_rpc_stream] {
sslog.debug("[Stream #{}] SEND STREAM_MUTATION_DONE to {}, cf_id={}", plan_id, id, cf_id);
return session->ms().send_stream_mutation_done(id, plan_id, _ranges,
@@ -235,7 +265,6 @@ future<> stream_transfer_task::execute() {
});
}).then([this, id, plan_id, cf_id] {
sslog.debug("[Stream #{}] GOT STREAM_MUTATION_DONE Reply from {}", plan_id, id.addr);
session->start_keep_alive_timer();
}).handle_exception([this, plan_id, id] (auto ep){
sslog.warn("[Stream #{}] stream_transfer_task: Fail to send to {}: {}", plan_id, id, ep);
std::rethrow_exception(ep);
Some files were not shown because too many files have changed in this diff.