Compare commits
174 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b252bba4a2 | ||
|
|
a0b9fcc041 | ||
|
|
35c9b675c1 | ||
|
|
d71836fef7 | ||
|
|
f8e150e97c | ||
|
|
10c300f894 | ||
|
|
de1d3e5c6b | ||
|
|
69810c13ca | ||
|
|
9b025a5742 | ||
|
|
74eebc4cab | ||
|
|
9b2ca4ee44 | ||
|
|
773bf45774 | ||
|
|
c6705b4335 | ||
|
|
3997871b4d | ||
|
|
4ff1d731bd | ||
|
|
0e0f9143c9 | ||
|
|
9d809d6ea4 | ||
|
|
630d599c34 | ||
|
|
0933c1a00a | ||
|
|
7a7099fcfb | ||
|
|
50235aacb4 | ||
|
|
e888009f12 | ||
|
|
a19615ee9b | ||
|
|
357ca67fda | ||
|
|
7818c63eb1 | ||
|
|
da10eae18c | ||
|
|
d5292cd3ec | ||
|
|
9cb35361d9 | ||
|
|
3e285248be | ||
|
|
6f10ccb441 | ||
|
|
df420499bc | ||
|
|
d29527b4e1 | ||
|
|
8a90e242e4 | ||
|
|
8a78c0aba9 | ||
|
|
8a2bbcf138 | ||
|
|
22c891e6df | ||
|
|
1841d0c2d9 | ||
|
|
e10107fe5a | ||
|
|
0b3a4679db | ||
|
|
ba60d666a9 | ||
|
|
6ea4d0b75c | ||
|
|
8c5911f312 | ||
|
|
de00d7f5a1 | ||
|
|
e5f9dae4bb | ||
|
|
e13e796290 | ||
|
|
336c771663 | ||
|
|
82968afc25 | ||
|
|
383dcffb53 | ||
|
|
0c2abc007c | ||
|
|
1498c4f150 | ||
|
|
f388992a94 | ||
|
|
310540c11f | ||
|
|
7d833023cc | ||
|
|
d94ac196e0 | ||
|
|
1d7430995e | ||
|
|
b662a7f8a4 | ||
|
|
447ad72882 | ||
|
|
b8485d3bce | ||
|
|
034b0f50db | ||
|
|
12ec0becf3 | ||
|
|
666b19552d | ||
|
|
178f870a03 | ||
|
|
1b18f16dc1 | ||
|
|
28934575e4 | ||
|
|
182cbeefb0 | ||
|
|
b70fc41a90 | ||
|
|
debfc795b2 | ||
|
|
0d094575ec | ||
|
|
20baef69a9 | ||
|
|
1bac88601d | ||
|
|
e581fd1463 | ||
|
|
b366bff998 | ||
|
|
38e6984ba5 | ||
|
|
332f76579e | ||
|
|
315a03cf6c | ||
|
|
1847dc7a6a | ||
|
|
dd11b5987e | ||
|
|
a134e8699a | ||
|
|
bd7dcbb8d2 | ||
|
|
74e61528a6 | ||
|
|
5eb4fde2d5 | ||
|
|
cc0703f8ca | ||
|
|
678283a5bb | ||
|
|
552c0d7641 | ||
|
|
860c06660b | ||
|
|
db733ba075 | ||
|
|
88677d39c8 | ||
|
|
d767dee5ec | ||
|
|
702f6ee1b7 | ||
|
|
473b9aec65 | ||
|
|
b548061257 | ||
|
|
01165a9ae7 | ||
|
|
5cdb963768 | ||
|
|
7c9b9a4e24 | ||
|
|
f475c65ae6 | ||
|
|
687372bc48 | ||
|
|
65c140121c | ||
|
|
ed68ad220f | ||
|
|
35f4b8fbbe | ||
|
|
48012fe418 | ||
|
|
c862ccda91 | ||
|
|
83b1057c4b | ||
|
|
c1cb779dd2 | ||
|
|
b47d18f9fd | ||
|
|
f8713b019e | ||
|
|
cd5e4eace5 | ||
|
|
4fb5403670 | ||
|
|
e9df6c42ce | ||
|
|
5fdf492ccc | ||
|
|
fd2b02a12c | ||
|
|
f8cec2f891 | ||
|
|
e4d6577ef2 | ||
|
|
346027248d | ||
|
|
2cf6191353 | ||
|
|
b52d647de2 | ||
|
|
f7c96a37f1 | ||
|
|
ae71ffdcfd | ||
|
|
a235900388 | ||
|
|
be9f150341 | ||
|
|
2478fa1f6e | ||
|
|
d95ac1826e | ||
|
|
6fc17345e9 | ||
|
|
4bfa0ae247 | ||
|
|
174b7870e6 | ||
|
|
e95b4ee825 | ||
|
|
464305de1c | ||
|
|
3a1a9e1a11 | ||
|
|
90dac5d944 | ||
|
|
e5a83d105c | ||
|
|
9b4a0a2879 | ||
|
|
adad12ddc3 | ||
|
|
a77bb1fe34 | ||
|
|
3c7e6dfdb9 | ||
|
|
fab136ae1d | ||
|
|
a4218f536b | ||
|
|
9f4431ef04 | ||
|
|
66250bf8cc | ||
|
|
88fe3c2694 | ||
|
|
db4c3d3e52 | ||
|
|
ca22a1cd1a | ||
|
|
f9b702764e | ||
|
|
54701bd95c | ||
|
|
30eca5f534 | ||
|
|
cd057d3882 | ||
|
|
c5a5a2265e | ||
|
|
3e482c6c9d | ||
|
|
5b6cadb890 | ||
|
|
9cf8cd6c02 | ||
|
|
b34567b69b | ||
|
|
02b763ed97 | ||
|
|
05500a52d7 | ||
|
|
4afa558e97 | ||
|
|
f3956421f7 | ||
|
|
a17a6ce8f5 | ||
|
|
58a362c1f2 | ||
|
|
361b2dd7a5 | ||
|
|
f6a2bafae2 | ||
|
|
2ec25a55cd | ||
|
|
d3fb7c5515 | ||
|
|
b1ac6a36f2 | ||
|
|
8cba125bce | ||
|
|
f46f9f7533 | ||
|
|
090d991f8e | ||
|
|
ae15a80d01 | ||
|
|
6cf902343a | ||
|
|
d5e59f671c | ||
|
|
38944655c5 | ||
|
|
06e274ff34 | ||
|
|
c24d4a8acb | ||
|
|
5f95b76c65 | ||
|
|
0bdb7e1e7c | ||
|
|
56ea4f3154 | ||
|
|
d9c178063c | ||
|
|
b21b7f73b9 |
6
.gitmodules
vendored
6
.gitmodules
vendored
@@ -6,9 +6,9 @@
|
||||
path = swagger-ui
|
||||
url = ../scylla-swagger-ui
|
||||
ignore = dirty
|
||||
[submodule "dist/ami/files/scylla-ami"]
|
||||
path = dist/ami/files/scylla-ami
|
||||
url = ../scylla-ami
|
||||
[submodule "xxHash"]
|
||||
path = xxHash
|
||||
url = ../xxHash
|
||||
[submodule "libdeflate"]
|
||||
path = libdeflate
|
||||
url = ../libdeflate
|
||||
|
||||
@@ -138,5 +138,4 @@ target_include_directories(scylla PUBLIC
|
||||
${SEASTAR_INCLUDE_DIRS}
|
||||
${Boost_INCLUDE_DIRS}
|
||||
xxhash
|
||||
libdeflate
|
||||
build/release/gen)
|
||||
|
||||
@@ -20,7 +20,7 @@ $ git submodule update --init --recursive
|
||||
|
||||
Scylla depends on the system package manager for its development dependencies.
|
||||
|
||||
Running `./install-dependencies.sh` (as root) installs the appropriate packages based on your Linux distribution.
|
||||
Running `./install_dependencies.sh` (as root) installs the appropriate packages based on your Linux distribution.
|
||||
|
||||
### Build system
|
||||
|
||||
|
||||
@@ -50,12 +50,12 @@ Then, to build an RPM, run:
|
||||
./dist/redhat/build_rpm.sh
|
||||
```
|
||||
|
||||
The built RPM is stored in the ``build/mock/<configuration>/result`` directory.
|
||||
The built RPM is stored in ``/var/lib/mock/<configuration>/result`` directory.
|
||||
For example, on Fedora 21 mock reports the following:
|
||||
|
||||
```
|
||||
INFO: Done(scylla-server-0.00-1.fc21.src.rpm) Config(default) 20 minutes 7 seconds
|
||||
INFO: Results and/or logs in: build/mock/fedora-21-x86_64/result
|
||||
INFO: Results and/or logs in: /var/lib/mock/fedora-21-x86_64/result
|
||||
```
|
||||
|
||||
## Building Fedora-based Docker image
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#!/bin/sh
|
||||
|
||||
VERSION=3.0.11
|
||||
VERSION=2.3.6
|
||||
|
||||
if test -f version
|
||||
then
|
||||
|
||||
@@ -78,17 +78,15 @@ void set_storage_service(http_context& ctx, routes& r) {
|
||||
});
|
||||
});
|
||||
|
||||
ss::get_tokens.set(r, [] (std::unique_ptr<request> req) {
|
||||
return make_ready_future<json::json_return_type>(stream_range_as_array(service::get_local_storage_service().get_token_metadata().sorted_tokens(), [](const dht::token& i) {
|
||||
return boost::lexical_cast<std::string>(i);
|
||||
}));
|
||||
ss::get_tokens.set(r, [] (const_req req) {
|
||||
auto tokens = service::get_local_storage_service().get_token_metadata().sorted_tokens();
|
||||
return container_to_vec(tokens);
|
||||
});
|
||||
|
||||
ss::get_node_tokens.set(r, [] (std::unique_ptr<request> req) {
|
||||
gms::inet_address addr(req->param["endpoint"]);
|
||||
return make_ready_future<json::json_return_type>(stream_range_as_array(service::get_local_storage_service().get_token_metadata().get_tokens(addr), [](const dht::token& i) {
|
||||
return boost::lexical_cast<std::string>(i);
|
||||
}));
|
||||
ss::get_node_tokens.set(r, [] (const_req req) {
|
||||
gms::inet_address addr(req.param["endpoint"]);
|
||||
auto tokens = service::get_local_storage_service().get_token_metadata().get_tokens(addr);
|
||||
return container_to_vec(tokens);
|
||||
});
|
||||
|
||||
ss::get_commitlog.set(r, [&ctx](const_req req) {
|
||||
@@ -109,7 +107,11 @@ void set_storage_service(http_context& ctx, routes& r) {
|
||||
});
|
||||
|
||||
ss::get_moving_nodes.set(r, [](const_req req) {
|
||||
auto points = service::get_local_storage_service().get_token_metadata().get_moving_endpoints();
|
||||
std::unordered_set<sstring> addr;
|
||||
for (auto i: points) {
|
||||
addr.insert(boost::lexical_cast<std::string>(i.second));
|
||||
}
|
||||
return container_to_vec(addr);
|
||||
});
|
||||
|
||||
|
||||
@@ -47,23 +47,6 @@ atomic_cell atomic_cell::make_live(const abstract_type& type, api::timestamp_typ
|
||||
);
|
||||
}
|
||||
|
||||
atomic_cell atomic_cell::make_live(const abstract_type& type, api::timestamp_type timestamp, ser::buffer_view<bytes_ostream::fragment_iterator> value, atomic_cell::collection_member cm) {
|
||||
auto& imr_data = type.imr_state();
|
||||
return atomic_cell(
|
||||
imr_data.type_info(),
|
||||
imr_object_type::make(data::cell::make_live(imr_data.type_info(), timestamp, value, bool(cm)), &imr_data.lsa_migrator())
|
||||
);
|
||||
}
|
||||
|
||||
atomic_cell atomic_cell::make_live(const abstract_type& type, api::timestamp_type timestamp, const fragmented_temporary_buffer::view& value, collection_member cm)
|
||||
{
|
||||
auto& imr_data = type.imr_state();
|
||||
return atomic_cell(
|
||||
imr_data.type_info(),
|
||||
imr_object_type::make(data::cell::make_live(imr_data.type_info(), timestamp, value, bool(cm)), &imr_data.lsa_migrator())
|
||||
);
|
||||
}
|
||||
|
||||
atomic_cell atomic_cell::make_live(const abstract_type& type, api::timestamp_type timestamp, bytes_view value,
|
||||
gc_clock::time_point expiry, gc_clock::duration ttl, atomic_cell::collection_member cm) {
|
||||
auto& imr_data = type.imr_state();
|
||||
@@ -73,25 +56,6 @@ atomic_cell atomic_cell::make_live(const abstract_type& type, api::timestamp_typ
|
||||
);
|
||||
}
|
||||
|
||||
atomic_cell atomic_cell::make_live(const abstract_type& type, api::timestamp_type timestamp, ser::buffer_view<bytes_ostream::fragment_iterator> value,
|
||||
gc_clock::time_point expiry, gc_clock::duration ttl, atomic_cell::collection_member cm) {
|
||||
auto& imr_data = type.imr_state();
|
||||
return atomic_cell(
|
||||
imr_data.type_info(),
|
||||
imr_object_type::make(data::cell::make_live(imr_data.type_info(), timestamp, value, expiry, ttl, bool(cm)), &imr_data.lsa_migrator())
|
||||
);
|
||||
}
|
||||
|
||||
atomic_cell atomic_cell::make_live(const abstract_type& type, api::timestamp_type timestamp, const fragmented_temporary_buffer::view& value,
|
||||
gc_clock::time_point expiry, gc_clock::duration ttl, collection_member cm)
|
||||
{
|
||||
auto& imr_data = type.imr_state();
|
||||
return atomic_cell(
|
||||
imr_data.type_info(),
|
||||
imr_object_type::make(data::cell::make_live(imr_data.type_info(), timestamp, value, expiry, ttl, bool(cm)), &imr_data.lsa_migrator())
|
||||
);
|
||||
}
|
||||
|
||||
atomic_cell atomic_cell::make_live_counter_update(api::timestamp_type timestamp, int64_t value) {
|
||||
auto& imr_data = no_type_imr_descriptor();
|
||||
return atomic_cell(
|
||||
|
||||
@@ -33,9 +33,6 @@
|
||||
#include "data/cell.hh"
|
||||
#include "data/schema_info.hh"
|
||||
#include "imr/utils.hh"
|
||||
#include "utils/fragmented_temporary_buffer.hh"
|
||||
|
||||
#include "serializer.hh"
|
||||
|
||||
class abstract_type;
|
||||
class collection_type_impl;
|
||||
@@ -189,10 +186,6 @@ public:
|
||||
static atomic_cell make_dead(api::timestamp_type timestamp, gc_clock::time_point deletion_time);
|
||||
static atomic_cell make_live(const abstract_type& type, api::timestamp_type timestamp, bytes_view value,
|
||||
collection_member = collection_member::no);
|
||||
static atomic_cell make_live(const abstract_type& type, api::timestamp_type timestamp, ser::buffer_view<bytes_ostream::fragment_iterator> value,
|
||||
collection_member = collection_member::no);
|
||||
static atomic_cell make_live(const abstract_type& type, api::timestamp_type timestamp, const fragmented_temporary_buffer::view& value,
|
||||
collection_member = collection_member::no);
|
||||
static atomic_cell make_live(const abstract_type& type, api::timestamp_type timestamp, const bytes& value,
|
||||
collection_member cm = collection_member::no) {
|
||||
return make_live(type, timestamp, bytes_view(value), cm);
|
||||
@@ -200,10 +193,6 @@ public:
|
||||
static atomic_cell make_live_counter_update(api::timestamp_type timestamp, int64_t value);
|
||||
static atomic_cell make_live(const abstract_type&, api::timestamp_type timestamp, bytes_view value,
|
||||
gc_clock::time_point expiry, gc_clock::duration ttl, collection_member = collection_member::no);
|
||||
static atomic_cell make_live(const abstract_type&, api::timestamp_type timestamp, ser::buffer_view<bytes_ostream::fragment_iterator> value,
|
||||
gc_clock::time_point expiry, gc_clock::duration ttl, collection_member = collection_member::no);
|
||||
static atomic_cell make_live(const abstract_type&, api::timestamp_type timestamp, const fragmented_temporary_buffer::view& value,
|
||||
gc_clock::time_point expiry, gc_clock::duration ttl, collection_member = collection_member::no);
|
||||
static atomic_cell make_live(const abstract_type& type, api::timestamp_type timestamp, const bytes& value,
|
||||
gc_clock::time_point expiry, gc_clock::duration ttl, collection_member cm = collection_member::no)
|
||||
{
|
||||
|
||||
@@ -87,17 +87,11 @@ future<> create_metadata_table_if_missing(
|
||||
return mm.announce_new_column_family(b.build(), false);
|
||||
}
|
||||
|
||||
future<> wait_for_schema_agreement(::service::migration_manager& mm, const database& db, seastar::abort_source& as) {
|
||||
future<> wait_for_schema_agreement(::service::migration_manager& mm, const database& db) {
|
||||
static const auto pause = [] { return sleep(std::chrono::milliseconds(500)); };
|
||||
|
||||
return do_until([&db, &as] {
|
||||
as.check();
|
||||
return db.get_version() != database::empty_version;
|
||||
}, pause).then([&mm, &as] {
|
||||
return do_until([&mm, &as] {
|
||||
as.check();
|
||||
return mm.have_schema_agreement();
|
||||
}, pause);
|
||||
return do_until([&db] { return db.get_version() != database::empty_version; }, pause).then([&mm] {
|
||||
return do_until([&mm] { return mm.have_schema_agreement(); }, pause);
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -81,7 +81,7 @@ future<> create_metadata_table_if_missing(
|
||||
stdx::string_view cql,
|
||||
::service::migration_manager&);
|
||||
|
||||
future<> wait_for_schema_agreement(::service::migration_manager&, const database&, seastar::abort_source&);
|
||||
future<> wait_for_schema_agreement(::service::migration_manager&, const database&);
|
||||
|
||||
///
|
||||
/// Time-outs for internal, non-local CQL queries.
|
||||
|
||||
@@ -160,7 +160,7 @@ future<> default_authorizer::start() {
|
||||
_migration_manager).then([this] {
|
||||
_finished = do_after_system_ready(_as, [this] {
|
||||
return async([this] {
|
||||
wait_for_schema_agreement(_migration_manager, _qp.db().local(), _as).get0();
|
||||
wait_for_schema_agreement(_migration_manager, _qp.db().local()).get0();
|
||||
|
||||
if (legacy_metadata_exists()) {
|
||||
if (!any_granted().get0()) {
|
||||
@@ -178,7 +178,7 @@ future<> default_authorizer::start() {
|
||||
|
||||
future<> default_authorizer::stop() {
|
||||
_as.request_abort();
|
||||
return _finished.handle_exception_type([](const sleep_aborted&) {}).handle_exception_type([](const abort_requested_exception&) {});
|
||||
return _finished.handle_exception_type([](const sleep_aborted&) {});
|
||||
}
|
||||
|
||||
future<permission_set>
|
||||
|
||||
@@ -41,6 +41,11 @@
|
||||
|
||||
#include "auth/password_authenticator.hh"
|
||||
|
||||
extern "C" {
|
||||
#include <crypt.h>
|
||||
#include <unistd.h>
|
||||
}
|
||||
|
||||
#include <algorithm>
|
||||
#include <chrono>
|
||||
#include <random>
|
||||
@@ -50,7 +55,6 @@
|
||||
|
||||
#include "auth/authenticated_user.hh"
|
||||
#include "auth/common.hh"
|
||||
#include "auth/passwords.hh"
|
||||
#include "auth/roles-metadata.hh"
|
||||
#include "cql3/untyped_result_set.hh"
|
||||
#include "log.hh"
|
||||
@@ -78,8 +82,6 @@ static const class_registrator<
|
||||
cql3::query_processor&,
|
||||
::service::migration_manager&> password_auth_reg("org.apache.cassandra.auth.PasswordAuthenticator");
|
||||
|
||||
static thread_local auto rng_for_salt = std::default_random_engine(std::random_device{}());
|
||||
|
||||
password_authenticator::~password_authenticator() {
|
||||
}
|
||||
|
||||
@@ -89,6 +91,78 @@ password_authenticator::password_authenticator(cql3::query_processor& qp, ::serv
|
||||
, _stopped(make_ready_future<>()) {
|
||||
}
|
||||
|
||||
// TODO: blowfish
|
||||
// Origin uses Java bcrypt library, i.e. blowfish salt
|
||||
// generation and hashing, which is arguably a "better"
|
||||
// password hash than sha/md5 versions usually available in
|
||||
// crypt_r. Otoh, glibc 2.7+ uses a modified sha512 algo
|
||||
// which should be the same order of safe, so the only
|
||||
// real issue should be salted hash compatibility with
|
||||
// origin if importing system tables from there.
|
||||
//
|
||||
// Since bcrypt/blowfish is _not_ (afaict) not available
|
||||
// as a dev package/lib on most linux distros, we'd have to
|
||||
// copy and compile for example OWL crypto
|
||||
// (http://cvsweb.openwall.com/cgi/cvsweb.cgi/Owl/packages/glibc/crypt_blowfish/)
|
||||
// to be fully bit-compatible.
|
||||
//
|
||||
// Until we decide this is needed, let's just use crypt_r,
|
||||
// and some old-fashioned random salt generation.
|
||||
|
||||
static constexpr size_t rand_bytes = 16;
|
||||
static thread_local crypt_data tlcrypt = { 0, };
|
||||
|
||||
static sstring hashpw(const sstring& pass, const sstring& salt) {
|
||||
auto res = crypt_r(pass.c_str(), salt.c_str(), &tlcrypt);
|
||||
if (res == nullptr) {
|
||||
throw std::system_error(errno, std::system_category());
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
static bool checkpw(const sstring& pass, const sstring& salted_hash) {
|
||||
auto tmp = hashpw(pass, salted_hash);
|
||||
return tmp == salted_hash;
|
||||
}
|
||||
|
||||
static sstring gensalt() {
|
||||
static sstring prefix;
|
||||
|
||||
std::random_device rd;
|
||||
std::default_random_engine e1(rd());
|
||||
std::uniform_int_distribution<char> dist;
|
||||
|
||||
sstring valid_salt = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789./";
|
||||
sstring input(rand_bytes, 0);
|
||||
|
||||
for (char&c : input) {
|
||||
c = valid_salt[dist(e1) % valid_salt.size()];
|
||||
}
|
||||
|
||||
sstring salt;
|
||||
|
||||
if (!prefix.empty()) {
|
||||
return prefix + input;
|
||||
}
|
||||
|
||||
// Try in order:
|
||||
// blowfish 2011 fix, blowfish, sha512, sha256, md5
|
||||
for (sstring pfx : { "$2y$", "$2a$", "$6$", "$5$", "$1$" }) {
|
||||
salt = pfx + input;
|
||||
const char* e = crypt_r("fisk", salt.c_str(), &tlcrypt);
|
||||
|
||||
if (e && (e[0] != '*')) {
|
||||
prefix = pfx;
|
||||
return salt;
|
||||
}
|
||||
}
|
||||
throw std::runtime_error("Could not initialize hashing algorithm");
|
||||
}
|
||||
|
||||
static sstring hashpw(const sstring& pass) {
|
||||
return hashpw(pass, gensalt());
|
||||
}
|
||||
|
||||
static bool has_salted_hash(const cql3::untyped_result_set_row& row) {
|
||||
return !row.get_or<sstring>(SALTED_HASH, "").empty();
|
||||
}
|
||||
@@ -138,7 +212,7 @@ future<> password_authenticator::create_default_if_missing() const {
|
||||
update_row_query,
|
||||
db::consistency_level::QUORUM,
|
||||
internal_distributed_timeout_config(),
|
||||
{passwords::hash(DEFAULT_USER_PASSWORD, rng_for_salt), DEFAULT_USER_NAME}).then([](auto&&) {
|
||||
{hashpw(DEFAULT_USER_PASSWORD), DEFAULT_USER_NAME}).then([](auto&&) {
|
||||
plogger.info("Created default superuser authentication record.");
|
||||
});
|
||||
}
|
||||
@@ -149,6 +223,8 @@ future<> password_authenticator::create_default_if_missing() const {
|
||||
|
||||
future<> password_authenticator::start() {
|
||||
return once_among_shards([this] {
|
||||
gensalt(); // do this once to determine usable hashing
|
||||
|
||||
auto f = create_metadata_table_if_missing(
|
||||
meta::roles_table::name,
|
||||
_qp,
|
||||
@@ -157,7 +233,7 @@ future<> password_authenticator::start() {
|
||||
|
||||
_stopped = do_after_system_ready(_as, [this] {
|
||||
return async([this] {
|
||||
wait_for_schema_agreement(_migration_manager, _qp.db().local(), _as).get0();
|
||||
wait_for_schema_agreement(_migration_manager, _qp.db().local()).get0();
|
||||
|
||||
if (any_nondefault_role_row_satisfies(_qp, &has_salted_hash).get0()) {
|
||||
if (legacy_metadata_exists()) {
|
||||
@@ -182,7 +258,7 @@ future<> password_authenticator::start() {
|
||||
|
||||
future<> password_authenticator::stop() {
|
||||
_as.request_abort();
|
||||
return _stopped.handle_exception_type([] (const sleep_aborted&) { }).handle_exception_type([](const abort_requested_exception&) {});
|
||||
return _stopped.handle_exception_type([] (const sleep_aborted&) { });
|
||||
}
|
||||
|
||||
db::consistency_level password_authenticator::consistency_for_user(stdx::string_view role_name) {
|
||||
@@ -245,7 +321,7 @@ future<authenticated_user> password_authenticator::authenticate(
|
||||
if (!res->empty()) {
|
||||
salted_hash = res->one().get_opt<sstring>(SALTED_HASH);
|
||||
}
|
||||
if (!salted_hash || !passwords::check(password, *salted_hash)) {
|
||||
if (!salted_hash || !checkpw(password, *salted_hash)) {
|
||||
throw exceptions::authentication_exception("Username and/or password are incorrect");
|
||||
}
|
||||
return make_ready_future<authenticated_user>(username);
|
||||
@@ -268,7 +344,7 @@ future<> password_authenticator::create(stdx::string_view role_name, const authe
|
||||
update_row_query,
|
||||
consistency_for_user(role_name),
|
||||
internal_distributed_timeout_config(),
|
||||
{passwords::hash(*options.password, rng_for_salt), sstring(role_name)}).discard_result();
|
||||
{hashpw(*options.password), sstring(role_name)}).discard_result();
|
||||
}
|
||||
|
||||
future<> password_authenticator::alter(stdx::string_view role_name, const authentication_options& options) const {
|
||||
@@ -286,7 +362,7 @@ future<> password_authenticator::alter(stdx::string_view role_name, const authen
|
||||
query,
|
||||
consistency_for_user(role_name),
|
||||
internal_distributed_timeout_config(),
|
||||
{passwords::hash(*options.password, rng_for_salt), sstring(role_name)}).discard_result();
|
||||
{hashpw(*options.password), sstring(role_name)}).discard_result();
|
||||
}
|
||||
|
||||
future<> password_authenticator::drop(stdx::string_view name) const {
|
||||
|
||||
@@ -1,84 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2018 ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* This file is part of Scylla.
|
||||
*
|
||||
* Scylla is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* Scylla is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "auth/passwords.hh"
|
||||
|
||||
#include <cerrno>
|
||||
#include <optional>
|
||||
|
||||
extern "C" {
|
||||
#include <crypt.h>
|
||||
#include <unistd.h>
|
||||
}
|
||||
|
||||
namespace auth::passwords {
|
||||
|
||||
static thread_local crypt_data tlcrypt = { 0, };
|
||||
|
||||
namespace detail {
|
||||
|
||||
scheme identify_best_supported_scheme() {
|
||||
const auto all_schemes = { scheme::bcrypt_y, scheme::bcrypt_a, scheme::sha_512, scheme::sha_256, scheme::md5 };
|
||||
// "Random", for testing schemes.
|
||||
const sstring random_part_of_salt = "aaaabbbbccccdddd";
|
||||
|
||||
for (scheme c : all_schemes) {
|
||||
const sstring salt = sstring(prefix_for_scheme(c)) + random_part_of_salt;
|
||||
const char* e = crypt_r("fisk", salt.c_str(), &tlcrypt);
|
||||
|
||||
if (e && (e[0] != '*')) {
|
||||
return c;
|
||||
}
|
||||
}
|
||||
|
||||
throw no_supported_schemes();
|
||||
}
|
||||
|
||||
sstring hash_with_salt(const sstring& pass, const sstring& salt) {
|
||||
auto res = crypt_r(pass.c_str(), salt.c_str(), &tlcrypt);
|
||||
if (!res || (res[0] == '*')) {
|
||||
throw std::system_error(errno, std::system_category());
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
const char* prefix_for_scheme(scheme c) noexcept {
|
||||
switch (c) {
|
||||
case scheme::bcrypt_y: return "$2y$";
|
||||
case scheme::bcrypt_a: return "$2a$";
|
||||
case scheme::sha_512: return "$6$";
|
||||
case scheme::sha_256: return "$5$";
|
||||
case scheme::md5: return "$1$";
|
||||
default: return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
no_supported_schemes::no_supported_schemes()
|
||||
: std::runtime_error("No allowed hashing schemes are supported on this system") {
|
||||
}
|
||||
|
||||
bool check(const sstring& pass, const sstring& salted_hash) {
|
||||
return detail::hash_with_salt(pass, salted_hash) == salted_hash;
|
||||
}
|
||||
|
||||
} // namespace auth::paswords
|
||||
@@ -1,125 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2018 ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* This file is part of Scylla.
|
||||
*
|
||||
* Scylla is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* Scylla is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <random>
|
||||
#include <stdexcept>
|
||||
|
||||
#include <seastar/core/sstring.hh>
|
||||
|
||||
#include "seastarx.hh"
|
||||
|
||||
namespace auth::passwords {
|
||||
|
||||
class no_supported_schemes : public std::runtime_error {
|
||||
public:
|
||||
no_supported_schemes();
|
||||
};
|
||||
|
||||
///
|
||||
/// Apache Cassandra uses a library to provide the bcrypt scheme. Many Linux implementations do not support bcrypt, so
|
||||
/// we support alternatives. The cost is loss of direct compatibility with Apache Cassandra system tables.
|
||||
///
|
||||
enum class scheme {
|
||||
bcrypt_y,
|
||||
bcrypt_a,
|
||||
sha_512,
|
||||
sha_256,
|
||||
md5
|
||||
};
|
||||
|
||||
namespace detail {
|
||||
|
||||
template <typename RandomNumberEngine>
|
||||
sstring generate_random_salt_bytes(RandomNumberEngine& g) {
|
||||
static const sstring valid_bytes = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789./";
|
||||
static constexpr std::size_t num_bytes = 16;
|
||||
std::uniform_int_distribution<std::size_t> dist(0, valid_bytes.size() - 1);
|
||||
sstring result(num_bytes, 0);
|
||||
|
||||
for (char& c : result) {
|
||||
c = valid_bytes[dist(g)];
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
///
|
||||
/// Test each allowed hashing scheme and report the best supported one on the current system.
|
||||
///
|
||||
/// \throws \ref no_supported_schemes when none of the known schemes is supported.
|
||||
///
|
||||
scheme identify_best_supported_scheme();
|
||||
|
||||
const char* prefix_for_scheme(scheme) noexcept;
|
||||
|
||||
///
|
||||
/// Generate a implementation-specific salt string for hashing passwords.
|
||||
///
|
||||
/// The `RandomNumberEngine` is used to generate the string, which is an implementation-specific length.
|
||||
///
|
||||
/// \throws \ref no_supported_schemes when no known hashing schemes are supported on the system.
|
||||
///
|
||||
template <typename RandomNumberEngine>
|
||||
sstring generate_salt(RandomNumberEngine& g) {
|
||||
static const scheme scheme = identify_best_supported_scheme();
|
||||
static const sstring prefix = sstring(prefix_for_scheme(scheme));
|
||||
return prefix + generate_random_salt_bytes(g);
|
||||
}
|
||||
|
||||
///
|
||||
/// Hash a password combined with an implementation-specific salt string.
|
||||
///
|
||||
/// \throws \ref std::system_error when an unexpected implementation-specific error occurs.
|
||||
///
|
||||
sstring hash_with_salt(const sstring& pass, const sstring& salt);
|
||||
|
||||
} // namespace detail
|
||||
|
||||
///
|
||||
/// Run a one-way hashing function on cleartext to produce encrypted text.
|
||||
///
|
||||
/// Prior to applying the hashing function, random salt is amended to the cleartext. The random salt bytes are generated
|
||||
/// according to the random number engine `g`.
|
||||
///
|
||||
/// The result is the encrypted cyphertext, and also the salt used but in a implementation-specific format.
|
||||
///
|
||||
/// \throws \ref std::system_error when the implementation-specific implementation fails to hash the cleartext.
|
||||
///
|
||||
template <typename RandomNumberEngine>
|
||||
sstring hash(const sstring& pass, RandomNumberEngine& g) {
|
||||
return detail::hash_with_salt(pass, detail::generate_salt(g));
|
||||
}
|
||||
|
||||
///
|
||||
/// Check that cleartext matches previously hashed cleartext with salt.
|
||||
///
|
||||
/// \ref salted_hash is the result of invoking \ref hash, which is the implementation-specific combination of the hashed
|
||||
/// password and the salt that was generated for it.
|
||||
///
|
||||
/// \returns `true` if the cleartext matches the salted hash.
|
||||
///
|
||||
/// \throws \ref std::system_error when an unexpected implementation-specific error occurs.
|
||||
///
|
||||
bool check(const sstring& pass, const sstring& salted_hash);
|
||||
|
||||
} // namespace auth::passwords
|
||||
@@ -184,9 +184,7 @@ future<> service::start() {
|
||||
return once_among_shards([this] {
|
||||
return create_keyspace_if_missing();
|
||||
}).then([this] {
|
||||
return _role_manager->start().then([this] {
|
||||
return when_all_succeed(_authorizer->start(), _authenticator->start());
|
||||
});
|
||||
return when_all_succeed(_role_manager->start(), _authorizer->start(), _authenticator->start());
|
||||
}).then([this] {
|
||||
_permissions_cache = std::make_unique<permissions_cache>(_permissions_cache_config, *this, log);
|
||||
}).then([this] {
|
||||
|
||||
@@ -227,7 +227,7 @@ future<> standard_role_manager::start() {
|
||||
return this->create_metadata_tables_if_missing().then([this] {
|
||||
_stopped = auth::do_after_system_ready(_as, [this] {
|
||||
return seastar::async([this] {
|
||||
wait_for_schema_agreement(_migration_manager, _qp.db().local(), _as).get0();
|
||||
wait_for_schema_agreement(_migration_manager, _qp.db().local()).get0();
|
||||
|
||||
if (any_nondefault_role_row_satisfies(_qp, &has_can_login).get0()) {
|
||||
if (this->legacy_metadata_exists()) {
|
||||
@@ -251,7 +251,7 @@ future<> standard_role_manager::start() {
|
||||
|
||||
future<> standard_role_manager::stop() {
|
||||
_as.request_abort();
|
||||
return _stopped.handle_exception_type([] (const sleep_aborted&) { }).handle_exception_type([](const abort_requested_exception&) {});;
|
||||
return _stopped.handle_exception_type([] (const sleep_aborted&) { });
|
||||
}
|
||||
|
||||
future<> standard_role_manager::create_or_replace(stdx::string_view role_name, const role_config& c) const {
|
||||
|
||||
@@ -77,7 +77,7 @@ protected:
|
||||
, _io_priority(iop)
|
||||
, _interval(interval)
|
||||
, _update_timer([this] { adjust(); })
|
||||
, _control_points()
|
||||
, _control_points({{0,0}})
|
||||
, _current_backlog(std::move(backlog))
|
||||
, _inflight_update(make_ready_future<>())
|
||||
{
|
||||
@@ -125,7 +125,7 @@ public:
|
||||
flush_controller(seastar::scheduling_group sg, const ::io_priority_class& iop, float static_shares) : backlog_controller(sg, iop, static_shares) {}
|
||||
flush_controller(seastar::scheduling_group sg, const ::io_priority_class& iop, std::chrono::milliseconds interval, float soft_limit, std::function<float()> current_dirty)
|
||||
: backlog_controller(sg, iop, std::move(interval),
|
||||
std::vector<backlog_controller::control_point>({{0.0, 0.0}, {soft_limit, 10}, {soft_limit + (hard_dirty_limit - soft_limit) / 2, 200} , {hard_dirty_limit, 1000}}),
|
||||
std::vector<backlog_controller::control_point>({{soft_limit, 10}, {soft_limit + (hard_dirty_limit - soft_limit) / 2, 200} , {hard_dirty_limit, 1000}}),
|
||||
std::move(current_dirty)
|
||||
)
|
||||
{}
|
||||
@@ -139,7 +139,7 @@ public:
|
||||
compaction_controller(seastar::scheduling_group sg, const ::io_priority_class& iop, float static_shares) : backlog_controller(sg, iop, static_shares) {}
|
||||
compaction_controller(seastar::scheduling_group sg, const ::io_priority_class& iop, std::chrono::milliseconds interval, std::function<float()> current_backlog)
|
||||
: backlog_controller(sg, iop, std::move(interval),
|
||||
std::vector<backlog_controller::control_point>({{0.0, 50}, {1.5, 100} , {normalization_factor, 1000}}),
|
||||
std::vector<backlog_controller::control_point>({{0.5, 10}, {1.5, 100} , {normalization_factor, 1000}}),
|
||||
std::move(current_backlog)
|
||||
)
|
||||
{}
|
||||
|
||||
4
bytes.hh
4
bytes.hh
@@ -35,10 +35,6 @@ using bytes_mutable_view = basic_mutable_view<bytes_view::value_type>;
|
||||
using bytes_opt = std::experimental::optional<bytes>;
|
||||
using sstring_view = std::experimental::string_view;
|
||||
|
||||
inline sstring_view to_sstring_view(bytes_view view) {
|
||||
return {reinterpret_cast<const char*>(view.data()), view.size()};
|
||||
}
|
||||
|
||||
namespace std {
|
||||
|
||||
template <>
|
||||
|
||||
@@ -38,7 +38,7 @@ class bytes_ostream {
|
||||
public:
|
||||
using size_type = bytes::size_type;
|
||||
using value_type = bytes::value_type;
|
||||
static constexpr size_type max_chunk_size() { return 128 * 1024; }
|
||||
static constexpr size_type max_chunk_size() { return 16 * 1024; }
|
||||
private:
|
||||
static_assert(sizeof(value_type) == 1, "value_type is assumed to be one byte long");
|
||||
struct chunk {
|
||||
@@ -57,12 +57,12 @@ private:
|
||||
value_type data[0];
|
||||
void operator delete(void* ptr) { free(ptr); }
|
||||
};
|
||||
static constexpr size_type default_chunk_size{512};
|
||||
// FIXME: consider increasing chunk size as the buffer grows
|
||||
static constexpr size_type chunk_size{512};
|
||||
private:
|
||||
std::unique_ptr<chunk> _begin;
|
||||
chunk* _current;
|
||||
size_type _size;
|
||||
size_type _initial_chunk_size = default_chunk_size;
|
||||
public:
|
||||
class fragment_iterator : public std::iterator<std::input_iterator_tag, bytes_view> {
|
||||
chunk* _current = nullptr;
|
||||
@@ -102,13 +102,13 @@ private:
|
||||
}
|
||||
// Figure out next chunk size.
|
||||
// - must be enough for data_size
|
||||
// - must be at least _initial_chunk_size
|
||||
// - must be at least chunk_size
|
||||
// - try to double each time to prevent too many allocations
|
||||
// - do not exceed max_chunk_size
|
||||
size_type next_alloc_size(size_t data_size) const {
|
||||
auto next_size = _current
|
||||
? _current->size * 2
|
||||
: _initial_chunk_size;
|
||||
: chunk_size;
|
||||
next_size = std::min(next_size, max_chunk_size());
|
||||
// FIXME: check for overflow?
|
||||
return std::max<size_type>(next_size, data_size + sizeof(chunk));
|
||||
@@ -116,19 +116,13 @@ private:
|
||||
// Makes room for a contiguous region of given size.
|
||||
// The region is accounted for as already written.
|
||||
// size must not be zero.
|
||||
[[gnu::always_inline]]
|
||||
value_type* alloc(size_type size) {
|
||||
if (__builtin_expect(size <= current_space_left(), true)) {
|
||||
if (size <= current_space_left()) {
|
||||
auto ret = _current->data + _current->offset;
|
||||
_current->offset += size;
|
||||
_size += size;
|
||||
return ret;
|
||||
} else {
|
||||
return alloc_new(size);
|
||||
}
|
||||
}
|
||||
[[gnu::noinline]]
|
||||
value_type* alloc_new(size_type size) {
|
||||
auto alloc_size = next_alloc_size(size);
|
||||
auto space = malloc(alloc_size);
|
||||
if (!space) {
|
||||
@@ -146,22 +140,19 @@ private:
|
||||
}
|
||||
_size += size;
|
||||
return _current->data;
|
||||
};
|
||||
}
|
||||
public:
|
||||
explicit bytes_ostream(size_t initial_chunk_size) noexcept
|
||||
bytes_ostream() noexcept
|
||||
: _begin()
|
||||
, _current(nullptr)
|
||||
, _size(0)
|
||||
, _initial_chunk_size(initial_chunk_size)
|
||||
{ }
|
||||
|
||||
bytes_ostream() noexcept : bytes_ostream(default_chunk_size) {}
|
||||
|
||||
bytes_ostream(bytes_ostream&& o) noexcept
|
||||
: _begin(std::move(o._begin))
|
||||
, _current(o._current)
|
||||
, _size(o._size)
|
||||
, _initial_chunk_size(o._initial_chunk_size)
|
||||
{
|
||||
o._current = nullptr;
|
||||
o._size = 0;
|
||||
@@ -171,7 +162,6 @@ public:
|
||||
: _begin()
|
||||
, _current(nullptr)
|
||||
, _size(0)
|
||||
, _initial_chunk_size(o._initial_chunk_size)
|
||||
{
|
||||
append(o);
|
||||
}
|
||||
@@ -209,20 +199,18 @@ public:
|
||||
return place_holder<T>{alloc(sizeof(T))};
|
||||
}
|
||||
|
||||
[[gnu::always_inline]]
|
||||
value_type* write_place_holder(size_type size) {
|
||||
return alloc(size);
|
||||
}
|
||||
|
||||
// Writes given sequence of bytes
|
||||
[[gnu::always_inline]]
|
||||
inline void write(bytes_view v) {
|
||||
if (v.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto this_size = std::min(v.size(), size_t(current_space_left()));
|
||||
if (__builtin_expect(this_size, true)) {
|
||||
if (this_size) {
|
||||
memcpy(_current->data + _current->offset, v.begin(), this_size);
|
||||
_current->offset += this_size;
|
||||
_size += this_size;
|
||||
@@ -231,12 +219,11 @@ public:
|
||||
|
||||
while (!v.empty()) {
|
||||
auto this_size = std::min(v.size(), size_t(max_chunk_size()));
|
||||
std::copy_n(v.begin(), this_size, alloc_new(this_size));
|
||||
std::copy_n(v.begin(), this_size, alloc(this_size));
|
||||
v.remove_prefix(this_size);
|
||||
}
|
||||
}
|
||||
|
||||
[[gnu::always_inline]]
|
||||
void write(const char* ptr, size_t size) {
|
||||
write(bytes_view(reinterpret_cast<const signed char*>(ptr), size));
|
||||
}
|
||||
@@ -406,21 +393,6 @@ public:
|
||||
bool operator!=(const bytes_ostream& other) const {
|
||||
return !(*this == other);
|
||||
}
|
||||
|
||||
// Makes this instance empty.
|
||||
//
|
||||
// The first buffer is not deallocated, so callers may rely on the
|
||||
// fact that if they write less than the initial chunk size between
|
||||
// the clear() calls then writes will not involve any memory allocations,
|
||||
// except for the first write made on this instance.
|
||||
void clear() {
|
||||
if (_begin) {
|
||||
_begin->offset = 0;
|
||||
_size = 0;
|
||||
_current = _begin.get();
|
||||
_begin->next.reset();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<>
|
||||
|
||||
@@ -61,12 +61,11 @@ class cache_flat_mutation_reader final : public flat_mutation_reader::impl {
|
||||
// - _last_row points at a direct predecessor of the next row which is going to be read.
|
||||
// Used for populating continuity.
|
||||
// - _population_range_starts_before_all_rows is set accordingly
|
||||
// - _underlying is engaged and fast-forwarded
|
||||
reading_from_underlying,
|
||||
|
||||
end_of_stream
|
||||
};
|
||||
partition_snapshot_ptr _snp;
|
||||
lw_shared_ptr<partition_snapshot> _snp;
|
||||
position_in_partition::tri_compare _position_cmp;
|
||||
|
||||
query::clustering_key_filter_ranges _ck_ranges;
|
||||
@@ -100,13 +99,7 @@ class cache_flat_mutation_reader final : public flat_mutation_reader::impl {
|
||||
// forward progress is not guaranteed in case iterators are getting constantly invalidated.
|
||||
bool _lower_bound_changed = false;
|
||||
|
||||
// Points to the underlying reader conforming to _schema,
|
||||
// either to *_underlying_holder or _read_context->underlying().underlying().
|
||||
flat_mutation_reader* _underlying = nullptr;
|
||||
std::optional<flat_mutation_reader> _underlying_holder;
|
||||
|
||||
future<> do_fill_buffer(db::timeout_clock::time_point);
|
||||
future<> ensure_underlying(db::timeout_clock::time_point);
|
||||
void copy_from_cache_to_buffer();
|
||||
future<> process_static_row(db::timeout_clock::time_point);
|
||||
void move_to_end();
|
||||
@@ -144,7 +137,7 @@ public:
|
||||
dht::decorated_key dk,
|
||||
query::clustering_key_filter_ranges&& crr,
|
||||
lw_shared_ptr<read_context> ctx,
|
||||
partition_snapshot_ptr snp,
|
||||
lw_shared_ptr<partition_snapshot> snp,
|
||||
row_cache& cache)
|
||||
: flat_mutation_reader::impl(std::move(s))
|
||||
, _snp(std::move(snp))
|
||||
@@ -164,6 +157,9 @@ public:
|
||||
cache_flat_mutation_reader(const cache_flat_mutation_reader&) = delete;
|
||||
cache_flat_mutation_reader(cache_flat_mutation_reader&&) = delete;
|
||||
virtual future<> fill_buffer(db::timeout_clock::time_point timeout) override;
|
||||
virtual ~cache_flat_mutation_reader() {
|
||||
maybe_merge_versions(_snp, _lsa_manager.region(), _lsa_manager.read_section());
|
||||
}
|
||||
virtual void next_partition() override {
|
||||
clear_buffer_to_next_partition();
|
||||
if (is_buffer_empty()) {
|
||||
@@ -193,22 +189,23 @@ future<> cache_flat_mutation_reader::process_static_row(db::timeout_clock::time_
|
||||
return make_ready_future<>();
|
||||
} else {
|
||||
_read_context->cache().on_row_miss();
|
||||
return ensure_underlying(timeout).then([this, timeout] {
|
||||
return (*_underlying)(timeout).then([this] (mutation_fragment_opt&& sr) {
|
||||
if (sr) {
|
||||
assert(sr->is_static_row());
|
||||
maybe_add_to_cache(sr->as_static_row());
|
||||
push_mutation_fragment(std::move(*sr));
|
||||
}
|
||||
maybe_set_static_row_continuous();
|
||||
});
|
||||
return _read_context->get_next_fragment(timeout).then([this] (mutation_fragment_opt&& sr) {
|
||||
if (sr) {
|
||||
assert(sr->is_static_row());
|
||||
maybe_add_to_cache(sr->as_static_row());
|
||||
push_mutation_fragment(std::move(*sr));
|
||||
}
|
||||
maybe_set_static_row_continuous();
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
inline
|
||||
void cache_flat_mutation_reader::touch_partition() {
|
||||
_snp->touch();
|
||||
if (_snp->at_latest_version()) {
|
||||
rows_entry& last_dummy = *_snp->version()->partition().clustered_rows().rbegin();
|
||||
_snp->tracker()->touch(last_dummy);
|
||||
}
|
||||
}
|
||||
|
||||
inline
|
||||
@@ -238,36 +235,14 @@ future<> cache_flat_mutation_reader::fill_buffer(db::timeout_clock::time_point t
|
||||
});
|
||||
}
|
||||
|
||||
inline
|
||||
future<> cache_flat_mutation_reader::ensure_underlying(db::timeout_clock::time_point timeout) {
|
||||
if (_underlying) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
return _read_context->ensure_underlying(timeout).then([this, timeout] {
|
||||
flat_mutation_reader& ctx_underlying = _read_context->underlying().underlying();
|
||||
if (ctx_underlying.schema() != _schema) {
|
||||
_underlying_holder = make_delegating_reader(ctx_underlying);
|
||||
_underlying_holder->upgrade_schema(_schema);
|
||||
_underlying = &*_underlying_holder;
|
||||
} else {
|
||||
_underlying = &ctx_underlying;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
inline
|
||||
future<> cache_flat_mutation_reader::do_fill_buffer(db::timeout_clock::time_point timeout) {
|
||||
if (_state == state::move_to_underlying) {
|
||||
if (!_underlying) {
|
||||
return ensure_underlying(timeout).then([this, timeout] {
|
||||
return do_fill_buffer(timeout);
|
||||
});
|
||||
}
|
||||
_state = state::reading_from_underlying;
|
||||
_population_range_starts_before_all_rows = _lower_bound.is_before_all_clustered_rows(*_schema);
|
||||
auto end = _next_row_in_range ? position_in_partition(_next_row.position())
|
||||
: position_in_partition(_upper_bound);
|
||||
return _underlying->fast_forward_to(position_range{_lower_bound, std::move(end)}, timeout).then([this, timeout] {
|
||||
return _read_context->fast_forward_to(position_range{_lower_bound, std::move(end)}, timeout).then([this, timeout] {
|
||||
return read_from_underlying(timeout);
|
||||
});
|
||||
}
|
||||
@@ -308,7 +283,7 @@ future<> cache_flat_mutation_reader::do_fill_buffer(db::timeout_clock::time_poin
|
||||
|
||||
inline
|
||||
future<> cache_flat_mutation_reader::read_from_underlying(db::timeout_clock::time_point timeout) {
|
||||
return consume_mutation_fragments_until(*_underlying,
|
||||
return consume_mutation_fragments_until(_read_context->underlying().underlying(),
|
||||
[this] { return _state != state::reading_from_underlying || is_buffer_full(); },
|
||||
[this] (mutation_fragment mf) {
|
||||
_read_context->cache().on_row_miss();
|
||||
@@ -703,7 +678,7 @@ inline flat_mutation_reader make_cache_flat_mutation_reader(schema_ptr s,
|
||||
query::clustering_key_filter_ranges crr,
|
||||
row_cache& cache,
|
||||
lw_shared_ptr<cache::read_context> ctx,
|
||||
partition_snapshot_ptr snp)
|
||||
lw_shared_ptr<partition_snapshot> snp)
|
||||
{
|
||||
return make_flat_mutation_reader<cache::cache_flat_mutation_reader>(
|
||||
std::move(s), std::move(dk), std::move(crr), std::move(ctx), std::move(snp), cache);
|
||||
|
||||
@@ -24,9 +24,9 @@
|
||||
#include <boost/intrusive/unordered_set.hpp>
|
||||
|
||||
#include "utils/small_vector.hh"
|
||||
#include "fnv1a_hasher.hh"
|
||||
#include "mutation_fragment.hh"
|
||||
#include "mutation_partition.hh"
|
||||
#include "xx_hasher.hh"
|
||||
|
||||
#include "db/timeout_clock.hh"
|
||||
|
||||
@@ -194,10 +194,10 @@ private:
|
||||
explicit hasher(const schema& s) : _schema(&s) { }
|
||||
|
||||
size_t operator()(const cell_address& ca) const {
|
||||
xx_hasher hasher;
|
||||
fnv1a_hasher hasher;
|
||||
ca.position.feed_hash(hasher, *_schema);
|
||||
::feed_hash(hasher, ca.id);
|
||||
return static_cast<size_t>(hasher.finalize_uint64());
|
||||
return hasher.finalize();
|
||||
}
|
||||
size_t operator()(const cell_entry& ce) const {
|
||||
return operator()(ce._address);
|
||||
|
||||
@@ -22,7 +22,6 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <functional>
|
||||
#include "keys.hh"
|
||||
#include "schema.hh"
|
||||
#include "range.hh"
|
||||
@@ -44,20 +43,22 @@ bound_kind invert_kind(bound_kind k);
|
||||
int32_t weight(bound_kind k);
|
||||
|
||||
class bound_view {
|
||||
const static thread_local clustering_key _empty_prefix;
|
||||
std::reference_wrapper<const clustering_key_prefix> _prefix;
|
||||
bound_kind _kind;
|
||||
public:
|
||||
const static thread_local clustering_key empty_prefix;
|
||||
const clustering_key_prefix& prefix;
|
||||
bound_kind kind;
|
||||
bound_view(const clustering_key_prefix& prefix, bound_kind kind)
|
||||
: _prefix(prefix)
|
||||
, _kind(kind)
|
||||
: prefix(prefix)
|
||||
, kind(kind)
|
||||
{ }
|
||||
bound_view(const bound_view& other) noexcept = default;
|
||||
bound_view& operator=(const bound_view& other) noexcept = default;
|
||||
|
||||
bound_kind kind() const { return _kind; }
|
||||
const clustering_key_prefix& prefix() const { return _prefix; }
|
||||
|
||||
bound_view& operator=(const bound_view& other) noexcept {
|
||||
if (this != &other) {
|
||||
this->~bound_view();
|
||||
new (this) bound_view(other);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
struct tri_compare {
|
||||
// To make it assignable and to avoid taking a schema_ptr, we
|
||||
// wrap the schema reference.
|
||||
@@ -81,13 +82,13 @@ public:
|
||||
return d1 < d2 ? w1 - (w1 <= 0) : -(w2 - (w2 <= 0));
|
||||
}
|
||||
int operator()(const bound_view b, const clustering_key_prefix& p) const {
|
||||
return operator()(b._prefix, weight(b._kind), p, 0);
|
||||
return operator()(b.prefix, weight(b.kind), p, 0);
|
||||
}
|
||||
int operator()(const clustering_key_prefix& p, const bound_view b) const {
|
||||
return operator()(p, 0, b._prefix, weight(b._kind));
|
||||
return operator()(p, 0, b.prefix, weight(b.kind));
|
||||
}
|
||||
int operator()(const bound_view b1, const bound_view b2) const {
|
||||
return operator()(b1._prefix, weight(b1._kind), b2._prefix, weight(b2._kind));
|
||||
return operator()(b1.prefix, weight(b1.kind), b2.prefix, weight(b2.kind));
|
||||
}
|
||||
};
|
||||
struct compare {
|
||||
@@ -100,26 +101,26 @@ public:
|
||||
return _cmp(p1, w1, p2, w2) < 0;
|
||||
}
|
||||
bool operator()(const bound_view b, const clustering_key_prefix& p) const {
|
||||
return operator()(b._prefix, weight(b._kind), p, 0);
|
||||
return operator()(b.prefix, weight(b.kind), p, 0);
|
||||
}
|
||||
bool operator()(const clustering_key_prefix& p, const bound_view b) const {
|
||||
return operator()(p, 0, b._prefix, weight(b._kind));
|
||||
return operator()(p, 0, b.prefix, weight(b.kind));
|
||||
}
|
||||
bool operator()(const bound_view b1, const bound_view b2) const {
|
||||
return operator()(b1._prefix, weight(b1._kind), b2._prefix, weight(b2._kind));
|
||||
return operator()(b1.prefix, weight(b1.kind), b2.prefix, weight(b2.kind));
|
||||
}
|
||||
};
|
||||
bool equal(const schema& s, const bound_view other) const {
|
||||
return _kind == other._kind && _prefix.get().equal(s, other._prefix.get());
|
||||
return kind == other.kind && prefix.equal(s, other.prefix);
|
||||
}
|
||||
bool adjacent(const schema& s, const bound_view other) const {
|
||||
return invert_kind(other._kind) == _kind && _prefix.get().equal(s, other._prefix.get());
|
||||
return invert_kind(other.kind) == kind && prefix.equal(s, other.prefix);
|
||||
}
|
||||
static bound_view bottom() {
|
||||
return {_empty_prefix, bound_kind::incl_start};
|
||||
return {empty_prefix, bound_kind::incl_start};
|
||||
}
|
||||
static bound_view top() {
|
||||
return {_empty_prefix, bound_kind::incl_end};
|
||||
return {empty_prefix, bound_kind::incl_end};
|
||||
}
|
||||
template<template<typename> typename R>
|
||||
GCC6_CONCEPT( requires Range<R, clustering_key_prefix_view> )
|
||||
@@ -143,13 +144,13 @@ public:
|
||||
template<template<typename> typename R>
|
||||
GCC6_CONCEPT( requires Range<R, clustering_key_prefix_view> )
|
||||
static stdx::optional<typename R<clustering_key_prefix_view>::bound> to_range_bound(const bound_view& bv) {
|
||||
if (&bv._prefix.get() == &_empty_prefix) {
|
||||
if (&bv.prefix == &empty_prefix) {
|
||||
return {};
|
||||
}
|
||||
bool inclusive = bv._kind != bound_kind::excl_end && bv._kind != bound_kind::excl_start;
|
||||
return {typename R<clustering_key_prefix_view>::bound(bv._prefix.get().view(), inclusive)};
|
||||
bool inclusive = bv.kind != bound_kind::excl_end && bv.kind != bound_kind::excl_start;
|
||||
return {typename R<clustering_key_prefix_view>::bound(bv.prefix.view(), inclusive)};
|
||||
}
|
||||
friend std::ostream& operator<<(std::ostream& out, const bound_view& b) {
|
||||
return out << "{bound: prefix=" << b._prefix.get() << ", kind=" << b._kind << "}";
|
||||
return out << "{bound: prefix=" << b.prefix << ", kind=" << b.kind << "}";
|
||||
}
|
||||
};
|
||||
|
||||
@@ -30,7 +30,7 @@ namespace query {
|
||||
|
||||
class clustering_key_filter_ranges {
|
||||
clustering_row_ranges _storage;
|
||||
std::reference_wrapper<const clustering_row_ranges> _ref;
|
||||
const clustering_row_ranges& _ref;
|
||||
public:
|
||||
clustering_key_filter_ranges(const clustering_row_ranges& ranges) : _ref(ranges) { }
|
||||
struct reversed { };
|
||||
@@ -39,21 +39,21 @@ public:
|
||||
|
||||
clustering_key_filter_ranges(clustering_key_filter_ranges&& other) noexcept
|
||||
: _storage(std::move(other._storage))
|
||||
, _ref(&other._ref.get() == &other._storage ? _storage : other._ref.get())
|
||||
, _ref(&other._ref == &other._storage ? _storage : other._ref)
|
||||
{ }
|
||||
|
||||
clustering_key_filter_ranges& operator=(clustering_key_filter_ranges&& other) noexcept {
|
||||
if (this != &other) {
|
||||
_storage = std::move(other._storage);
|
||||
_ref = (&other._ref.get() == &other._storage) ? _storage : other._ref.get();
|
||||
this->~clustering_key_filter_ranges();
|
||||
new (this) clustering_key_filter_ranges(std::move(other));
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
auto begin() const { return _ref.get().begin(); }
|
||||
auto end() const { return _ref.get().end(); }
|
||||
bool empty() const { return _ref.get().empty(); }
|
||||
size_t size() const { return _ref.get().size(); }
|
||||
auto begin() const { return _ref.begin(); }
|
||||
auto end() const { return _ref.end(); }
|
||||
bool empty() const { return _ref.empty(); }
|
||||
size_t size() const { return _ref.size(); }
|
||||
const clustering_row_ranges& ranges() const { return _ref; }
|
||||
|
||||
static clustering_key_filter_ranges get_ranges(const schema& schema, const query::partition_slice& slice, const partition_key& key) {
|
||||
|
||||
@@ -31,61 +31,72 @@
|
||||
class clustering_ranges_walker {
|
||||
const schema& _schema;
|
||||
const query::clustering_row_ranges& _ranges;
|
||||
boost::iterator_range<query::clustering_row_ranges::const_iterator> _current_range;
|
||||
query::clustering_row_ranges::const_iterator _current;
|
||||
query::clustering_row_ranges::const_iterator _end;
|
||||
bool _in_current; // next position is known to be >= _current_start
|
||||
bool _with_static_row;
|
||||
position_in_partition_view _current_start;
|
||||
position_in_partition_view _current_end;
|
||||
std::optional<position_in_partition> _trim;
|
||||
stdx::optional<position_in_partition> _trim;
|
||||
size_t _change_counter = 1;
|
||||
private:
|
||||
bool advance_to_next_range() {
|
||||
_in_current = false;
|
||||
if (!_current_start.is_static_row()) {
|
||||
if (!_current_range) {
|
||||
if (_current == _end) {
|
||||
return false;
|
||||
}
|
||||
_current_range.advance_begin(1);
|
||||
++_current;
|
||||
}
|
||||
++_change_counter;
|
||||
if (!_current_range) {
|
||||
if (_current == _end) {
|
||||
_current_end = _current_start = position_in_partition_view::after_all_clustered_rows();
|
||||
return false;
|
||||
}
|
||||
_current_start = position_in_partition_view::for_range_start(_current_range.front());
|
||||
_current_end = position_in_partition_view::for_range_end(_current_range.front());
|
||||
_current_start = position_in_partition_view::for_range_start(*_current);
|
||||
_current_end = position_in_partition_view::for_range_end(*_current);
|
||||
return true;
|
||||
}
|
||||
|
||||
void set_current_positions() {
|
||||
if (!_with_static_row) {
|
||||
if (!_current_range) {
|
||||
public:
|
||||
clustering_ranges_walker(const schema& s, const query::clustering_row_ranges& ranges, bool with_static_row = true)
|
||||
: _schema(s)
|
||||
, _ranges(ranges)
|
||||
, _current(ranges.begin())
|
||||
, _end(ranges.end())
|
||||
, _in_current(with_static_row)
|
||||
, _with_static_row(with_static_row)
|
||||
, _current_start(position_in_partition_view::for_static_row())
|
||||
, _current_end(position_in_partition_view::before_all_clustered_rows())
|
||||
{
|
||||
if (!with_static_row) {
|
||||
if (_current == _end) {
|
||||
_current_start = position_in_partition_view::before_all_clustered_rows();
|
||||
} else {
|
||||
_current_start = position_in_partition_view::for_range_start(_current_range.front());
|
||||
_current_end = position_in_partition_view::for_range_end(_current_range.front());
|
||||
_current_start = position_in_partition_view::for_range_start(*_current);
|
||||
_current_end = position_in_partition_view::for_range_end(*_current);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
clustering_ranges_walker(const schema& s, const query::clustering_row_ranges& ranges, bool with_static_row = true)
|
||||
: _schema(s)
|
||||
, _ranges(ranges)
|
||||
, _current_range(ranges)
|
||||
, _in_current(with_static_row)
|
||||
, _with_static_row(with_static_row)
|
||||
, _current_start(position_in_partition_view::for_static_row())
|
||||
, _current_end(position_in_partition_view::before_all_clustered_rows()) {
|
||||
set_current_positions();
|
||||
clustering_ranges_walker(clustering_ranges_walker&& o) noexcept
|
||||
: _schema(o._schema)
|
||||
, _ranges(o._ranges)
|
||||
, _current(o._current)
|
||||
, _end(o._end)
|
||||
, _in_current(o._in_current)
|
||||
, _with_static_row(o._with_static_row)
|
||||
, _current_start(o._current_start)
|
||||
, _current_end(o._current_end)
|
||||
, _trim(std::move(o._trim))
|
||||
, _change_counter(o._change_counter)
|
||||
{ }
|
||||
clustering_ranges_walker& operator=(clustering_ranges_walker&& o) {
|
||||
if (this != &o) {
|
||||
this->~clustering_ranges_walker();
|
||||
new (this) clustering_ranges_walker(std::move(o));
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
clustering_ranges_walker(const clustering_ranges_walker&) = delete;
|
||||
clustering_ranges_walker(clustering_ranges_walker&&) = delete;
|
||||
|
||||
clustering_ranges_walker& operator=(const clustering_ranges_walker&) = delete;
|
||||
clustering_ranges_walker& operator=(clustering_ranges_walker&&) = delete;
|
||||
|
||||
// Excludes positions smaller than pos from the ranges.
|
||||
// pos should be monotonic.
|
||||
// No constraints between pos and positions passed to advance_to().
|
||||
@@ -162,15 +173,17 @@ public:
|
||||
return false;
|
||||
}
|
||||
|
||||
for (const auto& rng : _current_range) {
|
||||
auto range_start = position_in_partition_view::for_range_start(rng);
|
||||
auto i = _current;
|
||||
while (i != _end) {
|
||||
auto range_start = position_in_partition_view::for_range_start(*i);
|
||||
if (!less(range_start, end)) {
|
||||
return false;
|
||||
}
|
||||
auto range_end = position_in_partition_view::for_range_end(rng);
|
||||
auto range_end = position_in_partition_view::for_range_end(*i);
|
||||
if (less(start, range_end)) {
|
||||
return true;
|
||||
}
|
||||
++i;
|
||||
}
|
||||
|
||||
return false;
|
||||
@@ -178,20 +191,18 @@ public:
|
||||
|
||||
// Returns true if advanced past all contained positions. Any later advance_to() until reset() will return false.
|
||||
bool out_of_range() const {
|
||||
return !_in_current && !_current_range;
|
||||
return !_in_current && _current == _end;
|
||||
}
|
||||
|
||||
// Resets the state of the walker so that advance_to() can be now called for new sequence of positions.
|
||||
// Any range trimmings still hold after this.
|
||||
void reset() {
|
||||
_current_range = _ranges;
|
||||
_in_current = _with_static_row;
|
||||
_current_start = position_in_partition_view::for_static_row();
|
||||
_current_end = position_in_partition_view::before_all_clustered_rows();
|
||||
set_current_positions();
|
||||
++_change_counter;
|
||||
if (_trim) {
|
||||
trim_front(*std::exchange(_trim, {}));
|
||||
auto trim = std::move(_trim);
|
||||
auto ctr = _change_counter;
|
||||
*this = clustering_ranges_walker(_schema, _ranges, _with_static_row);
|
||||
_change_counter = ctr + 1;
|
||||
if (trim) {
|
||||
trim_front(std::move(*trim));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -200,11 +211,6 @@ public:
|
||||
return _current_start;
|
||||
}
|
||||
|
||||
// Returns the upper bound of the last range in provided ranges set
|
||||
position_in_partition_view uppermost_bound() const {
|
||||
return position_in_partition_view::for_range_end(_ranges.back());
|
||||
}
|
||||
|
||||
// When lower_bound() changes, this also does
|
||||
// Always > 0.
|
||||
size_t lower_bound_change_counter() const {
|
||||
|
||||
67
compatible_ring_position.hh
Normal file
67
compatible_ring_position.hh
Normal file
@@ -0,0 +1,67 @@
|
||||
/*
|
||||
* Copyright (C) 2016 ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* This file is part of Scylla.
|
||||
*
|
||||
* Scylla is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* Scylla is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "query-request.hh"
|
||||
#include <experimental/optional>
|
||||
|
||||
// Wraps ring_position so it is compatible with old-style C++: default constructor,
|
||||
// stateless comparators, yada yada
|
||||
class compatible_ring_position {
|
||||
const schema* _schema = nullptr;
|
||||
// optional to supply a default constructor, no more
|
||||
std::experimental::optional<dht::ring_position> _rp;
|
||||
public:
|
||||
compatible_ring_position() noexcept = default;
|
||||
compatible_ring_position(const schema& s, const dht::ring_position& rp)
|
||||
: _schema(&s), _rp(rp) {
|
||||
}
|
||||
compatible_ring_position(const schema& s, dht::ring_position&& rp)
|
||||
: _schema(&s), _rp(std::move(rp)) {
|
||||
}
|
||||
const dht::token& token() const {
|
||||
return _rp->token();
|
||||
}
|
||||
friend int tri_compare(const compatible_ring_position& x, const compatible_ring_position& y) {
|
||||
return x._rp->tri_compare(*x._schema, *y._rp);
|
||||
}
|
||||
friend bool operator<(const compatible_ring_position& x, const compatible_ring_position& y) {
|
||||
return tri_compare(x, y) < 0;
|
||||
}
|
||||
friend bool operator<=(const compatible_ring_position& x, const compatible_ring_position& y) {
|
||||
return tri_compare(x, y) <= 0;
|
||||
}
|
||||
friend bool operator>(const compatible_ring_position& x, const compatible_ring_position& y) {
|
||||
return tri_compare(x, y) > 0;
|
||||
}
|
||||
friend bool operator>=(const compatible_ring_position& x, const compatible_ring_position& y) {
|
||||
return tri_compare(x, y) >= 0;
|
||||
}
|
||||
friend bool operator==(const compatible_ring_position& x, const compatible_ring_position& y) {
|
||||
return tri_compare(x, y) == 0;
|
||||
}
|
||||
friend bool operator!=(const compatible_ring_position& x, const compatible_ring_position& y) {
|
||||
return tri_compare(x, y) != 0;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -1,64 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2016 ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* This file is part of Scylla.
|
||||
*
|
||||
* Scylla is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* Scylla is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "query-request.hh"
|
||||
#include <optional>
|
||||
|
||||
// Wraps ring_position_view so it is compatible with old-style C++: default
|
||||
// constructor, stateless comparators, yada yada.
|
||||
class compatible_ring_position_view {
|
||||
const schema* _schema = nullptr;
|
||||
// Optional to supply a default constructor, no more.
|
||||
std::optional<dht::ring_position_view> _rpv;
|
||||
public:
|
||||
constexpr compatible_ring_position_view() = default;
|
||||
compatible_ring_position_view(const schema& s, dht::ring_position_view rpv)
|
||||
: _schema(&s), _rpv(rpv) {
|
||||
}
|
||||
const dht::ring_position_view& position() const {
|
||||
return *_rpv;
|
||||
}
|
||||
friend int tri_compare(const compatible_ring_position_view& x, const compatible_ring_position_view& y) {
|
||||
return dht::ring_position_tri_compare(*x._schema, *x._rpv, *y._rpv);
|
||||
}
|
||||
friend bool operator<(const compatible_ring_position_view& x, const compatible_ring_position_view& y) {
|
||||
return tri_compare(x, y) < 0;
|
||||
}
|
||||
friend bool operator<=(const compatible_ring_position_view& x, const compatible_ring_position_view& y) {
|
||||
return tri_compare(x, y) <= 0;
|
||||
}
|
||||
friend bool operator>(const compatible_ring_position_view& x, const compatible_ring_position_view& y) {
|
||||
return tri_compare(x, y) > 0;
|
||||
}
|
||||
friend bool operator>=(const compatible_ring_position_view& x, const compatible_ring_position_view& y) {
|
||||
return tri_compare(x, y) >= 0;
|
||||
}
|
||||
friend bool operator==(const compatible_ring_position_view& x, const compatible_ring_position_view& y) {
|
||||
return tri_compare(x, y) == 0;
|
||||
}
|
||||
friend bool operator!=(const compatible_ring_position_view& x, const compatible_ring_position_view& y) {
|
||||
return tri_compare(x, y) != 0;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -112,7 +112,7 @@ const sstring compression_parameters::CHUNK_LENGTH_KB = "chunk_length_kb";
|
||||
const sstring compression_parameters::CRC_CHECK_CHANCE = "crc_check_chance";
|
||||
|
||||
compression_parameters::compression_parameters()
|
||||
: compression_parameters(compressor::lz4)
|
||||
: compression_parameters(nullptr)
|
||||
{}
|
||||
|
||||
compression_parameters::~compression_parameters()
|
||||
|
||||
@@ -118,10 +118,6 @@ public:
|
||||
std::map<sstring, sstring> get_options() const;
|
||||
bool operator==(const compression_parameters& other) const;
|
||||
bool operator!=(const compression_parameters& other) const;
|
||||
|
||||
// Factory that returns a compression_parameters constructed from nullptr.
// NOTE(review): presumably a null compressor argument means "compression disabled" — confirm against the constructor.
static compression_parameters no_compression() {
|
||||
return compression_parameters(nullptr);
|
||||
}
|
||||
private:
|
||||
void validate_options(const std::map<sstring, sstring>&);
|
||||
};
|
||||
|
||||
@@ -242,9 +242,6 @@ batch_size_fail_threshold_in_kb: 50
|
||||
|
||||
# The directory where hints files are stored if hinted handoff is enabled.
|
||||
# hints_directory: /var/lib/scylla/hints
|
||||
|
||||
# The directory where hints files are stored for materialized-view updates
|
||||
# view_hints_directory: /var/lib/scylla/view_hints
|
||||
|
||||
# See http://wiki.apache.org/cassandra/HintedHandoff
|
||||
# May either be "true" or "false" to enable globally, or contain a list
|
||||
|
||||
777
configure.py
777
configure.py
File diff suppressed because it is too large
Load Diff
@@ -38,44 +38,44 @@ private:
|
||||
// Decides whether data written for a column of type old_type and kind `kind` may be
// applied to new_def. Both conditions must hold: the free function ::is_compatible()
// accepts the pair of column kinds, and new_def's type reports that it is
// value-compatible with the old type.
static bool is_compatible(const column_definition& new_def, const data_type& old_type, column_kind kind) {
|
||||
return ::is_compatible(new_def.kind, kind) && new_def.type->is_value_compatible_with(*old_type);
|
||||
}
|
||||
// Builds an atomic_cell of new_type from a view of a cell written under old_type.
//  - A live cell whose old type is not a counter is rebuilt with atomic_cell::make_live()
//    from its timestamp and linearized value; expiry and ttl are carried over when the
//    cell has a ttl.
//  - Any other cell (not live, or a counter) is constructed directly from the view.
// cm is forwarded to make_live() and defaults to collection_member::no.
static atomic_cell upgrade_cell(const abstract_type& new_type, const abstract_type& old_type, atomic_cell_view cell,
|
||||
atomic_cell::collection_member cm = atomic_cell::collection_member::no) {
|
||||
if (cell.is_live() && !old_type.is_counter()) {
|
||||
if (cell.is_live_and_has_ttl()) {
|
||||
return atomic_cell::make_live(new_type, cell.timestamp(), cell.value().linearize(), cell.expiry(), cell.ttl(), cm);
|
||||
}
|
||||
return atomic_cell::make_live(new_type, cell.timestamp(), cell.value().linearize(), cm);
|
||||
} else {
|
||||
return atomic_cell(new_type, cell);
|
||||
}
|
||||
}
|
||||
static void accept_cell(row& dst, column_kind kind, const column_definition& new_def, const data_type& old_type, atomic_cell_view cell) {
|
||||
if (!is_compatible(new_def, old_type, kind) || cell.timestamp() <= new_def.dropped_at()) {
|
||||
return;
|
||||
}
|
||||
dst.apply(new_def, upgrade_cell(*new_def.type, *old_type, cell));
|
||||
auto new_cell = [&] {
|
||||
if (cell.is_live() && !old_type->is_counter()) {
|
||||
if (cell.is_live_and_has_ttl()) {
|
||||
return atomic_cell_or_collection(
|
||||
atomic_cell::make_live(*new_def.type, cell.timestamp(), cell.value().linearize(), cell.expiry(), cell.ttl())
|
||||
);
|
||||
}
|
||||
return atomic_cell_or_collection(
|
||||
atomic_cell::make_live(*new_def.type, cell.timestamp(), cell.value().linearize())
|
||||
);
|
||||
} else {
|
||||
return atomic_cell_or_collection(*new_def.type, cell);
|
||||
}
|
||||
}();
|
||||
dst.apply(new_def, std::move(new_cell));
|
||||
}
|
||||
static void accept_cell(row& dst, column_kind kind, const column_definition& new_def, const data_type& old_type, collection_mutation_view cell) {
|
||||
if (!is_compatible(new_def, old_type, kind)) {
|
||||
return;
|
||||
}
|
||||
cell.data.with_linearized([&] (bytes_view cell_bv) {
|
||||
auto new_ctype = static_pointer_cast<const collection_type_impl>(new_def.type);
|
||||
auto old_ctype = static_pointer_cast<const collection_type_impl>(old_type);
|
||||
auto old_view = old_ctype->deserialize_mutation_form(cell_bv);
|
||||
auto&& ctype = static_pointer_cast<const collection_type_impl>(old_type);
|
||||
auto old_view = ctype->deserialize_mutation_form(cell_bv);
|
||||
|
||||
collection_type_impl::mutation new_view;
|
||||
collection_type_impl::mutation_view new_view;
|
||||
if (old_view.tomb.timestamp > new_def.dropped_at()) {
|
||||
new_view.tomb = old_view.tomb;
|
||||
}
|
||||
for (auto& c : old_view.cells) {
|
||||
if (c.second.timestamp() > new_def.dropped_at()) {
|
||||
new_view.cells.emplace_back(c.first, upgrade_cell(*new_ctype->value_comparator(), *old_ctype->value_comparator(), c.second, atomic_cell::collection_member::yes));
|
||||
new_view.cells.emplace_back(std::move(c));
|
||||
}
|
||||
}
|
||||
if (new_view.tomb || !new_view.cells.empty()) {
|
||||
dst.apply(new_def, new_ctype->serialize_mutation_form(std::move(new_view)));
|
||||
}
|
||||
dst.apply(new_def, ctype->serialize_mutation_form(std::move(new_view)));
|
||||
});
|
||||
}
|
||||
public:
|
||||
@@ -92,10 +92,6 @@ public:
|
||||
_p.apply(t);
|
||||
}
|
||||
|
||||
// Convenience overload taking an owning atomic_cell: wraps it in an atomic_cell_view
// and forwards to the atomic_cell_view overload.
void accept_static_cell(column_id id, atomic_cell cell) {
|
||||
return accept_static_cell(id, atomic_cell_view(cell));
|
||||
}
|
||||
|
||||
virtual void accept_static_cell(column_id id, atomic_cell_view cell) override {
|
||||
const column_mapping_entry& col = _visited_column_mapping.static_column_at(id);
|
||||
const column_definition* def = _p_schema.get_column_definition(col.name());
|
||||
@@ -123,10 +119,6 @@ public:
|
||||
_current_row = &r;
|
||||
}
|
||||
|
||||
// Convenience overload taking an owning atomic_cell: wraps it in an atomic_cell_view
// and forwards to the atomic_cell_view overload.
void accept_row_cell(column_id id, atomic_cell cell) {
|
||||
return accept_row_cell(id, atomic_cell_view(cell));
|
||||
}
|
||||
|
||||
virtual void accept_row_cell(column_id id, atomic_cell_view cell) override {
|
||||
const column_mapping_entry& col = _visited_column_mapping.regular_column_at(id);
|
||||
const column_definition* def = _p_schema.get_column_definition(col.name());
|
||||
|
||||
59
cql3/Cql.g
59
cql3/Cql.g
@@ -470,7 +470,6 @@ insertStatement returns [::shared_ptr<raw::modification_statement> expr]
|
||||
std::vector<::shared_ptr<cql3::column_identifier::raw>> column_names;
|
||||
std::vector<::shared_ptr<cql3::term::raw>> values;
|
||||
bool if_not_exists = false;
|
||||
bool default_unset = false;
|
||||
::shared_ptr<cql3::term::raw> json_value;
|
||||
}
|
||||
: K_INSERT K_INTO cf=columnFamilyName
|
||||
@@ -488,15 +487,13 @@ insertStatement returns [::shared_ptr<raw::modification_statement> expr]
|
||||
}
|
||||
| K_JSON
|
||||
json_token=jsonValue { json_value = $json_token.value; }
|
||||
( K_DEFAULT K_UNSET { default_unset = true; } | K_DEFAULT K_NULL )?
|
||||
( K_IF K_NOT K_EXISTS { if_not_exists = true; } )?
|
||||
( usingClause[attrs] )?
|
||||
{
|
||||
$expr = ::make_shared<raw::insert_json_statement>(std::move(cf),
|
||||
std::move(attrs),
|
||||
std::move(json_value),
|
||||
if_not_exists,
|
||||
default_unset);
|
||||
if_not_exists);
|
||||
}
|
||||
)
|
||||
;
|
||||
@@ -1534,22 +1531,12 @@ inMarkerForTuple returns [shared_ptr<cql3::tuples::in_raw> marker]
|
||||
| ':' name=ident { $marker = new_tuple_in_bind_variables(name); }
|
||||
;
|
||||
|
||||
// The comparator_type rule is used for users' queries (internal=false)
|
||||
// and for internal calls from db::cql_type_parser::parse() (internal=true).
|
||||
// The latter is used for reading schemas stored in the system tables, and
|
||||
// may support additional column types that cannot be created through CQL,
|
||||
// but only internally through code. Today the only such type is "empty":
|
||||
// Scylla code internally creates columns with type "empty" or collections
|
||||
// "empty" to represent unselected columns in materialized views.
|
||||
// If a user (internal=false) tries to use "empty" as a type, it is treated -
|
||||
// as do all unknown types - as an attempt to use a user-defined type, and
|
||||
// we report this name is reserved (as for _reserved_type_names()).
|
||||
comparator_type [bool internal] returns [shared_ptr<cql3_type::raw> t]
|
||||
: n=native_or_internal_type[internal] { $t = cql3_type::raw::from(n); }
|
||||
| c=collection_type[internal] { $t = c; }
|
||||
| tt=tuple_type[internal] { $t = tt; }
|
||||
comparatorType returns [shared_ptr<cql3_type::raw> t]
|
||||
: n=native_type { $t = cql3_type::raw::from(n); }
|
||||
| c=collection_type { $t = c; }
|
||||
| tt=tuple_type { $t = tt; }
|
||||
| id=userTypeName { $t = cql3::cql3_type::raw::user_type(id); }
|
||||
| K_FROZEN '<' f=comparator_type[internal] '>'
|
||||
| K_FROZEN '<' f=comparatorType '>'
|
||||
{
|
||||
try {
|
||||
$t = cql3::cql3_type::raw::frozen(f);
|
||||
@@ -1571,22 +1558,6 @@ comparator_type [bool internal] returns [shared_ptr<cql3_type::raw> t]
|
||||
#endif
|
||||
;
|
||||
|
||||
native_or_internal_type [bool internal] returns [shared_ptr<cql3_type> t]
|
||||
: n=native_type { $t = n; }
|
||||
// The "internal" types, only supported when internal==true:
|
||||
| K_EMPTY {
|
||||
if (internal) {
|
||||
$t = cql3_type::empty;
|
||||
} else {
|
||||
add_recognition_error("Invalid (reserved) user type name empty");
|
||||
}
|
||||
}
|
||||
;
|
||||
|
||||
comparatorType returns [shared_ptr<cql3_type::raw> t]
|
||||
: tt=comparator_type[false] { $t = tt; }
|
||||
;
|
||||
|
||||
native_type returns [shared_ptr<cql3_type> t]
|
||||
: K_ASCII { $t = cql3_type::ascii; }
|
||||
| K_BIGINT { $t = cql3_type::bigint; }
|
||||
@@ -1611,24 +1582,24 @@ native_type returns [shared_ptr<cql3_type> t]
|
||||
| K_TIME { $t = cql3_type::time; }
|
||||
;
|
||||
|
||||
collection_type [bool internal] returns [shared_ptr<cql3::cql3_type::raw> pt]
|
||||
: K_MAP '<' t1=comparator_type[internal] ',' t2=comparator_type[internal] '>'
|
||||
collection_type returns [shared_ptr<cql3::cql3_type::raw> pt]
|
||||
: K_MAP '<' t1=comparatorType ',' t2=comparatorType '>'
|
||||
{
|
||||
// if we can't parse either t1 or t2, antlr will "recover" and we may have t1 or t2 null.
|
||||
if (t1 && t2) {
|
||||
$pt = cql3::cql3_type::raw::map(t1, t2);
|
||||
}
|
||||
}
|
||||
| K_LIST '<' t=comparator_type[internal] '>'
|
||||
| K_LIST '<' t=comparatorType '>'
|
||||
{ if (t) { $pt = cql3::cql3_type::raw::list(t); } }
|
||||
| K_SET '<' t=comparator_type[internal] '>'
|
||||
| K_SET '<' t=comparatorType '>'
|
||||
{ if (t) { $pt = cql3::cql3_type::raw::set(t); } }
|
||||
;
|
||||
|
||||
tuple_type [bool internal] returns [shared_ptr<cql3::cql3_type::raw> t]
|
||||
tuple_type returns [shared_ptr<cql3::cql3_type::raw> t]
|
||||
@init{ std::vector<shared_ptr<cql3::cql3_type::raw>> types; }
|
||||
: K_TUPLE '<'
|
||||
t1=comparator_type[internal] { types.push_back(t1); } (',' tn=comparator_type[internal] { types.push_back(tn); })*
|
||||
t1=comparatorType { types.push_back(t1); } (',' tn=comparatorType { types.push_back(tn); })*
|
||||
'>' { $t = cql3::cql3_type::raw::tuple(std::move(types)); }
|
||||
;
|
||||
|
||||
@@ -1654,7 +1625,7 @@ unreserved_keyword returns [sstring str]
|
||||
|
||||
unreserved_function_keyword returns [sstring str]
|
||||
: u=basic_unreserved_keyword { $str = u; }
|
||||
| t=native_or_internal_type[true] { $str = t->to_string(); }
|
||||
| t=native_type { $str = t->to_string(); }
|
||||
;
|
||||
|
||||
basic_unreserved_keyword returns [sstring str]
|
||||
@@ -1838,10 +1809,6 @@ K_OR: O R;
|
||||
K_REPLACE: R E P L A C E;
|
||||
K_DETERMINISTIC: D E T E R M I N I S T I C;
|
||||
K_JSON: J S O N;
|
||||
K_DEFAULT: D E F A U L T;
|
||||
K_UNSET: U N S E T;
|
||||
|
||||
K_EMPTY: E M P T Y;
|
||||
|
||||
K_SCYLLA_TIMEUUID_LIST_INDEX: S C Y L L A '_' T I M E U U I D '_' L I S T '_' I N D E X;
|
||||
K_SCYLLA_COUNTER_SHARD_LIST: S C Y L L A '_' C O U N T E R '_' S H A R D '_' L I S T;
|
||||
|
||||
@@ -77,14 +77,12 @@ int64_t attributes::get_timestamp(int64_t now, const query_options& options) {
|
||||
if (tval.is_unset_value()) {
|
||||
return now;
|
||||
}
|
||||
return with_linearized(*tval, [] (bytes_view val) {
|
||||
try {
|
||||
data_type_for<int64_t>()->validate(val);
|
||||
data_type_for<int64_t>()->validate(*tval);
|
||||
} catch (marshal_exception& e) {
|
||||
throw exceptions::invalid_request_exception("Invalid timestamp value");
|
||||
}
|
||||
return value_cast<int64_t>(data_type_for<int64_t>()->deserialize(val));
|
||||
});
|
||||
return value_cast<int64_t>(data_type_for<int64_t>()->deserialize(*tval));
|
||||
}
|
||||
|
||||
int32_t attributes::get_time_to_live(const query_options& options) {
|
||||
@@ -98,16 +96,14 @@ int32_t attributes::get_time_to_live(const query_options& options) {
|
||||
if (tval.is_unset_value()) {
|
||||
return 0;
|
||||
}
|
||||
auto ttl = with_linearized(*tval, [] (bytes_view val) {
|
||||
try {
|
||||
data_type_for<int32_t>()->validate(val);
|
||||
data_type_for<int32_t>()->validate(*tval);
|
||||
}
|
||||
catch (marshal_exception& e) {
|
||||
throw exceptions::invalid_request_exception("Invalid TTL value");
|
||||
}
|
||||
|
||||
return value_cast<int32_t>(data_type_for<int32_t>()->deserialize(val));
|
||||
});
|
||||
auto ttl = value_cast<int32_t>(data_type_for<int32_t>()->deserialize(*tval));
|
||||
if (ttl < 0) {
|
||||
throw exceptions::invalid_request_exception("A TTL must be greater or equal to 0");
|
||||
}
|
||||
|
||||
@@ -127,11 +127,7 @@ column_identifier::new_selector_factory(database& db, schema_ptr schema, std::ve
|
||||
if (!def) {
|
||||
throw exceptions::invalid_request_exception(sprint("Undefined name %s in selection clause", _text));
|
||||
}
|
||||
// Do not allow explicitly selecting hidden columns. We also skip them on
|
||||
// "SELECT *" (see selection::wildcard()).
|
||||
if (def->is_view_virtual()) {
|
||||
throw exceptions::invalid_request_exception(sprint("Undefined name %s in selection clause", _text));
|
||||
}
|
||||
|
||||
return selection::simple_selector::new_factory(def->name_as_text(), add_and_get_index(*def, defs), def->type);
|
||||
}
|
||||
|
||||
|
||||
@@ -225,9 +225,7 @@ public:
|
||||
} else if (value.is_unset_value()) {
|
||||
return;
|
||||
}
|
||||
auto increment = with_linearized(*value, [] (bytes_view value_view) {
|
||||
return value_cast<int64_t>(long_type->deserialize_value(value_view));
|
||||
});
|
||||
auto increment = value_cast<int64_t>(long_type->deserialize_value(*value));
|
||||
m.set_cell(prefix, column, make_counter_update_cell(increment, params));
|
||||
}
|
||||
};
|
||||
@@ -242,9 +240,7 @@ public:
|
||||
} else if (value.is_unset_value()) {
|
||||
return;
|
||||
}
|
||||
auto increment = with_linearized(*value, [] (bytes_view value_view) {
|
||||
return value_cast<int64_t>(long_type->deserialize_value(value_view));
|
||||
});
|
||||
auto increment = value_cast<int64_t>(long_type->deserialize_value(*value));
|
||||
if (increment == std::numeric_limits<int64_t>::min()) {
|
||||
throw exceptions::invalid_request_exception(sprint("The negation of %d overflows supported counter precision (signed 8 bytes integer)", increment));
|
||||
}
|
||||
|
||||
@@ -461,9 +461,9 @@ function_call::make_terminal(shared_ptr<function> fun, cql3::raw_value result, c
|
||||
}
|
||||
|
||||
auto ctype = static_pointer_cast<const collection_type_impl>(fun->return_type());
|
||||
fragmented_temporary_buffer::view res;
|
||||
bytes_view res;
|
||||
if (result) {
|
||||
res = fragmented_temporary_buffer::view(bytes_view(*result));
|
||||
res = *result;
|
||||
}
|
||||
if (&ctype->_kind == &collection_type_impl::kind::list) {
|
||||
return make_shared(lists::value::from_serialized(std::move(res), static_pointer_cast<const list_type_impl>(ctype), sf));
|
||||
|
||||
@@ -115,12 +115,11 @@ lists::literal::to_string() const {
|
||||
}
|
||||
|
||||
lists::value
|
||||
lists::value::from_serialized(const fragmented_temporary_buffer::view& val, list_type type, cql_serialization_format sf) {
|
||||
lists::value::from_serialized(bytes_view v, list_type type, cql_serialization_format sf) {
|
||||
try {
|
||||
// Collections have this small hack that validate cannot be called on a serialized object,
|
||||
// but compose does the validation (so we're fine).
|
||||
// FIXME: deserializeForNativeProtocol()?!
|
||||
return with_linearized(val, [&] (bytes_view v) {
|
||||
auto l = value_cast<list_type_impl::native_type>(type->deserialize(v, sf));
|
||||
std::vector<bytes_opt> elements;
|
||||
elements.reserve(l.size());
|
||||
@@ -129,7 +128,6 @@ lists::value::from_serialized(const fragmented_temporary_buffer::view& val, list
|
||||
elements.push_back(element.is_null() ? bytes_opt() : bytes_opt(type->get_elements_type()->decompose(element)));
|
||||
}
|
||||
return value(std::move(elements));
|
||||
});
|
||||
} catch (marshal_exception& e) {
|
||||
throw exceptions::invalid_request_exception(e.what());
|
||||
}
|
||||
@@ -287,9 +285,7 @@ lists::setter_by_index::execute(mutation& m, const clustering_key_prefix& prefix
|
||||
return;
|
||||
}
|
||||
|
||||
auto idx = with_linearized(*index, [] (bytes_view v) {
|
||||
return value_cast<int32_t>(data_type_for<int32_t>()->deserialize(v));
|
||||
});
|
||||
auto idx = net::ntoh(int32_t(*unaligned_cast<int32_t>(index->begin())));
|
||||
auto&& existing_list_opt = params.get_prefetched_list(m.key().view(), prefix.view(), column);
|
||||
if (!existing_list_opt) {
|
||||
throw exceptions::invalid_request_exception("Attempted to set an element on a list which is null");
|
||||
|
||||
@@ -79,7 +79,7 @@ public:
|
||||
explicit value(std::vector<bytes_opt> elements)
|
||||
: _elements(std::move(elements)) {
|
||||
}
|
||||
static value from_serialized(const fragmented_temporary_buffer::view& v, list_type type, cql_serialization_format sf);
|
||||
static value from_serialized(bytes_view v, list_type type, cql_serialization_format sf);
|
||||
virtual cql3::raw_value get(const query_options& options) override;
|
||||
virtual bytes get_with_protocol_version(cql_serialization_format sf) override;
|
||||
bool equals(shared_ptr<list_type_impl> lt, const value& v);
|
||||
|
||||
14
cql3/maps.cc
14
cql3/maps.cc
@@ -152,20 +152,18 @@ maps::literal::to_string() const {
|
||||
}
|
||||
|
||||
maps::value
|
||||
maps::value::from_serialized(const fragmented_temporary_buffer::view& fragmented_value, map_type type, cql_serialization_format sf) {
|
||||
maps::value::from_serialized(bytes_view value, map_type type, cql_serialization_format sf) {
|
||||
try {
|
||||
// Collections have this small hack that validate cannot be called on a serialized object,
|
||||
// but compose does the validation (so we're fine).
|
||||
// FIXME: deserialize_for_native_protocol?!
|
||||
return with_linearized(fragmented_value, [&] (bytes_view value) {
|
||||
auto m = value_cast<map_type_impl::native_type>(type->deserialize(value, sf));
|
||||
std::map<bytes, bytes, serialized_compare> map(type->get_keys_type()->as_less_comparator());
|
||||
for (auto&& e : m) {
|
||||
map.emplace(type->get_keys_type()->decompose(e.first),
|
||||
type->get_values_type()->decompose(e.second));
|
||||
}
|
||||
return maps::value { std::move(map) };
|
||||
});
|
||||
return { std::move(map) };
|
||||
} catch (marshal_exception& e) {
|
||||
throw exceptions::invalid_request_exception(e.what());
|
||||
}
|
||||
@@ -235,10 +233,10 @@ maps::delayed_value::bind(const query_options& options) {
|
||||
if (key_bytes.is_unset_value()) {
|
||||
throw exceptions::invalid_request_exception("unset value is not supported inside collections");
|
||||
}
|
||||
if (key_bytes->size_bytes() > std::numeric_limits<uint16_t>::max()) {
|
||||
if (key_bytes->size() > std::numeric_limits<uint16_t>::max()) {
|
||||
throw exceptions::invalid_request_exception(sprint("Map key is too long. Map keys are limited to %d bytes but %d bytes keys provided",
|
||||
std::numeric_limits<uint16_t>::max(),
|
||||
key_bytes->size_bytes()));
|
||||
key_bytes->size()));
|
||||
}
|
||||
auto value_bytes = value->bind_and_get(options);
|
||||
if (value_bytes.is_null()) {
|
||||
@@ -333,7 +331,7 @@ maps::do_put(mutation& m, const clustering_key_prefix& prefix, const update_para
|
||||
|
||||
auto ctype = static_pointer_cast<const map_type_impl>(column.type);
|
||||
for (auto&& e : map_value->map) {
|
||||
mut.cells.emplace_back(e.first, params.make_cell(*ctype->get_values_type(), fragmented_temporary_buffer::view(e.second), atomic_cell::collection_member::yes));
|
||||
mut.cells.emplace_back(e.first, params.make_cell(*ctype->get_values_type(), e.second, atomic_cell::collection_member::yes));
|
||||
}
|
||||
auto col_mut = ctype->serialize_mutation_form(std::move(mut));
|
||||
m.set_cell(prefix, column, std::move(col_mut));
|
||||
@@ -344,7 +342,7 @@ maps::do_put(mutation& m, const clustering_key_prefix& prefix, const update_para
|
||||
} else {
|
||||
auto v = map_type_impl::serialize_partially_deserialized_form({map_value->map.begin(), map_value->map.end()},
|
||||
cql_serialization_format::internal());
|
||||
m.set_cell(prefix, column, params.make_cell(*column.type, fragmented_temporary_buffer::view(std::move(v))));
|
||||
m.set_cell(prefix, column, params.make_cell(*column.type, std::move(v)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -81,7 +81,7 @@ public:
|
||||
value(std::map<bytes, bytes, serialized_compare> map)
|
||||
: map(std::move(map)) {
|
||||
}
|
||||
static value from_serialized(const fragmented_temporary_buffer::view& value, map_type type, cql_serialization_format sf);
|
||||
static value from_serialized(bytes_view value, map_type type, cql_serialization_format sf);
|
||||
virtual cql3::raw_value get(const query_options& options) override;
|
||||
virtual bytes get_with_protocol_version(cql_serialization_format sf);
|
||||
bool equals(map_type mt, const value& v);
|
||||
|
||||
@@ -92,10 +92,6 @@ public:
|
||||
}
|
||||
|
||||
// bytes_view overload: wraps value in a fragmented_temporary_buffer::view and lets
// params.make_cell() build the cell.
static atomic_cell make_cell(const abstract_type& type, bytes_view value, const update_parameters& params) {
|
||||
return params.make_cell(type, fragmented_temporary_buffer::view(value));
|
||||
}
|
||||
|
||||
// Fragmented-view overload: passes type and value straight through to params.make_cell().
static atomic_cell make_cell(const abstract_type& type, const fragmented_temporary_buffer::view& value, const update_parameters& params) {
|
||||
return params.make_cell(type, value);
|
||||
}
|
||||
|
||||
|
||||
@@ -130,49 +130,84 @@ query_options::query_options(std::unique_ptr<query_options> qo, ::shared_ptr<ser
|
||||
|
||||
}
|
||||
|
||||
// Builds a new query_options from an existing one, substituting the given paging state
// and page size. Consistency, timeout config, skip-metadata flag, serial consistency,
// timestamp and serialization format are read from qo; names, values and value views
// are passed on via std::move from qo's members.
query_options::query_options(std::unique_ptr<query_options> qo, ::shared_ptr<service::pager::paging_state> paging_state, int32_t page_size)
|
||||
: query_options(qo->_consistency,
|
||||
qo->get_timeout_config(),
|
||||
std::move(qo->_names),
|
||||
std::move(qo->_values),
|
||||
std::move(qo->_value_views),
|
||||
qo->_skip_metadata,
|
||||
std::move(query_options::specific_options{page_size, paging_state, qo->_options.serial_consistency, qo->_options.timestamp}),
|
||||
qo->_cql_serialization_format) {
|
||||
|
||||
}
|
||||
|
||||
// Values-only constructor: delegates with consistency level ONE and
// infinite_timeout_config, moving the values into the delegated constructor.
query_options::query_options(std::vector<cql3::raw_value> values)
|
||||
: query_options(
|
||||
db::consistency_level::ONE, infinite_timeout_config, std::move(values))
|
||||
{}
|
||||
|
||||
// Returns the consistency level stored in these options.
db::consistency_level query_options::get_consistency() const
|
||||
{
|
||||
return _consistency;
|
||||
}
|
||||
|
||||
// Returns the value view at position idx. Access goes through at(), so an
// out-of-range idx is rejected by the container instead of being read.
cql3::raw_value_view query_options::get_value_at(size_t idx) const
|
||||
{
|
||||
return _value_views.at(idx);
|
||||
}
|
||||
|
||||
// Returns the number of value views held by these options.
size_t query_options::get_values_count() const
|
||||
{
|
||||
return _value_views.size();
|
||||
}
|
||||
|
||||
cql3::raw_value_view query_options::make_temporary(cql3::raw_value value) const
|
||||
{
|
||||
if (value) {
|
||||
auto value_view = *value;
|
||||
auto ptr = _temporaries.write_place_holder(value_view.size());
|
||||
std::copy_n(value_view.data(), value_view.size(), ptr);
|
||||
return cql3::raw_value_view::make_value(fragmented_temporary_buffer::view(bytes_view{ptr, value_view.size()}));
|
||||
_temporaries.emplace_back(value->begin(), value->end());
|
||||
auto& temporary = _temporaries.back();
|
||||
return cql3::raw_value_view::make_value(bytes_view{temporary.data(), temporary.size()});
|
||||
}
|
||||
return cql3::raw_value_view::make_null();
|
||||
}
|
||||
|
||||
bytes_view query_options::linearize(fragmented_temporary_buffer::view view) const
|
||||
bool query_options::skip_metadata() const
|
||||
{
|
||||
if (view.empty()) {
|
||||
return { };
|
||||
} else if (std::next(view.begin()) == view.end()) {
|
||||
return *view.begin();
|
||||
} else {
|
||||
auto ptr = _temporaries.write_place_holder(view.size_bytes());
|
||||
auto dst = ptr;
|
||||
using boost::range::for_each;
|
||||
for_each(view, [&] (bytes_view bv) {
|
||||
dst = std::copy(bv.begin(), bv.end(), dst);
|
||||
});
|
||||
return bytes_view(ptr, view.size_bytes());
|
||||
return _skip_metadata;
|
||||
}
|
||||
|
||||
// Returns the page_size field of the specific options.
int32_t query_options::get_page_size() const
|
||||
{
|
||||
return get_specific_options().page_size;
|
||||
}
|
||||
|
||||
// Returns the paging state stored in the specific options (the `state` field).
::shared_ptr<service::pager::paging_state> query_options::get_paging_state() const
|
||||
{
|
||||
return get_specific_options().state;
|
||||
}
|
||||
|
||||
// Returns the optional serial consistency level stored in the specific options.
std::experimental::optional<db::consistency_level> query_options::get_serial_consistency() const
|
||||
{
|
||||
return get_specific_options().serial_consistency;
|
||||
}
|
||||
|
||||
// Returns the timestamp from the specific options, unless it equals
// api::missing_timestamp, in which case the timestamp is taken from the query state.
api::timestamp_type query_options::get_timestamp(service::query_state& state) const
|
||||
{
|
||||
auto tstamp = get_specific_options().timestamp;
|
||||
return tstamp != api::missing_timestamp ? tstamp : state.get_timestamp();
|
||||
}
|
||||
|
||||
// Returns the protocol version reported by the stored CQL serialization format.
int query_options::get_protocol_version() const
|
||||
{
|
||||
return _cql_serialization_format.protocol_version();
|
||||
}
|
||||
|
||||
// Returns a copy of the stored CQL serialization format.
cql_serialization_format query_options::get_cql_serialization_format() const
|
||||
{
|
||||
return _cql_serialization_format;
|
||||
}
|
||||
|
||||
// Returns a const reference to the stored specific_options.
const query_options::specific_options& query_options::get_specific_options() const
|
||||
{
|
||||
return _options;
|
||||
}
|
||||
|
||||
// Returns the options to use for statement i. When no batch options are set this
// object itself is returned; otherwise entry i of the batch options is returned,
// looked up with at().
const query_options& query_options::for_statement(size_t i) const
|
||||
{
|
||||
if (!_batch_options) {
|
||||
// No per-statement options supplied, so use the "global" options
|
||||
return *this;
|
||||
}
|
||||
return _batch_options->at(i);
|
||||
}
|
||||
|
||||
void query_options::prepare(const std::vector<::shared_ptr<column_specification>>& specs)
|
||||
@@ -199,7 +234,11 @@ void query_options::prepare(const std::vector<::shared_ptr<column_specification>
|
||||
void query_options::fill_value_views()
|
||||
{
|
||||
for (auto&& value : _values) {
|
||||
_value_views.emplace_back(value.to_view());
|
||||
if (value) {
|
||||
_value_views.emplace_back(cql3::raw_value_view::make_value(bytes_view{*value}));
|
||||
} else {
|
||||
_value_views.emplace_back(cql3::raw_value_view::make_null());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -75,7 +75,7 @@ private:
|
||||
const std::experimental::optional<std::vector<sstring_view>> _names;
|
||||
std::vector<cql3::raw_value> _values;
|
||||
std::vector<cql3::raw_value_view> _value_views;
|
||||
mutable bytes_ostream _temporaries;
|
||||
mutable std::vector<std::vector<int8_t>> _temporaries;
|
||||
const bool _skip_metadata;
|
||||
const specific_options _options;
|
||||
cql_serialization_format _cql_serialization_format;
|
||||
@@ -102,7 +102,7 @@ private:
|
||||
|
||||
public:
|
||||
query_options(query_options&&) = default;
|
||||
explicit query_options(const query_options&) = default;
|
||||
query_options(const query_options&) = delete;
|
||||
|
||||
explicit query_options(db::consistency_level consistency,
|
||||
const timeout_config& timeouts,
|
||||
@@ -155,78 +155,34 @@ public:
|
||||
explicit query_options(db::consistency_level, const timeout_config& timeouts,
|
||||
std::vector<cql3::raw_value> values, specific_options options = specific_options::DEFAULT);
|
||||
explicit query_options(std::unique_ptr<query_options>, ::shared_ptr<service::pager::paging_state> paging_state);
|
||||
explicit query_options(std::unique_ptr<query_options>, ::shared_ptr<service::pager::paging_state> paging_state, int32_t page_size);
|
||||
|
||||
db::consistency_level get_consistency() const;
|
||||
const timeout_config& get_timeout_config() const { return _timeout_config; }
|
||||
|
||||
db::consistency_level get_consistency() const {
|
||||
return _consistency;
|
||||
}
|
||||
|
||||
cql3::raw_value_view get_value_at(size_t idx) const {
|
||||
return _value_views.at(idx);
|
||||
}
|
||||
|
||||
size_t get_values_count() const {
|
||||
return _value_views.size();
|
||||
}
|
||||
|
||||
cql3::raw_value_view get_value_at(size_t idx) const;
|
||||
cql3::raw_value_view make_temporary(cql3::raw_value value) const;
|
||||
bytes_view linearize(fragmented_temporary_buffer::view) const;
|
||||
|
||||
bool skip_metadata() const {
|
||||
return _skip_metadata;
|
||||
}
|
||||
|
||||
int32_t get_page_size() const {
|
||||
return get_specific_options().page_size;
|
||||
}
|
||||
|
||||
size_t get_values_count() const;
|
||||
bool skip_metadata() const;
|
||||
/** The pageSize for this query. Will be <= 0 if not relevant for the query. */
|
||||
int32_t get_page_size() const;
|
||||
/** The paging state for this query, or null if not relevant. */
|
||||
::shared_ptr<service::pager::paging_state> get_paging_state() const {
|
||||
return get_specific_options().state;
|
||||
}
|
||||
|
||||
::shared_ptr<service::pager::paging_state> get_paging_state() const;
|
||||
/** Serial consistency for conditional updates. */
|
||||
std::experimental::optional<db::consistency_level> get_serial_consistency() const {
|
||||
return get_specific_options().serial_consistency;
|
||||
}
|
||||
|
||||
api::timestamp_type get_timestamp(service::query_state& state) const {
|
||||
auto tstamp = get_specific_options().timestamp;
|
||||
return tstamp != api::missing_timestamp ? tstamp : state.get_timestamp();
|
||||
}
|
||||
|
||||
/**
|
||||
* The protocol version for the query. Will be 3 if the object doesn't come from
|
||||
* a native protocol request (i.e. it's been allocated locally or by CQL-over-thrift).
|
||||
*/
|
||||
int get_protocol_version() const {
|
||||
return _cql_serialization_format.protocol_version();
|
||||
}
|
||||
|
||||
cql_serialization_format get_cql_serialization_format() const {
|
||||
return _cql_serialization_format;
|
||||
}
|
||||
|
||||
const query_options::specific_options& get_specific_options() const {
|
||||
return _options;
|
||||
}
|
||||
|
||||
// Mainly for the sake of BatchQueryOptions
|
||||
const query_options& for_statement(size_t i) const {
|
||||
if (!_batch_options) {
|
||||
// No per-statement options supplied, so use the "global" options
|
||||
return *this;
|
||||
}
|
||||
return _batch_options->at(i);
|
||||
}
|
||||
|
||||
std::experimental::optional<db::consistency_level> get_serial_consistency() const;
|
||||
|
||||
const std::experimental::optional<std::vector<sstring_view>>& get_names() const noexcept {
|
||||
return _names;
|
||||
}
|
||||
|
||||
api::timestamp_type get_timestamp(service::query_state& state) const;
|
||||
/**
|
||||
* The protocol version for the query. Will be 3 if the object doesn't come from
|
||||
* a native protocol request (i.e. it's been allocated locally or by CQL-over-thrift).
|
||||
*/
|
||||
int get_protocol_version() const;
|
||||
cql_serialization_format get_cql_serialization_format() const;
|
||||
// Mainly for the sake of BatchQueryOptions
|
||||
const specific_options& get_specific_options() const;
|
||||
const query_options& for_statement(size_t i) const;
|
||||
void prepare(const std::vector<::shared_ptr<column_specification>>& specs);
|
||||
private:
|
||||
void fill_value_views();
|
||||
|
||||
@@ -206,30 +206,6 @@ query_processor::query_processor(service::storage_proxy& proxy, distributed<data
|
||||
_cql_stats.secondary_index_rows_read,
|
||||
sm::description("Counts a total number of rows read during CQL requests performed using secondary indexes.")),
|
||||
|
||||
// read requests that required ALLOW FILTERING
|
||||
sm::make_derive(
|
||||
"filtered_read_requests",
|
||||
_cql_stats.filtered_reads,
|
||||
sm::description("Counts a total number of CQL read requests that required ALLOW FILTERING. See filtered_rows_read_total to compare how many rows needed to be filtered.")),
|
||||
|
||||
// rows read with filtering enabled (because ALLOW FILTERING was required)
|
||||
sm::make_derive(
|
||||
"filtered_rows_read_total",
|
||||
_cql_stats.filtered_rows_read_total,
|
||||
sm::description("Counts a total number of rows read during CQL requests that required ALLOW FILTERING. See filtered_rows_matched_total and filtered_rows_dropped_total for information how accurate filtering queries are.")),
|
||||
|
||||
// rows read with filtering enabled and accepted by the filter
|
||||
sm::make_derive(
|
||||
"filtered_rows_matched_total",
|
||||
_cql_stats.filtered_rows_matched_total,
|
||||
sm::description("Counts a number of rows read during CQL requests that required ALLOW FILTERING and accepted by the filter. Number similar to filtered_rows_read_total indicates that filtering is accurate.")),
|
||||
|
||||
// rows read with filtering enabled and rejected by the filter
|
||||
sm::make_derive(
|
||||
"filtered_rows_dropped_total",
|
||||
[this]() {return _cql_stats.filtered_rows_read_total - _cql_stats.filtered_rows_matched_total;},
|
||||
sm::description("Counts a number of rows read during CQL requests that required ALLOW FILTERING and dropped by the filter. Number similar to filtered_rows_read_total indicates that filtering is not accurate and might cause performance degradation.")),
|
||||
|
||||
sm::make_derive(
|
||||
"authorized_prepared_statements_cache_evictions",
|
||||
[] { return authorized_prepared_statements_cache::shard_stats().authorized_prepared_statements_cache_evictions; },
|
||||
@@ -243,17 +219,7 @@ query_processor::query_processor(service::storage_proxy& proxy, distributed<data
|
||||
sm::make_gauge(
|
||||
"user_prepared_auth_cache_footprint",
|
||||
[this] { return _authorized_prepared_cache.memory_footprint(); },
|
||||
sm::description("Size (in bytes) of the authenticated prepared statements cache.")),
|
||||
|
||||
sm::make_counter(
|
||||
"reverse_queries",
|
||||
_cql_stats.reverse_queries,
|
||||
sm::description("Counts number of CQL SELECT requests with ORDER BY DESC.")),
|
||||
|
||||
sm::make_counter(
|
||||
"unpaged_select_queries",
|
||||
_cql_stats.unpaged_select_queries,
|
||||
sm::description("Counts number of unpaged CQL SELECT requests.")),
|
||||
sm::description("Size (in bytes) of the authenticated prepared statements cache."))
|
||||
|
||||
});
|
||||
|
||||
|
||||
@@ -45,16 +45,12 @@
|
||||
#include "cql3/statements/request_validations.hh"
|
||||
#include "cql3/restrictions/primary_key_restrictions.hh"
|
||||
#include "cql3/statements/request_validations.hh"
|
||||
#include "cql3/restrictions/single_column_primary_key_restrictions.hh"
|
||||
|
||||
namespace cql3 {
|
||||
|
||||
namespace restrictions {
|
||||
|
||||
class multi_column_restriction : public primary_key_restrictions<clustering_key_prefix> {
|
||||
private:
|
||||
bool _has_only_asc_columns;
|
||||
bool _has_only_desc_columns;
|
||||
protected:
|
||||
schema_ptr _schema;
|
||||
std::vector<const column_definition*> _column_defs;
|
||||
@@ -62,9 +58,7 @@ public:
|
||||
multi_column_restriction(schema_ptr schema, std::vector<const column_definition*>&& defs)
|
||||
: _schema(schema)
|
||||
, _column_defs(std::move(defs))
|
||||
{
|
||||
update_asc_desc_existence();
|
||||
}
|
||||
{ }
|
||||
|
||||
virtual bool is_multi_column() const override {
|
||||
return true;
|
||||
@@ -90,7 +84,6 @@ public:
|
||||
"Mixing single column relations and multi column relations on clustering columns is not allowed");
|
||||
auto as_pkr = static_pointer_cast<primary_key_restrictions<clustering_key_prefix>>(other);
|
||||
do_merge_with(as_pkr);
|
||||
update_asc_desc_existence();
|
||||
}
|
||||
|
||||
bool is_satisfied_by(const schema& schema,
|
||||
@@ -147,40 +140,6 @@ protected:
|
||||
|
||||
virtual bool is_supported_by(const secondary_index::index& index) const = 0;
|
||||
|
||||
/**
|
||||
* @return true if the restriction contains at least one column of each
|
||||
* ordering, false otherwise.
|
||||
*/
|
||||
bool is_mixed_order() const {
|
||||
return !is_desc_order() && !is_asc_order();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return true if all the restricted columns are ordered in descending
|
||||
* order, false otherwise
|
||||
*/
|
||||
bool is_desc_order() const {
|
||||
return _has_only_desc_columns;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return true if all the restricted columns are ordered in ascending
|
||||
* order, false otherwise
|
||||
*/
|
||||
bool is_asc_order() const {
|
||||
return _has_only_asc_columns;
|
||||
}
|
||||
|
||||
private:
|
||||
/**
|
||||
* Updates the _has_only_asc_columns and _has_only_desc_columns fields.
|
||||
*/
|
||||
void update_asc_desc_existence() {
|
||||
std::size_t num_of_desc =
|
||||
std::count_if(_column_defs.begin(), _column_defs.end(), [] (const column_definition* cd) { return cd->type->is_reversed(); });
|
||||
_has_only_asc_columns = num_of_desc == 0;
|
||||
_has_only_desc_columns = num_of_desc == _column_defs.size();
|
||||
}
|
||||
#if 0
|
||||
/**
|
||||
* Check if this type of restriction is supported for the specified column by the specified index.
|
||||
@@ -426,7 +385,6 @@ protected:
|
||||
};
|
||||
|
||||
class multi_column_restriction::slice final : public multi_column_restriction {
|
||||
using restriction_shared_ptr = ::shared_ptr<primary_key_restrictions<clustering_key_prefix>>;
|
||||
private:
|
||||
term_slice _slice;
|
||||
|
||||
@@ -464,11 +422,24 @@ public:
|
||||
}
|
||||
|
||||
virtual std::vector<bounds_range_type> bounds_ranges(const query_options& options) const override {
|
||||
if (!is_mixed_order()) {
|
||||
return bounds_ranges_unified_order(options);
|
||||
} else {
|
||||
return bounds_ranges_mixed_order(options);
|
||||
// FIXME: doesn't work properly with mixed CLUSTERING ORDER (CASSANDRA-7281)
|
||||
auto read_bound = [&] (statements::bound b) -> std::experimental::optional<bounds_range_type::bound> {
|
||||
if (!has_bound(b)) {
|
||||
return {};
|
||||
}
|
||||
auto vals = component_bounds(b, options);
|
||||
for (unsigned i = 0; i < vals.size(); i++) {
|
||||
statements::request_validations::check_not_null(vals[i], "Invalid null value in condition for column %s", _column_defs.at(i)->name_as_text());
|
||||
}
|
||||
auto prefix = clustering_key_prefix::from_optional_exploded(*_schema, vals);
|
||||
return bounds_range_type::bound(prefix, is_inclusive(b));
|
||||
};
|
||||
auto range = wrapping_range<clustering_key_prefix>(read_bound(statements::bound::START), read_bound(statements::bound::END));
|
||||
auto bounds = bound_view::from_range(range);
|
||||
if (bound_view::compare(*_schema)(bounds.second, bounds.first)) {
|
||||
return { };
|
||||
}
|
||||
return { bounds_range_type(std::move(range)) };
|
||||
}
|
||||
#if 0
|
||||
@Override
|
||||
@@ -543,221 +514,6 @@ private:
|
||||
auto value = static_pointer_cast<tuples::value>(_slice.bound(b)->bind(options));
|
||||
return value->get_elements();
|
||||
}
|
||||
|
||||
std::vector<bytes_opt> read_bound_components(const query_options& options, statements::bound b) const {
|
||||
if (!has_bound(b)) {
|
||||
return {};
|
||||
}
|
||||
auto vals = component_bounds(b, options);
|
||||
for (unsigned i = 0; i < vals.size(); i++) {
|
||||
statements::request_validations::check_not_null(vals[i], "Invalid null value in condition for column %s", _column_defs.at(i)->name_as_text());
|
||||
}
|
||||
return vals;
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieve the bounds for the case that all clustering columns have the same order.
|
||||
* Having the same order implies we can do a prefix search on the data.
|
||||
* @param options the query options
|
||||
* @return the vector of ranges for the restriction
|
||||
*/
|
||||
std::vector<bounds_range_type> bounds_ranges_unified_order(const query_options& options) const {
|
||||
auto start_prefix = clustering_key_prefix::from_optional_exploded(*_schema, read_bound_components(options, statements::bound::START));
|
||||
auto start_bound = bounds_range_type::bound(std::move(start_prefix), is_inclusive(statements::bound::START));
|
||||
auto end_prefix = clustering_key_prefix::from_optional_exploded(*_schema, read_bound_components(options, statements::bound::END));
|
||||
auto end_bound = bounds_range_type::bound(std::move(end_prefix), is_inclusive(statements::bound::END));
|
||||
auto make_range = [&] () {
|
||||
if (is_asc_order()) {
|
||||
return bounds_range_type::make(start_bound, end_bound);
|
||||
} else {
|
||||
return bounds_range_type::make(end_bound, start_bound);
|
||||
}
|
||||
};
|
||||
auto range = make_range();
|
||||
auto bounds = bound_view::from_range(range);
|
||||
if (bound_view::compare(*_schema)(bounds.second, bounds.first)) {
|
||||
return { };
|
||||
}
|
||||
return { std::move(range) };
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieve the bounds when clustering columns are mixed order
|
||||
* (contains ASC and DESC together).
|
||||
* Having mixed order implies that a prefix search can't take place,
|
||||
* instead, the bounds have to be broken down to separate prefix serchable
|
||||
* ranges such that their combination is equivalent to the original range.
|
||||
* @param options the query options
|
||||
* @return the vector of ranges for the restriction
|
||||
*/
|
||||
std::vector<bounds_range_type> bounds_ranges_mixed_order(const query_options& options) const {
|
||||
std::vector<bounds_range_type> ret_ranges;
|
||||
auto mixed_order_restrictions = build_mixed_order_restriction_set(options);
|
||||
ret_ranges.reserve(mixed_order_restrictions.size());
|
||||
for (auto r : mixed_order_restrictions) {
|
||||
for (auto&& range : r->bounds_ranges(options)) {
|
||||
ret_ranges.emplace_back(std::move(range));
|
||||
}
|
||||
}
|
||||
return ret_ranges;
|
||||
}
|
||||
|
||||
/**
|
||||
* The function returns the first real inequality component.
|
||||
* The first real inequality is the index of the first component in the
|
||||
* tuple that will turn into a slice single column restriction.
|
||||
* For example: (a, b, c) > (0, 1, 2) and (a, b, c) < (0, 1, 5) will be
|
||||
* broken into one single column restriction set of the form:
|
||||
* a = 0 and b = 1 and c > 2 and c < 5 , c is the first element that has
|
||||
* inequality so for this case the function will return 2.
|
||||
* @param start_components - the components of the starts tuple range.
|
||||
* @param end_components - the components of the end tuple range.
|
||||
* @return an empty value if not found and the index of the first index that
|
||||
* will yield inequality
|
||||
*/
|
||||
std::optional<std::size_t> find_first_neq_component(std::vector<bytes_opt>& start_components,
|
||||
std::vector<bytes_opt>& end_components) const {
|
||||
size_t common_components_count = std::min(start_components.size(), end_components.size());
|
||||
for (size_t i = 0; i < common_components_count ; i++) {
|
||||
if (start_components[i].value() != end_components[i].value()) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
|
||||
size_t max_components_count = std::max(start_components.size(), end_components.size());
|
||||
if (common_components_count < max_components_count) {
|
||||
return common_components_count;
|
||||
} else {
|
||||
return std::nullopt;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a single column restriction which is either slice or equality.
|
||||
* @param bound - if bound is empty this is an equality, if its either START or END ,
|
||||
* this is the corresponding slice restriction.
|
||||
* @param inclusive - is the slice inclusive (ignored for equality).
|
||||
* @param column_pos - the column position to restrict
|
||||
* @param value - the value to restrict the colum with.
|
||||
* @return a shared pointer to the just created restriction.
|
||||
*/
|
||||
::shared_ptr<restriction> make_single_column_restriction(std::optional<cql3::statements::bound> bound, bool inclusive,
|
||||
std::size_t column_pos,const bytes_opt& value) const {
|
||||
::shared_ptr<cql3::term> term = ::make_shared<cql3::constants::value>(cql3::raw_value::make_value(value));
|
||||
if (!bound){
|
||||
return ::make_shared<cql3::restrictions::single_column_restriction::EQ>(*_column_defs[column_pos], term);
|
||||
} else {
|
||||
return ::make_shared<cql3::restrictions::single_column_restriction::slice>(*_column_defs[column_pos], bound.value(), inclusive, term);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* A helper function to create a single column restrictions set from a tuple relation on
|
||||
* clustering keys.
|
||||
* i.e : (a,b,c) >= (0,1,2) will become:
|
||||
* 1.a > 0
|
||||
* 2. a = 0 and b > 1
|
||||
* 3. a = 0 and b = 1 and c >=2
|
||||
* @param bound - determines if the operator is '>' (START) or '<' (END)
|
||||
* @param bound_inclusive - determines if to append equality to the operator i.e: if > becomes >=
|
||||
* @param bound_values - the tuple values for the restriction
|
||||
* @param first_neq_component - the first component that will have inequality.
|
||||
* for the example above, if this parameter is 1, only restrictions 2 and 3 will be created.
|
||||
* this parameter helps to facilitate the nuances of breaking more complex relations, for example when
|
||||
* there is in existence a second condition limiting the other side of the bound
|
||||
* i.e:(a,b,c) >= (0,1,2) and (a,b,c) < (5,6,7), this will require each bound to use the parameter.
|
||||
* @return the single column restriction set built according to the above parameters.
|
||||
*/
|
||||
std::vector<restriction_shared_ptr> make_single_bound_restrictions(statements::bound bound, bool bound_inclusive,
|
||||
std::vector<bytes_opt>& bound_values,
|
||||
std::size_t first_neq_component) const{
|
||||
std::vector<restriction_shared_ptr> ret;
|
||||
std::size_t num_of_restrictions = bound_values.size() - first_neq_component;
|
||||
ret.reserve(num_of_restrictions);
|
||||
for (std::size_t i = 0;i < num_of_restrictions ; i++) {
|
||||
ret.emplace_back(::make_shared<cql3::restrictions::single_column_primary_key_restrictions<clustering_key>>(_schema, false));
|
||||
std::size_t neq_component_idx = first_neq_component + i;
|
||||
for (std::size_t j = 0;j < neq_component_idx; j++) {
|
||||
ret[i]->merge_with(make_single_column_restriction(std::nullopt, false, j, bound_values[j]));
|
||||
}
|
||||
bool inclusive = (i == (num_of_restrictions-1)) && bound_inclusive;
|
||||
ret[i]->merge_with(make_single_column_restriction(bound, inclusive, neq_component_idx, bound_values[neq_component_idx]));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds and returns a set of restrictions such that the union of their ranges (the restrictions OR-ed together)
|
||||
* is logically identical to this restriction, with the additional property that it can execute
|
||||
* correctly when the clustering columns are with "mixed order" - contains ASC and DESC orderings.
|
||||
* for more information: https://github.com/scylladb/scylla/issues/2050
|
||||
* @param options - the query options
|
||||
* @return set of restrictions which their ranges union is logically identical to this restriction.
|
||||
*/
|
||||
std::vector<::shared_ptr<primary_key_restrictions<clustering_key_prefix>>>
|
||||
build_mixed_order_restriction_set(const query_options& options) const {
|
||||
std::vector<restriction_shared_ptr> ret;
|
||||
auto start_components = read_bound_components(options, statements::bound::START);
|
||||
auto end_components = read_bound_components(options, statements::bound::END);
|
||||
bool start_inclusive = is_inclusive(statements::bound::START);
|
||||
bool end_inclusive = is_inclusive(statements::bound::END);
|
||||
std::optional<std::size_t> first_neq_component = std::nullopt;
|
||||
|
||||
// find the first index of the first component that is not equal between the tuples.
|
||||
if (start_components.empty() || end_components.empty()) {
|
||||
first_neq_component = 0;
|
||||
} else {
|
||||
auto tuple_mismatch = std::mismatch(start_components.begin(), start_components.end(),
|
||||
end_components.begin(), end_components.end());
|
||||
if ((tuple_mismatch.first != start_components.end()) ||
|
||||
(tuple_mismatch.second != end_components.end())) {
|
||||
first_neq_component = std::distance(start_components.begin(), tuple_mismatch.first);
|
||||
}
|
||||
}
|
||||
|
||||
// this is either a simple equality or a never fulfilled restriction
|
||||
if (!first_neq_component && start_inclusive && end_inclusive) {
|
||||
// This is a simple equality case
|
||||
shared_ptr<cql3::term> term = ::make_shared<cql3::tuples::value>(start_components);
|
||||
ret.emplace_back(::make_shared<cql3::restrictions::multi_column_restriction::EQ>(_schema, _column_defs, term));
|
||||
return ret;
|
||||
} else if (!first_neq_component) {
|
||||
// This is a contradiction case
|
||||
return {};
|
||||
} else if ((*first_neq_component == end_components.size() && !end_inclusive ) ||
|
||||
(*first_neq_component == start_components.size() && !start_inclusive )) {
|
||||
// This is a case where one bound is a prefix of the other. If this prefix bound
|
||||
// is not inclusive the result will be an empty set.
|
||||
return {};
|
||||
}
|
||||
|
||||
bool start_components_exists = (start_components.size() - first_neq_component.value()) > 0;
|
||||
bool end_components_exists = (end_components.size() - first_neq_component.value()) > 0;
|
||||
bool both_components_exists = start_components_exists && end_components_exists;
|
||||
if (start_components_exists) {
|
||||
auto restrictions =
|
||||
make_single_bound_restrictions(statements::bound::START, start_inclusive, start_components, first_neq_component.value());
|
||||
for (auto&& r : restrictions) {
|
||||
ret.emplace_back(r);
|
||||
}
|
||||
}
|
||||
|
||||
if (end_components_exists) {
|
||||
auto restrictions =
|
||||
make_single_bound_restrictions(statements::bound::END, end_inclusive,
|
||||
end_components, first_neq_component.value() + both_components_exists);
|
||||
for (auto&& r : restrictions) {
|
||||
ret.emplace_back(r);
|
||||
}
|
||||
}
|
||||
|
||||
if (both_components_exists) {
|
||||
bool inclusive = end_inclusive && ((end_components.size() - first_neq_component.value()) == 1);
|
||||
ret[0]->merge_with(make_single_column_restriction(statements::bound::END, inclusive, first_neq_component.value(),
|
||||
end_components[first_neq_component.value()]));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@@ -88,7 +88,6 @@ public:
|
||||
|
||||
using restrictions::uses_function;
|
||||
using restrictions::has_supporting_index;
|
||||
using restrictions::values;
|
||||
|
||||
bool empty() const override {
|
||||
return get_column_defs().empty();
|
||||
@@ -96,72 +95,7 @@ public:
|
||||
uint32_t size() const override {
|
||||
return uint32_t(get_column_defs().size());
|
||||
}
|
||||
|
||||
bool has_unrestricted_components(const schema& schema) const;
|
||||
|
||||
virtual bool needs_filtering(const schema& schema) const;
|
||||
|
||||
// How long a prefix of the restrictions could have resulted in
|
||||
// need_filtering() == false. These restrictions do not need to be
|
||||
// applied during filtering.
|
||||
// For example, if we have the filter "c1 < 3 and c2 > 3", c1 does
|
||||
// not need filtering (just a read stopping at c1=3) but c2 does,
|
||||
// so num_prefix_columns_that_need_not_be_filtered() will be 1.
|
||||
virtual unsigned int num_prefix_columns_that_need_not_be_filtered() const {
|
||||
return 0;
|
||||
}
|
||||
|
||||
virtual bool is_all_eq() const {
|
||||
return false;
|
||||
}
|
||||
virtual size_t prefix_size() const {
|
||||
return 0;
|
||||
}
|
||||
|
||||
size_t prefix_size(const schema_ptr schema) const {
|
||||
return 0;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
template<>
|
||||
inline bool primary_key_restrictions<partition_key>::has_unrestricted_components(const schema& schema) const {
|
||||
return size() < schema.partition_key_size();
|
||||
}
|
||||
|
||||
template<>
|
||||
inline bool primary_key_restrictions<clustering_key>::has_unrestricted_components(const schema& schema) const {
|
||||
return size() < schema.clustering_key_size();
|
||||
}
|
||||
|
||||
template<>
|
||||
inline bool primary_key_restrictions<partition_key>::needs_filtering(const schema& schema) const {
|
||||
return !empty() && !is_on_token() && (has_unrestricted_components(schema) || is_contains() || is_slice());
|
||||
}
|
||||
|
||||
template<>
|
||||
inline bool primary_key_restrictions<clustering_key>::needs_filtering(const schema& schema) const {
|
||||
// Currently only overloaded single_column_primary_key_restrictions will require ALLOW FILTERING
|
||||
return false;
|
||||
}
|
||||
|
||||
template<>
|
||||
inline size_t primary_key_restrictions<clustering_key>::prefix_size(const schema_ptr schema) const {
|
||||
size_t count = 0;
|
||||
if (schema->clustering_key_columns().empty()) {
|
||||
return count;
|
||||
}
|
||||
auto column_defs = get_column_defs();
|
||||
column_id expected_column_id = schema->clustering_key_columns().begin()->id;
|
||||
for (auto&& cdef : column_defs) {
|
||||
if (schema->position(*cdef) != expected_column_id) {
|
||||
return count;
|
||||
}
|
||||
expected_column_id++;
|
||||
count++;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -68,10 +68,6 @@ public:
|
||||
|
||||
virtual std::vector<bytes_opt> values(const query_options& options) const = 0;
|
||||
|
||||
virtual bytes_opt value_for(const column_definition& cdef, const query_options& options) const {
|
||||
throw exceptions::invalid_request_exception("Single value can be obtained from single-column restrictions only");
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns <code>true</code> if one of the restrictions use the specified function.
|
||||
*
|
||||
|
||||
@@ -49,7 +49,6 @@
|
||||
#include <boost/algorithm/cxx11/all_of.hpp>
|
||||
#include <boost/range/adaptor/transformed.hpp>
|
||||
#include <boost/range/adaptor/filtered.hpp>
|
||||
#include <boost/range/adaptor/map.hpp>
|
||||
|
||||
namespace cql3 {
|
||||
|
||||
@@ -63,8 +62,6 @@ class single_column_primary_key_restrictions : public primary_key_restrictions<V
|
||||
using range_type = query::range<ValueType>;
|
||||
using range_bound = typename range_type::bound;
|
||||
using bounds_range_type = typename primary_key_restrictions<ValueType>::bounds_range_type;
|
||||
template<typename OtherValueType>
|
||||
friend class single_column_primary_key_restrictions;
|
||||
private:
|
||||
schema_ptr _schema;
|
||||
bool _allow_filtering;
|
||||
@@ -82,27 +79,6 @@ public:
|
||||
, _in(false)
|
||||
{ }
|
||||
|
||||
// Convert another primary key restrictions type into this type, possibly using different schema
|
||||
template<typename OtherValueType>
|
||||
explicit single_column_primary_key_restrictions(schema_ptr schema, const single_column_primary_key_restrictions<OtherValueType>& other)
|
||||
: _schema(schema)
|
||||
, _allow_filtering(other._allow_filtering)
|
||||
, _restrictions(::make_shared<single_column_restrictions>(schema))
|
||||
, _slice(other._slice)
|
||||
, _contains(other._contains)
|
||||
, _in(other._in)
|
||||
{
|
||||
for (const auto& entry : other._restrictions->restrictions()) {
|
||||
const column_definition* other_cdef = entry.first;
|
||||
const column_definition* this_cdef = _schema->get_column_definition(other_cdef->name());
|
||||
if (!this_cdef) {
|
||||
throw exceptions::invalid_request_exception(sprint("Base column %s not found in view index schema", other_cdef->name_as_text()));
|
||||
}
|
||||
::shared_ptr<single_column_restriction> restriction = entry.second;
|
||||
_restrictions->add_restriction(restriction->apply_to(*this_cdef));
|
||||
}
|
||||
}
|
||||
|
||||
virtual bool is_on_token() const override {
|
||||
return false;
|
||||
}
|
||||
@@ -123,10 +99,6 @@ public:
|
||||
return _in;
|
||||
}
|
||||
|
||||
virtual bool is_all_eq() const override {
|
||||
return _restrictions->is_all_eq();
|
||||
}
|
||||
|
||||
// True when every contained single-column restriction has bound `b`
// (and therefore also true when there are no restrictions at all).
virtual bool has_bound(statements::bound b) const override {
    for (const auto& entry : _restrictions->restrictions()) {
        if (!entry.second->has_bound(b)) {
            return false;
        }
    }
    return true;
}
|
||||
@@ -165,25 +137,6 @@ public:
|
||||
_restrictions->add_restriction(restriction);
|
||||
}
|
||||
|
||||
virtual size_t prefix_size() const override {
|
||||
return primary_key_restrictions<ValueType>::prefix_size(_schema);
|
||||
}
|
||||
|
||||
// Returns restrictions limited to the first prefix_size() entries of this
// object's restriction map. When that already covers every restriction,
// this object itself is returned (via shared_from_this()); otherwise a new
// object holding only those leading restrictions is built.
::shared_ptr<single_column_primary_key_restrictions<clustering_key>> get_longest_prefix_restrictions() {
    static_assert(std::is_same_v<ValueType, clustering_key>, "Only clustering key can produce longest prefix restrictions");
    const size_t prefix_len = prefix_size();
    if (_restrictions->restrictions().size() == prefix_len) {
        return dynamic_pointer_cast<single_column_primary_key_restrictions<clustering_key>>(this->shared_from_this());
    }

    auto truncated = ::make_shared<single_column_primary_key_restrictions<clustering_key>>(_schema, _allow_filtering);
    auto it = _restrictions->restrictions().begin();
    for (size_t remaining = prefix_len; remaining > 0; --remaining, ++it) {
        truncated->merge_with(it->second);
    }
    return truncated;
}
|
||||
|
||||
virtual void merge_with(::shared_ptr<restriction> restriction) override {
|
||||
if (restriction->is_multi_column()) {
|
||||
throw exceptions::invalid_request_exception(
|
||||
@@ -356,20 +309,11 @@ public:
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
virtual bytes_opt value_for(const column_definition& cdef, const query_options& options) const override {
|
||||
return _restrictions->value_for(cdef, options);
|
||||
}
|
||||
|
||||
std::vector<bytes_opt> bounds(statements::bound b, const query_options& options) const override {
|
||||
// TODO: if this proved to be required.
|
||||
fail(unimplemented::cause::LEGACY_COMPOSITE_KEYS); // not 100% correct...
|
||||
}
|
||||
|
||||
const single_column_restrictions::restrictions_map& restrictions() const {
|
||||
return _restrictions->restrictions();
|
||||
}
|
||||
|
||||
virtual bool has_supporting_index(const secondary_index::secondary_index_manager& index_manager) const override {
|
||||
return _restrictions->has_supporting_index(index_manager);
|
||||
}
|
||||
@@ -405,13 +349,10 @@ public:
|
||||
_restrictions->restrictions() | boost::adaptors::map_values,
|
||||
[&] (auto&& r) { return r->is_satisfied_by(schema, key, ckey, cells, options, now); });
|
||||
}
|
||||
|
||||
virtual bool needs_filtering(const schema& schema) const override;
|
||||
virtual unsigned int num_prefix_columns_that_need_not_be_filtered() const override;
|
||||
};
|
||||
|
||||
template<>
|
||||
inline dht::partition_range_vector
|
||||
dht::partition_range_vector
|
||||
single_column_primary_key_restrictions<partition_key>::bounds_ranges(const query_options& options) const {
|
||||
dht::partition_range_vector ranges;
|
||||
ranges.reserve(size());
|
||||
@@ -429,7 +370,7 @@ single_column_primary_key_restrictions<partition_key>::bounds_ranges(const query
|
||||
}
|
||||
|
||||
template<>
|
||||
inline std::vector<query::clustering_range>
|
||||
std::vector<query::clustering_range>
|
||||
single_column_primary_key_restrictions<clustering_key_prefix>::bounds_ranges(const query_options& options) const {
|
||||
auto wrapping_bounds = compute_bounds(options);
|
||||
auto bounds = boost::copy_range<query::clustering_row_ranges>(wrapping_bounds
|
||||
@@ -465,62 +406,6 @@ single_column_primary_key_restrictions<clustering_key_prefix>::bounds_ranges(con
|
||||
return bounds;
|
||||
}
|
||||
|
||||
template<>
|
||||
inline bool single_column_primary_key_restrictions<partition_key>::needs_filtering(const schema& schema) const {
|
||||
return primary_key_restrictions<partition_key>::needs_filtering(schema);
|
||||
}
|
||||
|
||||
template<>
|
||||
inline bool single_column_primary_key_restrictions<clustering_key>::needs_filtering(const schema& schema) const {
|
||||
// Restrictions currently need filtering in three cases:
|
||||
// 1. any of them is a CONTAINS restriction
|
||||
// 2. restrictions do not form a contiguous prefix (i.e. there are gaps in it)
|
||||
// 3. a SLICE restriction isn't on a last place
|
||||
column_id position = 0;
|
||||
for (const auto& restriction : _restrictions->restrictions() | boost::adaptors::map_values) {
|
||||
if (restriction->is_contains() || position != restriction->get_column_def().id) {
|
||||
return true;
|
||||
}
|
||||
if (!restriction->is_slice()) {
|
||||
position = restriction->get_column_def().id + 1;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// How many of the restrictions (in column order) do not need filtering
|
||||
// because they are implemented as a slice (potentially, a contiguous disk
|
||||
// read). For example, if we have the filter "c1 < 3 and c2 > 3", c1 does not
|
||||
// need filtering but c2 does so num_prefix_columns_that_need_not_be_filtered
|
||||
// will be 1.
|
||||
// The implementation of num_prefix_columns_that_need_not_be_filtered() is
|
||||
// closely tied to that of needs_filtering() above - basically, if only the
|
||||
// first num_prefix_columns_that_need_not_be_filtered() restrictions existed,
|
||||
// then needs_filtering() would have returned false.
|
||||
template<>
|
||||
inline unsigned single_column_primary_key_restrictions<clustering_key>::num_prefix_columns_that_need_not_be_filtered() const {
|
||||
column_id position = 0;
|
||||
unsigned int count = 0;
|
||||
for (const auto& restriction : _restrictions->restrictions() | boost::adaptors::map_values) {
|
||||
if (restriction->is_contains() || position != restriction->get_column_def().id) {
|
||||
return count;
|
||||
}
|
||||
if (!restriction->is_slice()) {
|
||||
position = restriction->get_column_def().id + 1;
|
||||
}
|
||||
count++;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
template<>
|
||||
inline unsigned single_column_primary_key_restrictions<partition_key>::num_prefix_columns_that_need_not_be_filtered() const {
|
||||
// skip_filtering() is currently called only for clustering key
|
||||
// restrictions, so it doesn't matter what we return here.
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -93,9 +93,6 @@ public:
|
||||
}
|
||||
|
||||
virtual bool is_supported_by(const secondary_index::index& index) const = 0;
|
||||
using abstract_restriction::is_satisfied_by;
|
||||
virtual bool is_satisfied_by(bytes_view data, const query_options& options) const = 0;
|
||||
virtual ::shared_ptr<single_column_restriction> apply_to(const column_definition& cdef) = 0;
|
||||
#if 0
|
||||
/**
|
||||
* Check if this type of restriction is supported by the specified index.
|
||||
@@ -169,10 +166,6 @@ public:
|
||||
const row& cells,
|
||||
const query_options& options,
|
||||
gc_clock::time_point now) const override;
|
||||
virtual bool is_satisfied_by(bytes_view data, const query_options& options) const override;
|
||||
virtual ::shared_ptr<single_column_restriction> apply_to(const column_definition& cdef) override {
|
||||
return ::make_shared<EQ>(cdef, _value);
|
||||
}
|
||||
|
||||
#if 0
|
||||
@Override
|
||||
@@ -208,10 +201,6 @@ public:
|
||||
const row& cells,
|
||||
const query_options& options,
|
||||
gc_clock::time_point now) const override;
|
||||
virtual bool is_satisfied_by(bytes_view data, const query_options& options) const override;
|
||||
virtual ::shared_ptr<single_column_restriction> apply_to(const column_definition& cdef) override {
|
||||
throw std::logic_error("IN superclass should never be cloned directly");
|
||||
}
|
||||
|
||||
virtual std::vector<bytes_opt> values_raw(const query_options& options) const = 0;
|
||||
|
||||
@@ -254,10 +243,6 @@ public:
|
||||
virtual sstring to_string() const override {
|
||||
return sprint("IN(%s)", std::to_string(_values));
|
||||
}
|
||||
|
||||
virtual ::shared_ptr<single_column_restriction> apply_to(const column_definition& cdef) override {
|
||||
return ::make_shared<IN_with_values>(cdef, _values);
|
||||
}
|
||||
};
|
||||
|
||||
class single_column_restriction::IN_with_marker : public IN {
|
||||
@@ -283,10 +268,6 @@ public:
|
||||
virtual sstring to_string() const override {
|
||||
return "IN ?";
|
||||
}
|
||||
|
||||
virtual ::shared_ptr<single_column_restriction> apply_to(const column_definition& cdef) override {
|
||||
return ::make_shared<IN_with_marker>(cdef, _marker);
|
||||
}
|
||||
};
|
||||
|
||||
class single_column_restriction::slice : public single_column_restriction {
|
||||
@@ -298,11 +279,6 @@ public:
|
||||
, _slice(term_slice::new_instance(bound, inclusive, std::move(term)))
|
||||
{ }
|
||||
|
||||
slice(const column_definition& column_def, term_slice slice)
|
||||
: single_column_restriction(column_def)
|
||||
, _slice(slice)
|
||||
{ }
|
||||
|
||||
virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const override {
|
||||
return (_slice.has_bound(statements::bound::START) && abstract_restriction::term_uses_function(_slice.bound(statements::bound::START), ks_name, function_name))
|
||||
|| (_slice.has_bound(statements::bound::END) && abstract_restriction::term_uses_function(_slice.bound(statements::bound::END), ks_name, function_name));
|
||||
@@ -388,10 +364,6 @@ public:
|
||||
const row& cells,
|
||||
const query_options& options,
|
||||
gc_clock::time_point now) const override;
|
||||
virtual bool is_satisfied_by(bytes_view data, const query_options& options) const override;
|
||||
virtual ::shared_ptr<single_column_restriction> apply_to(const column_definition& cdef) override {
|
||||
return ::make_shared<slice>(cdef, _slice);
|
||||
}
|
||||
};
|
||||
|
||||
// This holds CONTAINS, CONTAINS_KEY, and map[key] = value restrictions because we might want to have any combination of them.
|
||||
@@ -513,10 +485,6 @@ public:
|
||||
const row& cells,
|
||||
const query_options& options,
|
||||
gc_clock::time_point now) const override;
|
||||
virtual bool is_satisfied_by(bytes_view data, const query_options& options) const override;
|
||||
virtual ::shared_ptr<single_column_restriction> apply_to(const column_definition& cdef) override {
|
||||
throw std::logic_error("Cloning 'contains' restriction is not implemented.");
|
||||
}
|
||||
|
||||
#if 0
|
||||
private List<ByteBuffer> keys(const query_options& options) {
|
||||
|
||||
@@ -111,11 +111,6 @@ public:
|
||||
return r;
|
||||
}
|
||||
|
||||
virtual bytes_opt value_for(const column_definition& cdef, const query_options& options) const override {
|
||||
auto it = _restrictions.find(std::addressof(cdef));
|
||||
return (it != _restrictions.end()) ? it->second->value(options) : bytes_opt{};
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the restriction associated to the specified column.
|
||||
*
|
||||
|
||||
@@ -23,7 +23,6 @@
|
||||
#include <boost/range/algorithm/transform.hpp>
|
||||
#include <boost/range/algorithm.hpp>
|
||||
#include <boost/range/adaptors.hpp>
|
||||
#include <boost/algorithm/cxx11/any_of.hpp>
|
||||
|
||||
#include "statement_restrictions.hh"
|
||||
#include "single_column_primary_key_restrictions.hh"
|
||||
@@ -37,8 +36,6 @@
|
||||
namespace cql3 {
|
||||
namespace restrictions {
|
||||
|
||||
static logging::logger rlogger("restrictions");
|
||||
|
||||
using boost::adaptors::filtered;
|
||||
using boost::adaptors::transformed;
|
||||
|
||||
@@ -72,9 +69,6 @@ public:
|
||||
// throw? should not reach?
|
||||
return {};
|
||||
}
|
||||
bytes_opt value_for(const column_definition& cdef, const query_options& options) const override {
|
||||
return {};
|
||||
}
|
||||
std::vector<T> values_as_keys(const query_options& options) const override {
|
||||
// throw? should not reach?
|
||||
return {};
|
||||
@@ -208,22 +202,23 @@ statement_restrictions::statement_restrictions(database& db,
|
||||
throw exceptions::invalid_request_exception(sprint("restriction '%s' is only supported in materialized view creation", relation->to_string()));
|
||||
}
|
||||
} else {
|
||||
add_restriction(relation->to_restriction(db, schema, bound_names), for_view, allow_filtering);
|
||||
add_restriction(relation->to_restriction(db, schema, bound_names));
|
||||
}
|
||||
}
|
||||
}
|
||||
auto& cf = db.find_column_family(schema);
|
||||
auto& sim = cf.get_index_manager();
|
||||
const bool has_queriable_clustering_column_index = _clustering_columns_restrictions->has_supporting_index(sim);
|
||||
const bool has_queriable_pk_index = _partition_key_restrictions->has_supporting_index(sim);
|
||||
const bool has_queriable_regular_index = _nonprimary_key_restrictions->has_supporting_index(sim);
|
||||
bool has_queriable_clustering_column_index = _clustering_columns_restrictions->has_supporting_index(sim);
|
||||
bool has_queriable_index = has_queriable_clustering_column_index
|
||||
|| _partition_key_restrictions->has_supporting_index(sim)
|
||||
|| _nonprimary_key_restrictions->has_supporting_index(sim);
|
||||
|
||||
// At this point, the select statement is fully constructed, but we still have a few things to validate
|
||||
process_partition_key_restrictions(has_queriable_pk_index, for_view, allow_filtering);
|
||||
process_partition_key_restrictions(has_queriable_index, for_view);
|
||||
|
||||
// Some but not all of the partition key columns have been specified;
|
||||
// hence we need to turn these restrictions into index expressions.
|
||||
if (_uses_secondary_indexing || _partition_key_restrictions->needs_filtering(*_schema)) {
|
||||
if (_uses_secondary_indexing) {
|
||||
_index_restrictions.push_back(_partition_key_restrictions);
|
||||
}
|
||||
|
||||
@@ -239,14 +234,13 @@ statement_restrictions::statement_restrictions(database& db,
|
||||
}
|
||||
}
|
||||
|
||||
process_clustering_columns_restrictions(has_queriable_clustering_column_index, select_a_collection, for_view, allow_filtering);
|
||||
process_clustering_columns_restrictions(has_queriable_index, select_a_collection, for_view);
|
||||
|
||||
// Covers indexes on the first clustering column (among others).
|
||||
if (_is_key_range && has_queriable_clustering_column_index) {
|
||||
_uses_secondary_indexing = true;
|
||||
}
|
||||
if (_is_key_range && has_queriable_clustering_column_index)
|
||||
_uses_secondary_indexing = true;
|
||||
|
||||
if (_uses_secondary_indexing || _clustering_columns_restrictions->needs_filtering(*_schema)) {
|
||||
if (_uses_secondary_indexing) {
|
||||
_index_restrictions.push_back(_clustering_columns_restrictions);
|
||||
} else if (_clustering_columns_restrictions->is_contains()) {
|
||||
fail(unimplemented::cause::INDEXES);
|
||||
@@ -275,48 +269,31 @@ statement_restrictions::statement_restrictions(database& db,
|
||||
uses_secondary_indexing = true;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Even if uses_secondary_indexing is false at this point, we'll still have to use one if
|
||||
// there are restrictions not covered by the PK.
|
||||
if (!_nonprimary_key_restrictions->empty()) {
|
||||
if (has_queriable_regular_index) {
|
||||
_uses_secondary_indexing = true;
|
||||
} else if (!allow_filtering) {
|
||||
throw exceptions::invalid_request_exception("Cannot execute this query as it might involve data filtering and "
|
||||
"thus may have unpredictable performance. If you want to execute "
|
||||
"this query despite the performance unpredictability, use ALLOW FILTERING");
|
||||
}
|
||||
_uses_secondary_indexing = true;
|
||||
_index_restrictions.push_back(_nonprimary_key_restrictions);
|
||||
}
|
||||
|
||||
if (_uses_secondary_indexing && !(for_view || allow_filtering)) {
|
||||
if (_uses_secondary_indexing && !for_view) {
|
||||
validate_secondary_index_selections(selects_only_static_columns);
|
||||
}
|
||||
}
|
||||
|
||||
void statement_restrictions::add_restriction(::shared_ptr<restriction> restriction, bool for_view, bool allow_filtering) {
|
||||
void statement_restrictions::add_restriction(::shared_ptr<restriction> restriction) {
|
||||
if (restriction->is_multi_column()) {
|
||||
_clustering_columns_restrictions = _clustering_columns_restrictions->merge_to(_schema, restriction);
|
||||
} else if (restriction->is_on_token()) {
|
||||
_partition_key_restrictions = _partition_key_restrictions->merge_to(_schema, restriction);
|
||||
} else {
|
||||
add_single_column_restriction(::static_pointer_cast<single_column_restriction>(restriction), for_view, allow_filtering);
|
||||
add_single_column_restriction(::static_pointer_cast<single_column_restriction>(restriction));
|
||||
}
|
||||
}
|
||||
|
||||
void statement_restrictions::add_single_column_restriction(::shared_ptr<single_column_restriction> restriction, bool for_view, bool allow_filtering) {
|
||||
void statement_restrictions::add_single_column_restriction(::shared_ptr<single_column_restriction> restriction) {
|
||||
auto& def = restriction->get_column_def();
|
||||
if (def.is_partition_key()) {
|
||||
// A SELECT query may not request a slice (range) of partition keys
|
||||
// without using token(). This is because there is no way to do this
|
||||
// query efficiently: murmur3 turns a contiguous range of partition
|
||||
// keys into tokens all over the token space.
|
||||
// However, in a SELECT statement used to define a materialized view,
|
||||
// such a slice is fine - it is used to check whether individual
|
||||
// partitions match, and does not present a performance problem.
|
||||
assert(!restriction->is_on_token());
|
||||
if (restriction->is_slice() && !for_view && !allow_filtering) {
|
||||
throw exceptions::invalid_request_exception(
|
||||
"Only EQ and IN relation are supported on the partition key (unless you use the token() function or allow filtering)");
|
||||
}
|
||||
_partition_key_restrictions = _partition_key_restrictions->merge_to(_schema, restriction);
|
||||
} else if (def.is_clustering_key()) {
|
||||
_clustering_columns_restrictions = _clustering_columns_restrictions->merge_to(_schema, restriction);
|
||||
@@ -335,54 +312,7 @@ const std::vector<::shared_ptr<restrictions>>& statement_restrictions::index_res
|
||||
return _index_restrictions;
|
||||
}
|
||||
|
||||
std::optional<secondary_index::index> statement_restrictions::find_idx(secondary_index::secondary_index_manager& sim) const {
|
||||
for (::shared_ptr<cql3::restrictions::restrictions> restriction : index_restrictions()) {
|
||||
for (const auto& cdef : restriction->get_column_defs()) {
|
||||
for (auto index : sim.list_indexes()) {
|
||||
if (index.depends_on(*cdef)) {
|
||||
return std::make_optional<secondary_index::index>(std::move(index));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
std::vector<const column_definition*> statement_restrictions::get_column_defs_for_filtering(database& db) const {
|
||||
std::vector<const column_definition*> column_defs_for_filtering;
|
||||
if (need_filtering()) {
|
||||
auto& sim = db.find_column_family(_schema).get_index_manager();
|
||||
std::optional<secondary_index::index> opt_idx = find_idx(sim);
|
||||
auto column_uses_indexing = [&opt_idx] (const column_definition* cdef) {
|
||||
return opt_idx && opt_idx->depends_on(*cdef);
|
||||
};
|
||||
if (_partition_key_restrictions->needs_filtering(*_schema)) {
|
||||
for (auto&& cdef : _partition_key_restrictions->get_column_defs()) {
|
||||
if (!column_uses_indexing(cdef)) {
|
||||
column_defs_for_filtering.emplace_back(cdef);
|
||||
}
|
||||
}
|
||||
}
|
||||
const bool pk_has_unrestricted_components = _partition_key_restrictions->has_unrestricted_components(*_schema);
|
||||
if (pk_has_unrestricted_components || _clustering_columns_restrictions->needs_filtering(*_schema)) {
|
||||
column_id first_filtering_id = pk_has_unrestricted_components ? 0 : _schema->clustering_key_columns().begin()->id +
|
||||
_clustering_columns_restrictions->num_prefix_columns_that_need_not_be_filtered();
|
||||
for (auto&& cdef : _clustering_columns_restrictions->get_column_defs()) {
|
||||
if (cdef->id >= first_filtering_id && !column_uses_indexing(cdef)) {
|
||||
column_defs_for_filtering.emplace_back(cdef);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (auto&& cdef : _nonprimary_key_restrictions->get_column_defs()) {
|
||||
if (!column_uses_indexing(cdef)) {
|
||||
column_defs_for_filtering.emplace_back(cdef);
|
||||
}
|
||||
}
|
||||
}
|
||||
return column_defs_for_filtering;
|
||||
}
|
||||
|
||||
void statement_restrictions::process_partition_key_restrictions(bool has_queriable_index, bool for_view, bool allow_filtering) {
|
||||
void statement_restrictions::process_partition_key_restrictions(bool has_queriable_index, bool for_view) {
|
||||
// If there is a queriable index, no special conditions are required on the other restrictions.
|
||||
// But we still need to know 2 things:
|
||||
// - If we don't have a queriable index, is the query ok
|
||||
@@ -391,32 +321,39 @@ void statement_restrictions::process_partition_key_restrictions(bool has_queriab
|
||||
// components must have an EQ. Only the last partition key component can be in an IN relation.
|
||||
if (_partition_key_restrictions->is_on_token()) {
|
||||
_is_key_range = true;
|
||||
} else if (_partition_key_restrictions->has_unrestricted_components(*_schema)) {
|
||||
_is_key_range = true;
|
||||
_uses_secondary_indexing = has_queriable_index;
|
||||
}
|
||||
|
||||
if (_partition_key_restrictions->needs_filtering(*_schema)) {
|
||||
if (!allow_filtering && !for_view && !has_queriable_index) {
|
||||
throw exceptions::invalid_request_exception("Cannot execute this query as it might involve data filtering and "
|
||||
"thus may have unpredictable performance. If you want to execute "
|
||||
"this query despite the performance unpredictability, use ALLOW FILTERING");
|
||||
} else if (has_partition_key_unrestricted_components()) {
|
||||
if (!_partition_key_restrictions->empty() && !for_view) {
|
||||
if (!has_queriable_index) {
|
||||
throw exceptions::invalid_request_exception(sprint("Partition key parts: %s must be restricted as other parts are",
|
||||
join(", ", get_partition_key_unrestricted_components())));
|
||||
}
|
||||
}
|
||||
|
||||
_is_key_range = true;
|
||||
_uses_secondary_indexing = has_queriable_index;
|
||||
}
|
||||
|
||||
if (_partition_key_restrictions->is_slice() && !_partition_key_restrictions->is_on_token() && !for_view) {
|
||||
// A SELECT query may not request a slice (range) of partition keys
|
||||
// without using token(). This is because there is no way to do this
|
||||
// query efficiently: murmur3 turns a contiguous range of partition
|
||||
// keys into tokens all over the token space.
|
||||
// However, in a SELECT statement used to define a materialized view,
|
||||
// such a slice is fine - it is used to check whether individual
|
||||
// partitions match, and does not present a performance problem.
|
||||
throw exceptions::invalid_request_exception(
|
||||
"Only EQ and IN relation are supported on the partition key (unless you use the token() function)");
|
||||
}
|
||||
}
|
||||
|
||||
bool statement_restrictions::has_partition_key_unrestricted_components() const {
|
||||
return _partition_key_restrictions->has_unrestricted_components(*_schema);
|
||||
return _partition_key_restrictions->size() < _schema->partition_key_size();
|
||||
}
|
||||
|
||||
bool statement_restrictions::has_unrestricted_clustering_columns() const {
|
||||
return _clustering_columns_restrictions->has_unrestricted_components(*_schema);
|
||||
return _clustering_columns_restrictions->size() < _schema->clustering_key_size();
|
||||
}
|
||||
|
||||
void statement_restrictions::process_clustering_columns_restrictions(bool has_queriable_index, bool select_a_collection, bool for_view, bool allow_filtering) {
|
||||
void statement_restrictions::process_clustering_columns_restrictions(bool has_queriable_index, bool select_a_collection, bool for_view) {
|
||||
if (!has_clustering_columns_restriction()) {
|
||||
return;
|
||||
}
|
||||
@@ -425,36 +362,38 @@ void statement_restrictions::process_clustering_columns_restrictions(bool has_qu
|
||||
throw exceptions::invalid_request_exception(
|
||||
"Cannot restrict clustering columns by IN relations when a collection is selected by the query");
|
||||
}
|
||||
if (_clustering_columns_restrictions->is_contains() && !has_queriable_index && !allow_filtering) {
|
||||
if (_clustering_columns_restrictions->is_contains() && !has_queriable_index) {
|
||||
throw exceptions::invalid_request_exception(
|
||||
"Cannot restrict clustering columns by a CONTAINS relation without a secondary index or filtering");
|
||||
"Cannot restrict clustering columns by a CONTAINS relation without a secondary index");
|
||||
}
|
||||
|
||||
if (has_clustering_columns_restriction() && _clustering_columns_restrictions->needs_filtering(*_schema)) {
|
||||
if (has_queriable_index) {
|
||||
_uses_secondary_indexing = true;
|
||||
} else if (!allow_filtering && !for_view) {
|
||||
auto clustering_columns_iter = _schema->clustering_key_columns().begin();
|
||||
for (auto&& restricted_column : _clustering_columns_restrictions->get_column_defs()) {
|
||||
const column_definition* clustering_column = &(*clustering_columns_iter);
|
||||
++clustering_columns_iter;
|
||||
if (clustering_column != restricted_column) {
|
||||
throw exceptions::invalid_request_exception(sprint(
|
||||
"PRIMARY KEY column \"%s\" cannot be restricted as preceding column \"%s\" is not restricted",
|
||||
restricted_column->name_as_text(), clustering_column->name_as_text()));
|
||||
}
|
||||
auto clustering_columns_iter = _schema->clustering_key_columns().begin();
|
||||
|
||||
for (auto&& restricted_column : _clustering_columns_restrictions->get_column_defs()) {
|
||||
const column_definition* clustering_column = &(*clustering_columns_iter);
|
||||
++clustering_columns_iter;
|
||||
|
||||
if (clustering_column != restricted_column && !for_view) {
|
||||
if (!has_queriable_index) {
|
||||
throw exceptions::invalid_request_exception(sprint(
|
||||
"PRIMARY KEY column \"%s\" cannot be restricted as preceding column \"%s\" is not restricted",
|
||||
restricted_column->name_as_text(), clustering_column->name_as_text()));
|
||||
}
|
||||
|
||||
_uses_secondary_indexing = true; // handle gaps and non-keyrange cases.
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (_clustering_columns_restrictions->is_contains()) {
|
||||
_uses_secondary_indexing = true;
|
||||
}
|
||||
}
|
||||
|
||||
dht::partition_range_vector statement_restrictions::get_partition_key_ranges(const query_options& options) const {
|
||||
if (_partition_key_restrictions->empty()) {
|
||||
return {dht::partition_range::make_open_ended_both_sides()};
|
||||
}
|
||||
if (_partition_key_restrictions->needs_filtering(*_schema)) {
|
||||
return {dht::partition_range::make_open_ended_both_sides()};
|
||||
}
|
||||
return _partition_key_restrictions->bounds_ranges(options);
|
||||
}
|
||||
|
||||
@@ -462,40 +401,18 @@ std::vector<query::clustering_range> statement_restrictions::get_clustering_boun
|
||||
if (_clustering_columns_restrictions->empty()) {
|
||||
return {query::clustering_range::make_open_ended_both_sides()};
|
||||
}
|
||||
if (_clustering_columns_restrictions->needs_filtering(*_schema)) {
|
||||
if (auto single_ck_restrictions = dynamic_pointer_cast<single_column_primary_key_restrictions<clustering_key>>(_clustering_columns_restrictions)) {
|
||||
return single_ck_restrictions->get_longest_prefix_restrictions()->bounds_ranges(options);
|
||||
}
|
||||
return {query::clustering_range::make_open_ended_both_sides()};
|
||||
}
|
||||
return _clustering_columns_restrictions->bounds_ranges(options);
|
||||
}
|
||||
|
||||
bool statement_restrictions::need_filtering() const {
|
||||
uint32_t number_of_restricted_columns_for_indexing = 0;
|
||||
bool statement_restrictions::need_filtering() {
|
||||
uint32_t number_of_restricted_columns = 0;
|
||||
for (auto&& restrictions : _index_restrictions) {
|
||||
number_of_restricted_columns_for_indexing += restrictions->size();
|
||||
number_of_restricted_columns += restrictions->size();
|
||||
}
|
||||
|
||||
int number_of_filtering_restrictions = _nonprimary_key_restrictions->size();
|
||||
// If the whole partition key is restricted, it does not imply filtering
|
||||
if (_partition_key_restrictions->has_unrestricted_components(*_schema) || !_partition_key_restrictions->is_all_eq()) {
|
||||
number_of_filtering_restrictions += _partition_key_restrictions->size() + _clustering_columns_restrictions->size();
|
||||
} else if (_clustering_columns_restrictions->has_unrestricted_components(*_schema)) {
|
||||
number_of_filtering_restrictions += _clustering_columns_restrictions->size() - _clustering_columns_restrictions->prefix_size();
|
||||
}
|
||||
|
||||
if (_partition_key_restrictions->is_multi_column() || _clustering_columns_restrictions->is_multi_column()) {
|
||||
// TODO(sarna): Implement ALLOW FILTERING support for multi-column restrictions - return false for now
|
||||
// in order to ensure backwards compatibility
|
||||
return false;
|
||||
}
|
||||
|
||||
return number_of_restricted_columns_for_indexing > 1
|
||||
|| (number_of_restricted_columns_for_indexing == 0 && _partition_key_restrictions->empty() && !_clustering_columns_restrictions->empty())
|
||||
|| (number_of_restricted_columns_for_indexing != 0 && _nonprimary_key_restrictions->has_multiple_contains())
|
||||
|| (number_of_restricted_columns_for_indexing != 0 && !_uses_secondary_indexing)
|
||||
|| (_uses_secondary_indexing && number_of_filtering_restrictions > 1);
|
||||
return number_of_restricted_columns > 1
|
||||
|| (number_of_restricted_columns == 0 && has_clustering_columns_restriction())
|
||||
|| (number_of_restricted_columns != 0 && _nonprimary_key_restrictions->has_multiple_contains());
|
||||
}
|
||||
|
||||
void statement_restrictions::validate_secondary_index_selections(bool selects_only_static_columns) {
|
||||
@@ -513,33 +430,6 @@ void statement_restrictions::validate_secondary_index_selections(bool selects_on
|
||||
}
|
||||
}
|
||||
|
||||
const single_column_restrictions::restrictions_map& statement_restrictions::get_single_column_partition_key_restrictions() const {
|
||||
static single_column_restrictions::restrictions_map empty;
|
||||
auto single_restrictions = dynamic_pointer_cast<single_column_primary_key_restrictions<partition_key>>(_partition_key_restrictions);
|
||||
if (!single_restrictions) {
|
||||
if (dynamic_pointer_cast<initial_key_restrictions<partition_key>>(_partition_key_restrictions)) {
|
||||
return empty;
|
||||
}
|
||||
throw std::runtime_error("statement restrictions for multi-column partition key restrictions are not implemented yet");
|
||||
}
|
||||
return single_restrictions->restrictions();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return clustering key restrictions split into single column restrictions (e.g. for filtering support).
|
||||
*/
|
||||
const single_column_restrictions::restrictions_map& statement_restrictions::get_single_column_clustering_key_restrictions() const {
|
||||
static single_column_restrictions::restrictions_map empty;
|
||||
auto single_restrictions = dynamic_pointer_cast<single_column_primary_key_restrictions<clustering_key>>(_clustering_columns_restrictions);
|
||||
if (!single_restrictions) {
|
||||
if (dynamic_pointer_cast<initial_key_restrictions<clustering_key>>(_clustering_columns_restrictions)) {
|
||||
return empty;
|
||||
}
|
||||
throw std::runtime_error("statement restrictions for multi-column partition key restrictions are not implemented yet");
|
||||
}
|
||||
return single_restrictions->restrictions();
|
||||
}
|
||||
|
||||
static std::optional<atomic_cell_value_view> do_get_value(const schema& schema,
|
||||
const column_definition& cdef,
|
||||
const partition_key& key,
|
||||
@@ -592,14 +482,6 @@ bool single_column_restriction::EQ::is_satisfied_by(const schema& schema,
|
||||
return false;
|
||||
}
|
||||
|
||||
bool single_column_restriction::EQ::is_satisfied_by(bytes_view data, const query_options& options) const {
|
||||
if (_column_def.type->is_counter()) {
|
||||
fail(unimplemented::cause::COUNTERS);
|
||||
}
|
||||
auto operand = value(options);
|
||||
return operand && _column_def.type->compare(*operand, data) == 0;
|
||||
}
|
||||
|
||||
bool single_column_restriction::IN::is_satisfied_by(const schema& schema,
|
||||
const partition_key& key,
|
||||
const clustering_key_prefix& ckey,
|
||||
@@ -621,16 +503,6 @@ bool single_column_restriction::IN::is_satisfied_by(const schema& schema,
|
||||
});
|
||||
}
|
||||
|
||||
bool single_column_restriction::IN::is_satisfied_by(bytes_view data, const query_options& options) const {
|
||||
if (_column_def.type->is_counter()) {
|
||||
fail(unimplemented::cause::COUNTERS);
|
||||
}
|
||||
auto operands = values(options);
|
||||
return boost::algorithm::any_of(operands, [this, &data] (const bytes_opt& operand) {
|
||||
return operand && _column_def.type->compare(*operand, data) == 0;
|
||||
});
|
||||
}
|
||||
|
||||
static query::range<bytes_view> to_range(const term_slice& slice, const query_options& options) {
|
||||
using range_type = query::range<bytes_view>;
|
||||
auto extract_bound = [&] (statements::bound bound) -> stdx::optional<range_type::bound> {
|
||||
@@ -641,8 +513,7 @@ static query::range<bytes_view> to_range(const term_slice& slice, const query_op
|
||||
if (!value) {
|
||||
return { };
|
||||
}
|
||||
auto value_view = options.linearize(*value);
|
||||
return { range_type::bound(value_view, slice.is_inclusive(bound)) };
|
||||
return { range_type::bound(*value, slice.is_inclusive(bound)) };
|
||||
};
|
||||
return range_type(
|
||||
extract_bound(statements::bound::START),
|
||||
@@ -667,13 +538,6 @@ bool single_column_restriction::slice::is_satisfied_by(const schema& schema,
|
||||
});
|
||||
}
|
||||
|
||||
bool single_column_restriction::slice::is_satisfied_by(bytes_view data, const query_options& options) const {
|
||||
if (_column_def.type->is_counter()) {
|
||||
fail(unimplemented::cause::COUNTERS);
|
||||
}
|
||||
return to_range(_slice, options).contains(data, _column_def.type->underlying_type()->as_tri_comparator());
|
||||
}
|
||||
|
||||
bool single_column_restriction::contains::is_satisfied_by(const schema& schema,
|
||||
const partition_key& key,
|
||||
const clustering_key_prefix& ckey,
|
||||
@@ -707,12 +571,10 @@ bool single_column_restriction::contains::is_satisfied_by(const schema& schema,
|
||||
if (!val) {
|
||||
continue;
|
||||
}
|
||||
auto found = with_linearized(*val, [&] (bytes_view bv) {
|
||||
return std::find_if(elements.begin(), end, [&] (auto&& element) {
|
||||
auto found = std::find_if(elements.begin(), end, [&] (auto&& element) {
|
||||
return element.second.value().with_linearized([&] (bytes_view value_bv) {
|
||||
return element_type->compare(value_bv, bv) == 0;
|
||||
return element_type->compare(value_bv, *val) == 0;
|
||||
});
|
||||
});
|
||||
});
|
||||
if (found == end) {
|
||||
return false;
|
||||
@@ -723,10 +585,8 @@ bool single_column_restriction::contains::is_satisfied_by(const schema& schema,
|
||||
if (!k) {
|
||||
continue;
|
||||
}
|
||||
auto found = with_linearized(*k, [&] (bytes_view bv) {
|
||||
return std::find_if(elements.begin(), end, [&] (auto&& element) {
|
||||
return map_key_type->compare(element.first, bv) == 0;
|
||||
});
|
||||
auto found = std::find_if(elements.begin(), end, [&] (auto&& element) {
|
||||
return map_key_type->compare(element.first, *k) == 0;
|
||||
});
|
||||
if (found == end) {
|
||||
return false;
|
||||
@@ -738,18 +598,14 @@ bool single_column_restriction::contains::is_satisfied_by(const schema& schema,
|
||||
if (!map_key || !map_value) {
|
||||
continue;
|
||||
}
|
||||
auto found = with_linearized(*map_key, [&] (bytes_view map_key_bv) {
|
||||
return std::find_if(elements.begin(), end, [&] (auto&& element) {
|
||||
return map_key_type->compare(element.first, map_key_bv) == 0;
|
||||
});
|
||||
auto found = std::find_if(elements.begin(), end, [&] (auto&& element) {
|
||||
return map_key_type->compare(element.first, *map_key) == 0;
|
||||
});
|
||||
if (found == end) {
|
||||
return false;
|
||||
}
|
||||
auto cmp = with_linearized(*map_value, [&] (bytes_view map_value_bv) {
|
||||
return found->second.value().with_linearized([&] (bytes_view value_bv) {
|
||||
return element_type->compare(value_bv, map_value_bv);
|
||||
});
|
||||
auto cmp = found->second.value().with_linearized([&] (bytes_view value_bv) {
|
||||
return element_type->compare(value_bv, *map_value);
|
||||
});
|
||||
if (cmp != 0) {
|
||||
return false;
|
||||
@@ -766,14 +622,13 @@ bool single_column_restriction::contains::is_satisfied_by(const schema& schema,
|
||||
return _column_def.type->deserialize(cell_value_bv);
|
||||
});
|
||||
for (auto&& value : _values) {
|
||||
auto fragmented_val = value->bind_and_get(options);
|
||||
if (!fragmented_val) {
|
||||
auto val = value->bind_and_get(options);
|
||||
if (!val) {
|
||||
continue;
|
||||
}
|
||||
return with_linearized(*fragmented_val, [&] (bytes_view val) {
|
||||
auto exists_in = [&](auto&& range) {
|
||||
auto found = std::find_if(range.begin(), range.end(), [&] (auto&& element) {
|
||||
return element_type->compare(element.serialize(), val) == 0;
|
||||
return element_type->compare(element.serialize(), *val) == 0;
|
||||
});
|
||||
return found != range.end();
|
||||
};
|
||||
@@ -791,8 +646,6 @@ bool single_column_restriction::contains::is_satisfied_by(const schema& schema,
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
});
|
||||
}
|
||||
if (col_type->is_map()) {
|
||||
auto& data_map = value_cast<map_type_impl::native_type>(deserialized);
|
||||
@@ -801,10 +654,8 @@ bool single_column_restriction::contains::is_satisfied_by(const schema& schema,
|
||||
if (!k) {
|
||||
continue;
|
||||
}
|
||||
auto found = with_linearized(*k, [&] (bytes_view k_bv) {
|
||||
return std::find_if(data_map.begin(), data_map.end(), [&] (auto&& element) {
|
||||
return map_key_type->compare(element.first.serialize(), k_bv) == 0;
|
||||
});
|
||||
auto found = std::find_if(data_map.begin(), data_map.end(), [&] (auto&& element) {
|
||||
return map_key_type->compare(element.first.serialize(), *k) == 0;
|
||||
});
|
||||
if (found == data_map.end()) {
|
||||
return false;
|
||||
@@ -816,15 +667,10 @@ bool single_column_restriction::contains::is_satisfied_by(const schema& schema,
|
||||
if (!map_key || !map_value) {
|
||||
continue;
|
||||
}
|
||||
auto found = with_linearized(*map_key, [&] (bytes_view map_key_bv) {
|
||||
return std::find_if(data_map.begin(), data_map.end(), [&] (auto&& element) {
|
||||
return map_key_type->compare(element.first.serialize(), map_key_bv) == 0;
|
||||
});
|
||||
auto found = std::find_if(data_map.begin(), data_map.end(), [&] (auto&& element) {
|
||||
return map_key_type->compare(element.first.serialize(), *map_key) == 0;
|
||||
});
|
||||
if (found == data_map.end()
|
||||
|| with_linearized(*map_value, [&] (bytes_view map_value_bv) {
|
||||
return element_type->compare(found->second.serialize(), map_value_bv);
|
||||
}) != 0) {
|
||||
if (found == data_map.end() || element_type->compare(found->second.serialize(), *map_value) != 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -834,11 +680,6 @@ bool single_column_restriction::contains::is_satisfied_by(const schema& schema,
|
||||
return true;
|
||||
}
|
||||
|
||||
bool single_column_restriction::contains::is_satisfied_by(bytes_view data, const query_options& options) const {
|
||||
//TODO(sarna): Deserialize & return. It would be nice to deduplicate, is_satisfied_by above is rather long
|
||||
fail(unimplemented::cause::INDEXES);
|
||||
}
|
||||
|
||||
bool token_restriction::EQ::is_satisfied_by(const schema& schema,
|
||||
const partition_key& key,
|
||||
const clustering_key_prefix& ckey,
|
||||
|
||||
@@ -120,8 +120,8 @@ public:
|
||||
bool for_view = false,
|
||||
bool allow_filtering = false);
|
||||
private:
|
||||
void add_restriction(::shared_ptr<restriction> restriction, bool for_view, bool allow_filtering);
|
||||
void add_single_column_restriction(::shared_ptr<single_column_restriction> restriction, bool for_view, bool allow_filtering);
|
||||
void add_restriction(::shared_ptr<restriction> restriction);
|
||||
void add_single_column_restriction(::shared_ptr<single_column_restriction> restriction);
|
||||
public:
|
||||
bool uses_function(const sstring& ks_name, const sstring& function_name) const;
|
||||
|
||||
@@ -163,20 +163,6 @@ public:
|
||||
return _clustering_columns_restrictions;
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds a possibly empty collection of column definitions that will be used for filtering
|
||||
* @param db - the database context
|
||||
* @return A list with the column definitions needed for filtering.
|
||||
*/
|
||||
std::vector<const column_definition*> get_column_defs_for_filtering(database& db) const;
|
||||
|
||||
/**
|
||||
* Determines the index to be used with the restriction.
|
||||
* @param db - the database context (for extracting index manager)
|
||||
* @return If an index can be used, an optional containing this index, otherwise an empty optional.
|
||||
*/
|
||||
std::optional<secondary_index::index> find_idx(secondary_index::secondary_index_manager& sim) const;
|
||||
|
||||
/**
|
||||
* Checks if the partition key has some unrestricted components.
|
||||
* @return <code>true</code> if the partition key has some unrestricted components, <code>false</code> otherwise.
|
||||
@@ -189,7 +175,7 @@ public:
|
||||
*/
|
||||
bool has_unrestricted_clustering_columns() const;
|
||||
private:
|
||||
void process_partition_key_restrictions(bool has_queriable_index, bool for_view, bool allow_filtering);
|
||||
void process_partition_key_restrictions(bool has_queriable_index, bool for_view);
|
||||
|
||||
/**
|
||||
* Returns the partition key components that are not restricted.
|
||||
@@ -204,7 +190,7 @@ private:
|
||||
* @param select_a_collection <code>true</code> if the query should return a collection column
|
||||
* @throws InvalidRequestException if the request is invalid
|
||||
*/
|
||||
void process_clustering_columns_restrictions(bool has_queriable_index, bool select_a_collection, bool for_view, bool allow_filtering);
|
||||
void process_clustering_columns_restrictions(bool has_queriable_index, bool select_a_collection, bool for_view);
|
||||
|
||||
/**
|
||||
* Returns the <code>Restrictions</code> for the specified type of columns.
|
||||
@@ -372,7 +358,7 @@ public:
|
||||
* Checks if the query need to use filtering.
|
||||
* @return <code>true</code> if the query need to use filtering, <code>false</code> otherwise.
|
||||
*/
|
||||
bool need_filtering() const;
|
||||
bool need_filtering();
|
||||
|
||||
void validate_secondary_index_selections(bool selects_only_static_columns);
|
||||
|
||||
@@ -395,14 +381,6 @@ public:
|
||||
return !_nonprimary_key_restrictions->empty();
|
||||
}
|
||||
|
||||
bool pk_restrictions_need_filtering() const {
|
||||
return _partition_key_restrictions->needs_filtering(*_schema);
|
||||
}
|
||||
|
||||
bool ck_restrictions_need_filtering() const {
|
||||
return _partition_key_restrictions->has_unrestricted_components(*_schema) || _clustering_columns_restrictions->needs_filtering(*_schema);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return true if column is restricted by some restriction, false otherwise
|
||||
*/
|
||||
@@ -421,16 +399,6 @@ public:
|
||||
const single_column_restrictions::restrictions_map& get_non_pk_restriction() const {
|
||||
return _nonprimary_key_restrictions->restrictions();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return partition key restrictions split into single column restrictions (e.g. for filtering support).
|
||||
*/
|
||||
const single_column_restrictions::restrictions_map& get_single_column_partition_key_restrictions() const;
|
||||
|
||||
/**
|
||||
* @return clustering key restrictions split into single column restrictions (e.g. for filtering support).
|
||||
*/
|
||||
const single_column_restrictions::restrictions_map& get_single_column_clustering_key_restrictions() const;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@@ -45,25 +45,27 @@ namespace cql3 {
|
||||
|
||||
metadata::metadata(std::vector<::shared_ptr<column_specification>> names_)
|
||||
: _flags(flag_enum_set())
|
||||
, _column_info(make_lw_shared<column_info>(std::move(names_)))
|
||||
{ }
|
||||
, names(std::move(names_)) {
|
||||
_column_count = names.size();
|
||||
}
|
||||
|
||||
metadata::metadata(flag_enum_set flags, std::vector<::shared_ptr<column_specification>> names_, uint32_t column_count,
|
||||
::shared_ptr<const service::pager::paging_state> paging_state)
|
||||
: _flags(flags)
|
||||
, _column_info(make_lw_shared<column_info>(std::move(names_), column_count))
|
||||
, names(std::move(names_))
|
||||
, _column_count(column_count)
|
||||
, _paging_state(std::move(paging_state))
|
||||
{ }
|
||||
|
||||
// The maximum number of values that the ResultSet can hold. This can be bigger than columnCount due to CASSANDRA-4911
|
||||
uint32_t metadata::value_count() const {
|
||||
return _flags.contains<flag::NO_METADATA>() ? _column_info->_column_count : _column_info->_names.size();
|
||||
return _flags.contains<flag::NO_METADATA>() ? _column_count : names.size();
|
||||
}
|
||||
|
||||
void metadata::add_non_serialized_column(::shared_ptr<column_specification> name) {
|
||||
// See comment above. Because columnCount doesn't account the newly added name, it
|
||||
// won't be serialized.
|
||||
_column_info->_names.emplace_back(std::move(name));
|
||||
names.emplace_back(std::move(name));
|
||||
}
|
||||
|
||||
bool metadata::all_in_same_cf() const {
|
||||
@@ -71,24 +73,18 @@ bool metadata::all_in_same_cf() const {
|
||||
return false;
|
||||
}
|
||||
|
||||
return column_specification::all_in_same_table(_column_info->_names);
|
||||
return column_specification::all_in_same_table(names);
|
||||
}
|
||||
|
||||
void metadata::set_paging_state(::shared_ptr<const service::pager::paging_state> paging_state) {
|
||||
void metadata::set_has_more_pages(::shared_ptr<const service::pager::paging_state> paging_state) {
|
||||
if (!paging_state) {
|
||||
return;
|
||||
}
|
||||
|
||||
_flags.set<flag::HAS_MORE_PAGES>();
|
||||
_paging_state = std::move(paging_state);
|
||||
}
|
||||
|
||||
void metadata::maybe_set_paging_state(::shared_ptr<const service::pager::paging_state> paging_state) {
|
||||
assert(paging_state);
|
||||
if (paging_state->get_remaining() > 0) {
|
||||
set_paging_state(std::move(paging_state));
|
||||
} else {
|
||||
_flags.remove<flag::HAS_MORE_PAGES>();
|
||||
_paging_state = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
void metadata::set_skip_metadata() {
|
||||
_flags.set<flag::NO_METADATA>();
|
||||
}
|
||||
@@ -97,10 +93,18 @@ metadata::flag_enum_set metadata::flags() const {
|
||||
return _flags;
|
||||
}
|
||||
|
||||
uint32_t metadata::column_count() const {
|
||||
return _column_count;
|
||||
}
|
||||
|
||||
::shared_ptr<const service::pager::paging_state> metadata::paging_state() const {
|
||||
return _paging_state;
|
||||
}
|
||||
|
||||
const std::vector<::shared_ptr<column_specification>>& metadata::get_names() const {
|
||||
return names;
|
||||
}
|
||||
|
||||
prepared_metadata::prepared_metadata(const std::vector<::shared_ptr<column_specification>>& names,
|
||||
const std::vector<uint16_t>& partition_key_bind_indices)
|
||||
: _names{names}
|
||||
|
||||
@@ -70,29 +70,18 @@ public:
|
||||
|
||||
using flag_enum_set = enum_set<flag_enum>;
|
||||
|
||||
struct column_info {
|
||||
private:
|
||||
flag_enum_set _flags;
|
||||
|
||||
public:
|
||||
// Please note that columnCount can actually be smaller than names, even if names is not null. This is
|
||||
// used to include columns in the resultSet that we need to do post-query re-orderings
|
||||
// (SelectStatement.orderResults) but that shouldn't be sent to the user as they haven't been requested
|
||||
// (CASSANDRA-4911). So the serialization code will exclude any columns in name whose index is >= columnCount.
|
||||
std::vector<::shared_ptr<column_specification>> _names;
|
||||
uint32_t _column_count;
|
||||
|
||||
column_info(std::vector<::shared_ptr<column_specification>> names, uint32_t column_count)
|
||||
: _names(std::move(names))
|
||||
, _column_count(column_count)
|
||||
{ }
|
||||
|
||||
explicit column_info(std::vector<::shared_ptr<column_specification>> names)
|
||||
: _names(std::move(names))
|
||||
, _column_count(_names.size())
|
||||
{ }
|
||||
};
|
||||
private:
|
||||
flag_enum_set _flags;
|
||||
std::vector<::shared_ptr<column_specification>> names;
|
||||
|
||||
private:
|
||||
lw_shared_ptr<column_info> _column_info;
|
||||
uint32_t _column_count;
|
||||
::shared_ptr<const service::pager::paging_state> _paging_state;
|
||||
|
||||
public:
|
||||
@@ -110,20 +99,17 @@ private:
|
||||
bool all_in_same_cf() const;
|
||||
|
||||
public:
|
||||
void set_paging_state(::shared_ptr<const service::pager::paging_state> paging_state);
|
||||
void maybe_set_paging_state(::shared_ptr<const service::pager::paging_state> paging_state);
|
||||
void set_has_more_pages(::shared_ptr<const service::pager::paging_state> paging_state);
|
||||
|
||||
void set_skip_metadata();
|
||||
|
||||
flag_enum_set flags() const;
|
||||
|
||||
uint32_t column_count() const { return _column_info->_column_count; }
|
||||
uint32_t column_count() const;
|
||||
|
||||
::shared_ptr<const service::pager::paging_state> paging_state() const;
|
||||
|
||||
const std::vector<::shared_ptr<column_specification>>& get_names() const {
|
||||
return _column_info->_names;
|
||||
}
|
||||
const std::vector<::shared_ptr<column_specification>>& get_names() const;
|
||||
};
|
||||
|
||||
::shared_ptr<const cql3::metadata> make_empty_metadata();
|
||||
@@ -237,14 +223,14 @@ public:
|
||||
class result {
|
||||
std::unique_ptr<cql3::result_set> _result_set;
|
||||
result_generator _result_generator;
|
||||
shared_ptr<const cql3::metadata> _metadata;
|
||||
shared_ptr<cql3::metadata> _metadata;
|
||||
public:
|
||||
explicit result(std::unique_ptr<cql3::result_set> rs)
|
||||
: _result_set(std::move(rs))
|
||||
, _metadata(_result_set->_metadata)
|
||||
{ }
|
||||
|
||||
explicit result(result_generator generator, shared_ptr<const metadata> m)
|
||||
explicit result(result_generator generator, shared_ptr<metadata> m)
|
||||
: _result_generator(std::move(generator))
|
||||
, _metadata(std::move(m))
|
||||
{ }
|
||||
@@ -254,7 +240,7 @@ public:
|
||||
if (_result_set) {
|
||||
return *_result_set;
|
||||
} else {
|
||||
auto builder = result_set::builder(make_shared<cql3::metadata>(*_metadata));
|
||||
auto builder = result_set::builder(_metadata);
|
||||
_result_generator.visit(builder);
|
||||
return std::move(builder).get_result_set();
|
||||
}
|
||||
|
||||
@@ -142,7 +142,7 @@ shared_ptr<selector::factory>
|
||||
selectable::with_field_selection::new_selector_factory(database& db, schema_ptr s, std::vector<const column_definition*>& defs) {
|
||||
auto&& factory = _selected->new_selector_factory(db, s, defs);
|
||||
auto&& type = factory->new_instance()->get_type();
|
||||
auto&& ut = dynamic_pointer_cast<const user_type_impl>(type->underlying_type());
|
||||
auto&& ut = dynamic_pointer_cast<const user_type_impl>(std::move(type));
|
||||
if (!ut) {
|
||||
throw exceptions::invalid_request_exception(
|
||||
sprint("Invalid field selection: %s of type %s is not a user type",
|
||||
|
||||
@@ -40,7 +40,6 @@
|
||||
*/
|
||||
|
||||
#include <boost/range/adaptor/transformed.hpp>
|
||||
#include <boost/range/adaptor/filtered.hpp>
|
||||
|
||||
#include "cql3/selection/selection.hh"
|
||||
#include "cql3/selection/selector_factories.hh"
|
||||
@@ -156,9 +155,9 @@ public:
|
||||
return _factories->uses_function(ks_name, function_name);
|
||||
}
|
||||
|
||||
virtual uint32_t add_column_for_post_processing(const column_definition& c) override {
|
||||
uint32_t index = selection::add_column_for_post_processing(c);
|
||||
_factories->add_selector_for_post_processing(c, index);
|
||||
virtual uint32_t add_column_for_ordering(const column_definition& c) override {
|
||||
uint32_t index = selection::add_column_for_ordering(c);
|
||||
_factories->add_selector_for_ordering(c, index);
|
||||
return index;
|
||||
}
|
||||
|
||||
@@ -209,17 +208,9 @@ protected:
|
||||
|
||||
::shared_ptr<selection> selection::wildcard(schema_ptr schema) {
|
||||
auto columns = schema->all_columns_in_select_order();
|
||||
// filter out hidden columns, which should not be seen by the
|
||||
// user when doing "SELECT *". We also disallow selecting them
|
||||
// individually (see column_identifier::new_selector_factory()).
|
||||
auto cds = boost::copy_range<std::vector<const column_definition*>>(
|
||||
columns |
|
||||
boost::adaptors::filtered([](const column_definition& c) {
|
||||
return !c.is_view_virtual();
|
||||
}) |
|
||||
boost::adaptors::transformed([](const column_definition& c) {
|
||||
return &c;
|
||||
}));
|
||||
auto cds = boost::copy_range<std::vector<const column_definition*>>(columns | boost::adaptors::transformed([](const column_definition& c) {
|
||||
return &c;
|
||||
}));
|
||||
return simple_selection::make(schema, std::move(cds), true);
|
||||
}
|
||||
|
||||
@@ -227,7 +218,7 @@ protected:
|
||||
return simple_selection::make(schema, std::move(columns), false);
|
||||
}
|
||||
|
||||
uint32_t selection::add_column_for_post_processing(const column_definition& c) {
|
||||
uint32_t selection::add_column_for_ordering(const column_definition& c) {
|
||||
_columns.push_back(&c);
|
||||
_metadata->add_non_serialized_column(c.column_specification);
|
||||
return _columns.size() - 1;
|
||||
@@ -339,106 +330,93 @@ std::unique_ptr<result_set> result_set_builder::build() {
|
||||
return std::move(_result_set);
|
||||
}
|
||||
|
||||
bool result_set_builder::restrictions_filter::do_filter(const selection& selection,
|
||||
const std::vector<bytes>& partition_key,
|
||||
const std::vector<bytes>& clustering_key,
|
||||
const query::result_row_view& static_row,
|
||||
const query::result_row_view& row) const {
|
||||
static logging::logger rlogger("restrictions_filter");
|
||||
|
||||
if (_current_partition_key_does_not_match || _current_static_row_does_not_match || _remaining == 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
auto static_row_iterator = static_row.iterator();
|
||||
auto row_iterator = row.iterator();
|
||||
auto non_pk_restrictions_map = _restrictions->get_non_pk_restriction();
|
||||
auto partition_key_restrictions_map = _restrictions->get_single_column_partition_key_restrictions();
|
||||
auto clustering_key_restrictions_map = _restrictions->get_single_column_clustering_key_restrictions();
|
||||
for (auto&& cdef : selection.get_columns()) {
|
||||
switch (cdef->kind) {
|
||||
case column_kind::static_column:
|
||||
// fallthrough
|
||||
case column_kind::regular_column: {
|
||||
auto& cell_iterator = (cdef->kind == column_kind::static_column) ? static_row_iterator : row_iterator;
|
||||
if (cdef->type->is_multi_cell()) {
|
||||
cell_iterator.next_collection_cell();
|
||||
auto restr_it = non_pk_restrictions_map.find(cdef);
|
||||
if (restr_it == non_pk_restrictions_map.end()) {
|
||||
continue;
|
||||
}
|
||||
throw exceptions::invalid_request_exception("Collection filtering is not supported yet");
|
||||
} else {
|
||||
auto cell = cell_iterator.next_atomic_cell();
|
||||
|
||||
auto restr_it = non_pk_restrictions_map.find(cdef);
|
||||
if (restr_it == non_pk_restrictions_map.end()) {
|
||||
continue;
|
||||
}
|
||||
restrictions::single_column_restriction& restriction = *restr_it->second;
|
||||
|
||||
bool regular_restriction_matches;
|
||||
if (cell) {
|
||||
regular_restriction_matches = cell->value().with_linearized([&restriction, this](bytes_view data) {
|
||||
return restriction.is_satisfied_by(data, _options);
|
||||
});
|
||||
} else {
|
||||
regular_restriction_matches = restriction.is_satisfied_by(bytes(), _options);
|
||||
}
|
||||
if (!regular_restriction_matches) {
|
||||
_current_static_row_does_not_match = (cdef->kind == column_kind::static_column);
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
break;
|
||||
case column_kind::partition_key: {
|
||||
auto restr_it = partition_key_restrictions_map.find(cdef);
|
||||
if (restr_it == partition_key_restrictions_map.end()) {
|
||||
continue;
|
||||
}
|
||||
restrictions::single_column_restriction& restriction = *restr_it->second;
|
||||
const bytes& value_to_check = partition_key[cdef->id];
|
||||
bool pk_restriction_matches = restriction.is_satisfied_by(value_to_check, _options);
|
||||
if (!pk_restriction_matches) {
|
||||
_current_partition_key_does_not_match = true;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case column_kind::clustering_key: {
|
||||
auto restr_it = clustering_key_restrictions_map.find(cdef);
|
||||
if (restr_it == clustering_key_restrictions_map.end()) {
|
||||
continue;
|
||||
}
|
||||
restrictions::single_column_restriction& restriction = *restr_it->second;
|
||||
const bytes& value_to_check = clustering_key[cdef->id];
|
||||
bool pk_restriction_matches = restriction.is_satisfied_by(value_to_check, _options);
|
||||
if (!pk_restriction_matches) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
result_set_builder::visitor::visitor(
|
||||
cql3::selection::result_set_builder& builder, const schema& s,
|
||||
const selection& selection)
|
||||
: _builder(builder), _schema(s), _selection(selection), _row_count(0) {
|
||||
}
|
||||
|
||||
bool result_set_builder::restrictions_filter::operator()(const selection& selection,
|
||||
const std::vector<bytes>& partition_key,
|
||||
const std::vector<bytes>& clustering_key,
|
||||
const query::result_row_view& static_row,
|
||||
const query::result_row_view& row) const {
|
||||
const bool accepted = do_filter(selection, partition_key, clustering_key, static_row, row);
|
||||
if (!accepted) {
|
||||
++_rows_dropped;
|
||||
} else if (_remaining > 0) {
|
||||
--_remaining;
|
||||
void result_set_builder::visitor::add_value(const column_definition& def,
|
||||
query::result_row_view::iterator_type& i) {
|
||||
if (def.type->is_multi_cell()) {
|
||||
auto cell = i.next_collection_cell();
|
||||
if (!cell) {
|
||||
_builder.add_empty();
|
||||
return;
|
||||
}
|
||||
_builder.add_collection(def, cell->linearize());
|
||||
} else {
|
||||
auto cell = i.next_atomic_cell();
|
||||
if (!cell) {
|
||||
_builder.add_empty();
|
||||
return;
|
||||
}
|
||||
_builder.add(def, *cell);
|
||||
}
|
||||
}
|
||||
|
||||
void result_set_builder::visitor::accept_new_partition(const partition_key& key,
|
||||
uint32_t row_count) {
|
||||
_partition_key = key.explode(_schema);
|
||||
_row_count = row_count;
|
||||
}
|
||||
|
||||
void result_set_builder::visitor::accept_new_partition(uint32_t row_count) {
|
||||
_row_count = row_count;
|
||||
}
|
||||
|
||||
void result_set_builder::visitor::accept_new_row(const clustering_key& key,
|
||||
const query::result_row_view& static_row,
|
||||
const query::result_row_view& row) {
|
||||
_clustering_key = key.explode(_schema);
|
||||
accept_new_row(static_row, row);
|
||||
}
|
||||
|
||||
void result_set_builder::visitor::accept_new_row(
|
||||
const query::result_row_view& static_row,
|
||||
const query::result_row_view& row) {
|
||||
auto static_row_iterator = static_row.iterator();
|
||||
auto row_iterator = row.iterator();
|
||||
_builder.new_row();
|
||||
for (auto&& def : _selection.get_columns()) {
|
||||
switch (def->kind) {
|
||||
case column_kind::partition_key:
|
||||
_builder.add(_partition_key[def->component_index()]);
|
||||
break;
|
||||
case column_kind::clustering_key:
|
||||
if (_clustering_key.size() > def->component_index()) {
|
||||
_builder.add(_clustering_key[def->component_index()]);
|
||||
} else {
|
||||
_builder.add({});
|
||||
}
|
||||
break;
|
||||
case column_kind::regular_column:
|
||||
add_value(*def, row_iterator);
|
||||
break;
|
||||
case column_kind::static_column:
|
||||
add_value(*def, static_row_iterator);
|
||||
break;
|
||||
default:
|
||||
assert(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void result_set_builder::visitor::accept_partition_end(
|
||||
const query::result_row_view& static_row) {
|
||||
if (_row_count == 0) {
|
||||
_builder.new_row();
|
||||
auto static_row_iterator = static_row.iterator();
|
||||
for (auto&& def : _selection.get_columns()) {
|
||||
if (def->is_partition_key()) {
|
||||
_builder.add(_partition_key[def->component_index()]);
|
||||
} else if (def->is_static()) {
|
||||
add_value(*def, static_row_iterator);
|
||||
} else {
|
||||
_builder.add_empty();
|
||||
}
|
||||
}
|
||||
}
|
||||
return accepted;
|
||||
}
|
||||
|
||||
api::timestamp_type result_set_builder::timestamp_of(size_t idx) {
|
||||
|
||||
@@ -48,7 +48,6 @@
|
||||
#include "exceptions/exceptions.hh"
|
||||
#include "cql3/selection/raw_selector.hh"
|
||||
#include "cql3/selection/selector_factories.hh"
|
||||
#include "cql3/restrictions/statement_restrictions.hh"
|
||||
#include "unimplemented.hh"
|
||||
|
||||
namespace cql3 {
|
||||
@@ -169,14 +168,10 @@ public:
|
||||
return _metadata;
|
||||
}
|
||||
|
||||
::shared_ptr<metadata> get_result_metadata() {
|
||||
return _metadata;
|
||||
}
|
||||
|
||||
static ::shared_ptr<selection> wildcard(schema_ptr schema);
|
||||
static ::shared_ptr<selection> for_columns(schema_ptr schema, std::vector<const column_definition*> columns);
|
||||
|
||||
virtual uint32_t add_column_for_post_processing(const column_definition& c);
|
||||
virtual uint32_t add_column_for_ordering(const column_definition& c);
|
||||
|
||||
virtual bool uses_function(const sstring &ks_name, const sstring& function_name) const {
|
||||
return false;
|
||||
@@ -252,40 +247,6 @@ private:
|
||||
const gc_clock::time_point _now;
|
||||
cql_serialization_format _cql_serialization_format;
|
||||
public:
|
||||
class nop_filter {
|
||||
public:
|
||||
inline bool operator()(const selection&, const std::vector<bytes>&, const std::vector<bytes>&, const query::result_row_view&, const query::result_row_view&) const {
|
||||
return true;
|
||||
}
|
||||
void reset() {
|
||||
}
|
||||
uint32_t get_rows_dropped() const {
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
class restrictions_filter {
|
||||
::shared_ptr<restrictions::statement_restrictions> _restrictions;
|
||||
const query_options& _options;
|
||||
mutable bool _current_partition_key_does_not_match = false;
|
||||
mutable bool _current_static_row_does_not_match = false;
|
||||
mutable uint32_t _rows_dropped = 0;
|
||||
mutable uint32_t _remaining = 0;
|
||||
public:
|
||||
restrictions_filter() = default;
|
||||
explicit restrictions_filter(::shared_ptr<restrictions::statement_restrictions> restrictions, const query_options& options, uint32_t remaining) : _restrictions(restrictions), _options(options), _remaining(remaining) {}
|
||||
bool operator()(const selection& selection, const std::vector<bytes>& pk, const std::vector<bytes>& ck, const query::result_row_view& static_row, const query::result_row_view& row) const;
|
||||
void reset() {
|
||||
_current_partition_key_does_not_match = false;
|
||||
_current_static_row_does_not_match = false;
|
||||
_rows_dropped = 0;
|
||||
}
|
||||
uint32_t get_rows_dropped() const {
|
||||
return _rows_dropped;
|
||||
}
|
||||
private:
|
||||
bool do_filter(const selection& selection, const std::vector<bytes>& pk, const std::vector<bytes>& ck, const query::result_row_view& static_row, const query::result_row_view& row) const;
|
||||
};
|
||||
|
||||
result_set_builder(const selection& s, gc_clock::time_point now, cql_serialization_format sf);
|
||||
void add_empty();
|
||||
void add(bytes_opt value);
|
||||
@@ -295,9 +256,8 @@ public:
|
||||
std::unique_ptr<result_set> build();
|
||||
api::timestamp_type timestamp_of(size_t idx);
|
||||
int32_t ttl_of(size_t idx);
|
||||
|
||||
|
||||
// Implements ResultVisitor concept from query.hh
|
||||
template<typename Filter = nop_filter>
|
||||
class visitor {
|
||||
protected:
|
||||
result_set_builder& _builder;
|
||||
@@ -306,101 +266,20 @@ public:
|
||||
uint32_t _row_count;
|
||||
std::vector<bytes> _partition_key;
|
||||
std::vector<bytes> _clustering_key;
|
||||
Filter _filter;
|
||||
public:
|
||||
visitor(cql3::selection::result_set_builder& builder, const schema& s,
|
||||
const selection& selection, Filter filter = Filter())
|
||||
: _builder(builder)
|
||||
, _schema(s)
|
||||
, _selection(selection)
|
||||
, _row_count(0)
|
||||
, _filter(filter)
|
||||
{}
|
||||
visitor(cql3::selection::result_set_builder& builder, const schema& s, const selection&);
|
||||
visitor(visitor&&) = default;
|
||||
|
||||
void add_value(const column_definition& def, query::result_row_view::iterator_type& i) {
|
||||
if (def.type->is_multi_cell()) {
|
||||
auto cell = i.next_collection_cell();
|
||||
if (!cell) {
|
||||
_builder.add_empty();
|
||||
return;
|
||||
}
|
||||
_builder.add_collection(def, cell->linearize());
|
||||
} else {
|
||||
auto cell = i.next_atomic_cell();
|
||||
if (!cell) {
|
||||
_builder.add_empty();
|
||||
return;
|
||||
}
|
||||
_builder.add(def, *cell);
|
||||
}
|
||||
}
|
||||
|
||||
void accept_new_partition(const partition_key& key, uint32_t row_count) {
|
||||
_partition_key = key.explode(_schema);
|
||||
_row_count = row_count;
|
||||
_filter.reset();
|
||||
}
|
||||
|
||||
void accept_new_partition(uint32_t row_count) {
|
||||
_row_count = row_count;
|
||||
_filter.reset();
|
||||
}
|
||||
|
||||
void accept_new_row(const clustering_key& key, const query::result_row_view& static_row, const query::result_row_view& row) {
|
||||
_clustering_key = key.explode(_schema);
|
||||
accept_new_row(static_row, row);
|
||||
}
|
||||
|
||||
void accept_new_row(const query::result_row_view& static_row, const query::result_row_view& row) {
|
||||
auto static_row_iterator = static_row.iterator();
|
||||
auto row_iterator = row.iterator();
|
||||
if (!_filter(_selection, _partition_key, _clustering_key, static_row, row)) {
|
||||
return;
|
||||
}
|
||||
_builder.new_row();
|
||||
for (auto&& def : _selection.get_columns()) {
|
||||
switch (def->kind) {
|
||||
case column_kind::partition_key:
|
||||
_builder.add(_partition_key[def->component_index()]);
|
||||
break;
|
||||
case column_kind::clustering_key:
|
||||
if (_clustering_key.size() > def->component_index()) {
|
||||
_builder.add(_clustering_key[def->component_index()]);
|
||||
} else {
|
||||
_builder.add({});
|
||||
}
|
||||
break;
|
||||
case column_kind::regular_column:
|
||||
add_value(*def, row_iterator);
|
||||
break;
|
||||
case column_kind::static_column:
|
||||
add_value(*def, static_row_iterator);
|
||||
break;
|
||||
default:
|
||||
assert(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t accept_partition_end(const query::result_row_view& static_row) {
|
||||
if (_row_count == 0) {
|
||||
_builder.new_row();
|
||||
auto static_row_iterator = static_row.iterator();
|
||||
for (auto&& def : _selection.get_columns()) {
|
||||
if (def->is_partition_key()) {
|
||||
_builder.add(_partition_key[def->component_index()]);
|
||||
} else if (def->is_static()) {
|
||||
add_value(*def, static_row_iterator);
|
||||
} else {
|
||||
_builder.add_empty();
|
||||
}
|
||||
}
|
||||
}
|
||||
return _filter.get_rows_dropped();
|
||||
}
|
||||
void add_value(const column_definition& def, query::result_row_view::iterator_type& i);
|
||||
void accept_new_partition(const partition_key& key, uint32_t row_count);
|
||||
void accept_new_partition(uint32_t row_count);
|
||||
void accept_new_row(const clustering_key& key,
|
||||
const query::result_row_view& static_row,
|
||||
const query::result_row_view& row);
|
||||
void accept_new_row(const query::result_row_view& static_row,
|
||||
const query::result_row_view& row);
|
||||
void accept_partition_end(const query::result_row_view& static_row);
|
||||
};
|
||||
|
||||
private:
|
||||
bytes_opt get_value(data_type t, query::result_atomic_cell_view c);
|
||||
};
|
||||
|
||||
@@ -53,7 +53,6 @@ selector_factories::selector_factories(std::vector<::shared_ptr<selectable>> sel
|
||||
: _contains_write_time_factory(false)
|
||||
, _contains_ttl_factory(false)
|
||||
, _number_of_aggregate_factories(0)
|
||||
, _number_of_factories_for_post_processing(0)
|
||||
{
|
||||
_factories.reserve(selectables.size());
|
||||
|
||||
@@ -77,9 +76,8 @@ bool selector_factories::uses_function(const sstring& ks_name, const sstring& fu
|
||||
return false;
|
||||
}
|
||||
|
||||
void selector_factories::add_selector_for_post_processing(const column_definition& def, uint32_t index) {
|
||||
void selector_factories::add_selector_for_ordering(const column_definition& def, uint32_t index) {
|
||||
_factories.emplace_back(simple_selector::new_factory(def.name_as_text(), index, def.type));
|
||||
++_number_of_factories_for_post_processing;
|
||||
}
|
||||
|
||||
std::vector<::shared_ptr<selector>> selector_factories::new_instances() const {
|
||||
|
||||
@@ -74,11 +74,6 @@ private:
|
||||
*/
|
||||
uint32_t _number_of_aggregate_factories;
|
||||
|
||||
/**
|
||||
* The number of factories that are only for post processing.
|
||||
*/
|
||||
uint32_t _number_of_factories_for_post_processing;
|
||||
|
||||
public:
|
||||
/**
|
||||
* Creates a new <code>SelectorFactories</code> instance and collect the column definitions.
|
||||
@@ -102,12 +97,11 @@ public:
|
||||
bool uses_function(const sstring& ks_name, const sstring& function_name) const;
|
||||
|
||||
/**
|
||||
* Adds a new <code>Selector.Factory</code> for a column that is needed only for ORDER BY or post
|
||||
* processing purposes.
|
||||
* Adds a new <code>Selector.Factory</code> for a column that is needed only for ORDER BY purposes.
|
||||
* @param def the column that is needed for ordering
|
||||
* @param index the index of the column definition in the Selection's list of columns
|
||||
*/
|
||||
void add_selector_for_post_processing(const column_definition& def, uint32_t index);
|
||||
void add_selector_for_ordering(const column_definition& def, uint32_t index);
|
||||
|
||||
/**
|
||||
* Checks if this <code>SelectorFactories</code> contains only factories for aggregates.
|
||||
@@ -117,7 +111,7 @@ public:
|
||||
*/
|
||||
bool contains_only_aggregate_functions() const {
|
||||
auto size = _factories.size();
|
||||
return size != 0 && _number_of_aggregate_factories == (size - _number_of_factories_for_post_processing);
|
||||
return size != 0 && _number_of_aggregate_factories == size;
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
12
cql3/sets.cc
12
cql3/sets.cc
@@ -120,19 +120,17 @@ sets::literal::to_string() const {
|
||||
}
|
||||
|
||||
sets::value
|
||||
sets::value::from_serialized(const fragmented_temporary_buffer::view& val, set_type type, cql_serialization_format sf) {
|
||||
sets::value::from_serialized(bytes_view v, set_type type, cql_serialization_format sf) {
|
||||
try {
|
||||
// Collections have this small hack that validate cannot be called on a serialized object,
|
||||
// but compose does the validation (so we're fine).
|
||||
// FIXME: deserializeForNativeProtocol?!
|
||||
return with_linearized(val, [&] (bytes_view v) {
|
||||
auto s = value_cast<set_type_impl::native_type>(type->deserialize(v, sf));
|
||||
std::set<bytes, serialized_compare> elements(type->get_elements_type()->as_less_comparator());
|
||||
for (auto&& element : s) {
|
||||
elements.insert(elements.end(), type->get_elements_type()->decompose(element));
|
||||
}
|
||||
return value(std::move(elements));
|
||||
});
|
||||
} catch (marshal_exception& e) {
|
||||
throw exceptions::invalid_request_exception(e.what());
|
||||
}
|
||||
@@ -200,10 +198,10 @@ sets::delayed_value::bind(const query_options& options) {
|
||||
return constants::UNSET_VALUE;
|
||||
}
|
||||
// We don't support value > 64K because the serialization format encode the length as an unsigned short.
|
||||
if (b->size_bytes() > std::numeric_limits<uint16_t>::max()) {
|
||||
if (b->size() > std::numeric_limits<uint16_t>::max()) {
|
||||
throw exceptions::invalid_request_exception(sprint("Set value is too long. Set values are limited to %d bytes but %d bytes value provided",
|
||||
std::numeric_limits<uint16_t>::max(),
|
||||
b->size_bytes()));
|
||||
b->size()));
|
||||
}
|
||||
|
||||
buffers.insert(buffers.end(), std::move(to_bytes(*b)));
|
||||
@@ -271,7 +269,7 @@ sets::adder::do_add(mutation& m, const clustering_key_prefix& row_key, const upd
|
||||
}
|
||||
|
||||
for (auto&& e : set_value->_elements) {
|
||||
mut.cells.emplace_back(e, params.make_cell(*set_type->value_comparator(), bytes_view(), atomic_cell::collection_member::yes));
|
||||
mut.cells.emplace_back(e, params.make_cell(*set_type->value_comparator(), {}, atomic_cell::collection_member::yes));
|
||||
}
|
||||
auto smut = set_type->serialize_mutation_form(mut);
|
||||
|
||||
@@ -281,7 +279,7 @@ sets::adder::do_add(mutation& m, const clustering_key_prefix& row_key, const upd
|
||||
auto v = set_type->serialize_partially_deserialized_form(
|
||||
{set_value->_elements.begin(), set_value->_elements.end()},
|
||||
cql_serialization_format::internal());
|
||||
m.set_cell(row_key, column, params.make_cell(*column.type, fragmented_temporary_buffer::view(v)));
|
||||
m.set_cell(row_key, column, params.make_cell(*column.type, std::move(v)));
|
||||
} else {
|
||||
m.set_cell(row_key, column, params.make_dead_cell());
|
||||
}
|
||||
|
||||
@@ -78,7 +78,7 @@ public:
|
||||
value(std::set<bytes, serialized_compare> elements)
|
||||
: _elements(std::move(elements)) {
|
||||
}
|
||||
static value from_serialized(const fragmented_temporary_buffer::view& v, set_type type, cql_serialization_format sf);
|
||||
static value from_serialized(bytes_view v, set_type type, cql_serialization_format sf);
|
||||
virtual cql3::raw_value get(const query_options& options) override;
|
||||
virtual bytes get_with_protocol_version(cql_serialization_format sf) override;
|
||||
bool equals(set_type st, const value& v);
|
||||
|
||||
@@ -101,6 +101,13 @@ single_column_relation::to_receivers(schema_ptr schema, const column_definition&
|
||||
}
|
||||
|
||||
if (is_IN()) {
|
||||
// For partition keys we only support IN for the last name so far
|
||||
if (column_def.is_partition_key() && !schema->is_last_partition_key(column_def)) {
|
||||
throw exceptions::invalid_request_exception(sprint(
|
||||
"Partition KEY part %s cannot be restricted by IN relation (only the last part of the partition key can)",
|
||||
column_def.name_as_text()));
|
||||
}
|
||||
|
||||
// We only allow IN on the row key and the clustering key so far, never on non-PK columns, and this even if
|
||||
// there's an index
|
||||
// Note: for backward compatibility reasons, we consider an IN of 1 value the same as an EQ, so we let that
|
||||
|
||||
@@ -246,22 +246,18 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a
|
||||
|
||||
cfm.with_column(column_name->name(), type, _is_static ? column_kind::static_column : column_kind::regular_column);
|
||||
|
||||
// Adding a column to a base table always requires updating the view
|
||||
// schemas: If the view includes all columns it should include the new
|
||||
// column, but if it doesn't, it may need to include the new
|
||||
// unselected column as a virtual column. The case when we
|
||||
// shouldn't add a virtual column is when the view has in its PK one
|
||||
// of the base's regular columns - but even in this case we need to
|
||||
// rebuild the view schema, to update the column ID.
|
||||
// Adding a column to a table which has an include all view requires the column to be added to the view
|
||||
// as well. If the view has a regular base column in its PK, then the column ID needs to be updated in
|
||||
// view_info; for that, rebuild the schema.
|
||||
if (!_is_static) {
|
||||
for (auto&& view : cf.views()) {
|
||||
schema_builder builder(view);
|
||||
if (view->view_info()->include_all_columns()) {
|
||||
builder.with_column(column_name->name(), type);
|
||||
} else if (!view->view_info()->base_non_pk_column_in_view_pk()) {
|
||||
db::view::create_virtual_column(builder, column_name->name(), type);
|
||||
if (view->view_info()->include_all_columns() || view->view_info()->base_non_pk_column_in_view_pk()) {
|
||||
schema_builder builder(view);
|
||||
if (view->view_info()->include_all_columns()) {
|
||||
builder.with_column(column_name->name(), type);
|
||||
}
|
||||
view_updates.push_back(view_ptr(builder.build()));
|
||||
}
|
||||
view_updates.push_back(view_ptr(builder.build()));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -276,7 +272,7 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a
|
||||
|
||||
auto type = validate_alter(schema, *def, *validator);
|
||||
// In any case, we update the column definition
|
||||
cfm.alter_column_type(column_name->name(), type);
|
||||
cfm.with_altered_column_type(column_name->name(), type);
|
||||
|
||||
// We also have to validate the view types here. If we have a view which includes a column as part of
|
||||
// the clustering key, we need to make sure that it is indeed compatible.
|
||||
@@ -285,7 +281,7 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a
|
||||
if (view_def) {
|
||||
schema_builder builder(view);
|
||||
auto view_type = validate_alter(view, *view_def, *validator);
|
||||
builder.alter_column_type(column_name->name(), std::move(view_type));
|
||||
builder.with_altered_column_type(column_name->name(), std::move(view_type));
|
||||
view_updates.push_back(view_ptr(builder.build()));
|
||||
}
|
||||
}
|
||||
@@ -306,7 +302,7 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a
|
||||
} else {
|
||||
for (auto&& column_def : boost::range::join(schema->static_columns(), schema->regular_columns())) { // find
|
||||
if (column_def.name() == column_name->name()) {
|
||||
cfm.remove_column(column_name->name());
|
||||
cfm.without_column(column_name->name());
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -349,10 +345,9 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a
|
||||
auto to = entry.second->prepare_column_identifier(schema);
|
||||
|
||||
validate_column_rename(db, *schema, *from, *to);
|
||||
cfm.rename_column(from->name(), to->name());
|
||||
cfm.with_column_rename(from->name(), to->name());
|
||||
|
||||
// If the view includes a renamed column, it must be renamed in
|
||||
// the view table and the definition.
|
||||
// If the view includes a renamed column, it must be renamed in the view table and the definition.
|
||||
for (auto&& view : cf.views()) {
|
||||
if (view->get_column_definition(from->name())) {
|
||||
schema_builder builder(view);
|
||||
@@ -360,7 +355,7 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a
|
||||
auto view_from = entry.first->prepare_column_identifier(view);
|
||||
auto view_to = entry.second->prepare_column_identifier(view);
|
||||
validate_column_rename(db, *view, *view_from, *view_to);
|
||||
builder.rename_column(view_from->name(), view_to->name());
|
||||
builder.with_column_rename(view_from->name(), view_to->name());
|
||||
|
||||
auto new_where = util::rename_column_in_where_clause(
|
||||
view->view_info()->where_clause(),
|
||||
|
||||
@@ -110,7 +110,7 @@ void alter_type_statement::do_announce_migration(database& db, ::keyspace& ks, b
|
||||
if (t_opt) {
|
||||
modified = true;
|
||||
// We need to update this column
|
||||
cfm.alter_column_type(column.name(), *t_opt);
|
||||
cfm.with_altered_column_type(column.name(), *t_opt);
|
||||
}
|
||||
}
|
||||
if (modified) {
|
||||
@@ -165,7 +165,7 @@ alter_type_statement::add_or_alter::add_or_alter(const ut_name& name, bool is_ad
|
||||
user_type alter_type_statement::add_or_alter::do_add(database& db, user_type to_update) const
|
||||
{
|
||||
if (get_idx_of_field(to_update, _field_name)) {
|
||||
throw exceptions::invalid_request_exception(sprint("Cannot add new field %s to type %s: a field of the same name already exists", _field_name->to_string(), _name.to_string()));
|
||||
throw exceptions::invalid_request_exception(sprint("Cannot add new field %s to type %s: a field of the same name already exists", _field_name->name(), _name.to_string()));
|
||||
}
|
||||
|
||||
std::vector<bytes> new_names(to_update->field_names());
|
||||
@@ -173,7 +173,7 @@ user_type alter_type_statement::add_or_alter::do_add(database& db, user_type to_
|
||||
std::vector<data_type> new_types(to_update->field_types());
|
||||
auto&& add_type = _field_type->prepare(db, keyspace())->get_type();
|
||||
if (add_type->references_user_type(to_update->_keyspace, to_update->_name)) {
|
||||
throw exceptions::invalid_request_exception(sprint("Cannot add new field %s of type %s to type %s as this would create a circular reference", _field_name->to_string(), _field_type->to_string(), _name.to_string()));
|
||||
throw exceptions::invalid_request_exception(sprint("Cannot add new field %s of type %s to type %s as this would create a circular reference", _field_name->name(), _field_type->to_string(), _name.to_string()));
|
||||
}
|
||||
new_types.push_back(std::move(add_type));
|
||||
return user_type_impl::get_instance(to_update->_keyspace, to_update->_name, std::move(new_names), std::move(new_types));
|
||||
@@ -183,13 +183,13 @@ user_type alter_type_statement::add_or_alter::do_alter(database& db, user_type t
|
||||
{
|
||||
stdx::optional<uint32_t> idx = get_idx_of_field(to_update, _field_name);
|
||||
if (!idx) {
|
||||
throw exceptions::invalid_request_exception(sprint("Unknown field %s in type %s", _field_name->to_string(), _name.to_string()));
|
||||
throw exceptions::invalid_request_exception(sprint("Unknown field %s in type %s", _field_name->name(), _name.to_string()));
|
||||
}
|
||||
|
||||
auto previous = to_update->field_types()[*idx];
|
||||
auto new_type = _field_type->prepare(db, keyspace())->get_type();
|
||||
if (!new_type->is_compatible_with(*previous)) {
|
||||
throw exceptions::invalid_request_exception(sprint("Type %s in incompatible with previous type %s of field %s in user type %s", _field_type->to_string(), previous->as_cql3_type()->to_string(), _field_name->to_string(), _name.to_string()));
|
||||
throw exceptions::invalid_request_exception(sprint("Type %s in incompatible with previous type %s of field %s in user type %s", _field_type->to_string(), previous->as_cql3_type()->to_string(), _field_name->name(), _name.to_string()));
|
||||
}
|
||||
|
||||
std::vector<data_type> new_types(to_update->field_types());
|
||||
|
||||
@@ -191,20 +191,20 @@ const std::vector<batch_statement::single_statement>& batch_statement::get_state
|
||||
return _statements;
|
||||
}
|
||||
|
||||
future<std::vector<mutation>> batch_statement::get_mutations(service::storage_proxy& storage, const query_options& options, db::timeout_clock::time_point timeout, bool local, api::timestamp_type now, tracing::trace_state_ptr trace_state) {
|
||||
future<std::vector<mutation>> batch_statement::get_mutations(service::storage_proxy& storage, const query_options& options, bool local, api::timestamp_type now, tracing::trace_state_ptr trace_state) {
|
||||
// Do not process in parallel because operations like list append/prepend depend on execution order.
|
||||
using mutation_set_type = std::unordered_set<mutation, mutation_hash_by_key, mutation_equals_by_key>;
|
||||
return do_with(mutation_set_type(), [this, &storage, &options, timeout, now, local, trace_state] (auto& result) {
|
||||
return do_with(mutation_set_type(), [this, &storage, &options, now, local, trace_state] (auto& result) {
|
||||
result.reserve(_statements.size());
|
||||
_stats.statements_in_batches += _statements.size();
|
||||
return do_for_each(boost::make_counting_iterator<size_t>(0),
|
||||
boost::make_counting_iterator<size_t>(_statements.size()),
|
||||
[this, &storage, &options, now, local, &result, timeout, trace_state] (size_t i) {
|
||||
[this, &storage, &options, now, local, &result, trace_state] (size_t i) {
|
||||
auto&& statement = _statements[i].statement;
|
||||
statement->inc_cql_stats();
|
||||
auto&& statement_options = options.for_statement(i);
|
||||
auto timestamp = _attrs->get_timestamp(now, statement_options);
|
||||
return statement->get_mutations(storage, statement_options, timeout, local, timestamp, trace_state).then([&result] (auto&& more) {
|
||||
return statement->get_mutations(storage, statement_options, local, timestamp, trace_state).then([&result] (auto&& more) {
|
||||
for (auto&& m : more) {
|
||||
// We want unordered_set::try_emplace(), but we don't have it
|
||||
auto pos = result.find(m);
|
||||
@@ -293,9 +293,8 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::do_
|
||||
return execute_with_conditions(storage, options, query_state);
|
||||
}
|
||||
|
||||
auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
|
||||
return get_mutations(storage, options, timeout, local, now, query_state.get_trace_state()).then([this, &storage, &options, timeout, tr_state = query_state.get_trace_state()] (std::vector<mutation> ms) mutable {
|
||||
return execute_without_conditions(storage, std::move(ms), options.get_consistency(), timeout, std::move(tr_state));
|
||||
return get_mutations(storage, options, local, now, query_state.get_trace_state()).then([this, &storage, &options, tr_state = query_state.get_trace_state()] (std::vector<mutation> ms) mutable {
|
||||
return execute_without_conditions(storage, std::move(ms), options.get_consistency(), std::move(tr_state));
|
||||
}).then([] {
|
||||
return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(
|
||||
make_shared<cql_transport::messages::result_message::void_message>());
|
||||
@@ -306,7 +305,6 @@ future<> batch_statement::execute_without_conditions(
|
||||
service::storage_proxy& storage,
|
||||
std::vector<mutation> mutations,
|
||||
db::consistency_level cl,
|
||||
db::timeout_clock::time_point timeout,
|
||||
tracing::trace_state_ptr tr_state)
|
||||
{
|
||||
// FIXME: do we need to do this?
|
||||
@@ -334,7 +332,7 @@ future<> batch_statement::execute_without_conditions(
|
||||
mutate_atomic = false;
|
||||
}
|
||||
}
|
||||
return storage.mutate_with_triggers(std::move(mutations), cl, timeout, mutate_atomic, std::move(tr_state));
|
||||
return storage.mutate_with_triggers(std::move(mutations), cl, mutate_atomic, std::move(tr_state));
|
||||
}
|
||||
|
||||
future<shared_ptr<cql_transport::messages::result_message>> batch_statement::execute_with_conditions(
|
||||
|
||||
@@ -125,7 +125,7 @@ public:
|
||||
|
||||
const std::vector<single_statement>& get_statements();
|
||||
private:
|
||||
future<std::vector<mutation>> get_mutations(service::storage_proxy& storage, const query_options& options, db::timeout_clock::time_point timeout, bool local, api::timestamp_type now, tracing::trace_state_ptr trace_state);
|
||||
future<std::vector<mutation>> get_mutations(service::storage_proxy& storage, const query_options& options, bool local, api::timestamp_type now, tracing::trace_state_ptr trace_state);
|
||||
|
||||
public:
|
||||
/**
|
||||
@@ -147,7 +147,6 @@ private:
|
||||
service::storage_proxy& storage,
|
||||
std::vector<mutation> mutations,
|
||||
db::consistency_level cl,
|
||||
db::timeout_clock::time_point timeout,
|
||||
tracing::trace_state_ptr tr_state);
|
||||
|
||||
future<shared_ptr<cql_transport::messages::result_message>> execute_with_conditions(
|
||||
|
||||
@@ -88,11 +88,6 @@ create_index_statement::validate(service::storage_proxy& proxy, const service::c
|
||||
throw exceptions::invalid_request_exception("Secondary indexes are not supported on materialized views");
|
||||
}
|
||||
|
||||
if (schema->is_dense()) {
|
||||
throw exceptions::invalid_request_exception(
|
||||
"Secondary indexes are not supported on COMPACT STORAGE tables that have clustering columns");
|
||||
}
|
||||
|
||||
std::vector<::shared_ptr<index_target>> targets;
|
||||
for (auto& raw_target : _raw_targets) {
|
||||
targets.emplace_back(raw_target->prepare(schema));
|
||||
@@ -114,11 +109,6 @@ create_index_statement::validate(service::storage_proxy& proxy, const service::c
|
||||
sprint("No column definition found for column %s", *target->column));
|
||||
}
|
||||
|
||||
//NOTICE(sarna): Should be lifted after resolving issue #2963
|
||||
if (cd->is_static()) {
|
||||
throw exceptions::invalid_request_exception("Indexing static columns is not implemented yet.");
|
||||
}
|
||||
|
||||
if (cd->type->references_duration()) {
|
||||
using request_validations::check_false;
|
||||
const auto& ty = *cd->type;
|
||||
@@ -132,7 +122,8 @@ create_index_statement::validate(service::storage_proxy& proxy, const service::c
|
||||
}
|
||||
|
||||
// Origin TODO: we could lift that limitation
|
||||
if ((schema->is_dense() || !schema->thrift().has_compound_comparator()) && cd->is_primary_key()) {
|
||||
if ((schema->is_dense() || !schema->thrift().has_compound_comparator()) &&
|
||||
cd->kind != column_kind::regular_column) {
|
||||
throw exceptions::invalid_request_exception(
|
||||
"Secondary indexes are not supported on PRIMARY KEY columns in COMPACT STORAGE tables");
|
||||
}
|
||||
@@ -146,15 +137,10 @@ create_index_statement::validate(service::storage_proxy& proxy, const service::c
|
||||
|
||||
bool is_map = dynamic_cast<const collection_type_impl *>(cd->type.get()) != nullptr
|
||||
&& dynamic_cast<const collection_type_impl *>(cd->type.get())->is_map();
|
||||
bool is_collection = cd->type->is_collection();
|
||||
bool is_frozen_collection = is_collection && !cd->type->is_multi_cell();
|
||||
bool is_frozen_collection = cd->type->is_collection() && !cd->type->is_multi_cell();
|
||||
|
||||
if (is_frozen_collection) {
|
||||
validate_for_frozen_collection(target);
|
||||
} else if (is_collection) {
|
||||
// NOTICE(sarna): should be lifted after #2962 (indexes on non-frozen collections) is implemented
|
||||
throw exceptions::invalid_request_exception(
|
||||
sprint("Cannot create secondary index on non-frozen collection column %s", cd->name_as_text()));
|
||||
} else {
|
||||
validate_not_full_index(target);
|
||||
validate_is_values_index_if_target_column_not_collection(cd, target);
|
||||
|
||||
@@ -84,6 +84,7 @@ create_view_statement::create_view_statement(
|
||||
, _clustering_keys{clustering_keys}
|
||||
, _if_not_exists{if_not_exists}
|
||||
{
|
||||
service::get_local_storage_proxy().get_db().local().get_config().check_experimental("Creating materialized views");
|
||||
if (!service::get_local_storage_service().cluster_supports_materialized_views()) {
|
||||
throw exceptions::invalid_request_exception("Can't create materialized views until the whole cluster has been upgraded");
|
||||
}
|
||||
@@ -274,7 +275,6 @@ future<shared_ptr<cql_transport::event::schema_change>> create_view_statement::a
|
||||
|
||||
std::vector<const column_definition*> missing_pk_columns;
|
||||
std::vector<const column_definition*> target_non_pk_columns;
|
||||
std::vector<const column_definition*> unselected_columns;
|
||||
|
||||
// We need to include all of the primary key columns from the base table in order to make sure that we do not
|
||||
// overwrite values in the view. We cannot support "collapsing" the base table into a smaller number of rows in
|
||||
@@ -292,9 +292,6 @@ future<shared_ptr<cql_transport::event::schema_change>> create_view_statement::a
|
||||
if (included_def && !def_in_target_pk) {
|
||||
target_non_pk_columns.push_back(&def);
|
||||
}
|
||||
if (!included_def && !def_in_target_pk && !def.is_static()) {
|
||||
unselected_columns.push_back(&def);
|
||||
}
|
||||
if (def.is_primary_key() && !def_in_target_pk) {
|
||||
missing_pk_columns.push_back(&def);
|
||||
}
|
||||
@@ -314,27 +311,6 @@ future<shared_ptr<cql_transport::event::schema_change>> create_view_statement::a
|
||||
throw exceptions::invalid_request_exception(sprint("No columns are defined for Materialized View other than primary key"));
|
||||
}
|
||||
|
||||
// The unique feature of a filter by a non-key column is that the
|
||||
// value of such column can be updated - and also be expired with TTL
|
||||
// and cause the view row to appear and disappear. We don't currently
|
||||
// support this case - see issue #3430, and neither does
|
||||
// Cassandra - see CASSANDRA-13798 and CASSANDRA-13832.
|
||||
// Actually, as CASSANDRA-13798 explains, the problem is "the liveness of
|
||||
// view row is now depending on multiple base columns (multiple filtered
|
||||
// non-pk base column + base column used in view pk)". When the filtered
|
||||
// column *is* the base column added to the view pk, we don't have this
|
||||
// problem. And this case actually works correctly.
|
||||
auto non_pk_restrictions = restrictions->get_non_pk_restriction();
|
||||
if (non_pk_restrictions.size() == 1 && has_non_pk_column &&
|
||||
std::find(target_primary_keys.begin(), target_primary_keys.end(), non_pk_restrictions.cbegin()->first) != target_primary_keys.end()) {
|
||||
// This case (filter by new PK column of the view) works, as explained above
|
||||
} else if (!non_pk_restrictions.empty()) {
|
||||
auto column_names = ::join(", ", non_pk_restrictions | boost::adaptors::map_keys | boost::adaptors::transformed(std::mem_fn(&column_definition::name_as_text)));
|
||||
throw exceptions::invalid_request_exception(sprint(
|
||||
"Non-primary key columns cannot be restricted in the SELECT statement used for materialized view %s creation (got restrictions on: %s)",
|
||||
column_family(), column_names));
|
||||
}
|
||||
|
||||
schema_builder builder{keyspace(), column_family()};
|
||||
auto add_columns = [this, &builder] (std::vector<const column_definition*>& defs, column_kind kind) mutable {
|
||||
for (auto* def : defs) {
|
||||
@@ -345,19 +321,6 @@ future<shared_ptr<cql_transport::event::schema_change>> create_view_statement::a
|
||||
add_columns(target_partition_keys, column_kind::partition_key);
|
||||
add_columns(target_clustering_keys, column_kind::clustering_key);
|
||||
add_columns(target_non_pk_columns, column_kind::regular_column);
|
||||
// Add all unselected columns (base-table columns which are not selected
|
||||
// in the view) as "virtual columns" - columns which have timestamp and
|
||||
// ttl information, but an empty value. These are needed to keep view
|
||||
// rows alive when the base row is alive, even if the view row has no
|
||||
// data, just a key (see issue #3362). The virtual columns are not needed
|
||||
// when the view pk adds a regular base column (i.e., has_non_pk_column)
|
||||
// because in that case, the liveness of that base column is what
|
||||
// determines the liveness of the view row.
|
||||
if (!has_non_pk_column) {
|
||||
for (auto* def : unselected_columns) {
|
||||
db::view::create_virtual_column(builder, def->name(), def->type);
|
||||
}
|
||||
}
|
||||
_properties.properties()->apply_to_builder(builder, proxy.get_db().local().get_config().extensions());
|
||||
|
||||
if (builder.default_time_to_live().count() > 0) {
|
||||
|
||||
@@ -49,7 +49,7 @@ void cql3::statements::index_prop_defs::validate() {
|
||||
property_definitions::validate(keywords);
|
||||
|
||||
if (is_custom && !custom_class) {
|
||||
throw exceptions::invalid_request_exception("CUSTOM index requires specifying the index class");
|
||||
throw exceptions::invalid_request_exception("CUSTOM index requires specifiying the index class");
|
||||
}
|
||||
|
||||
if (!is_custom && custom_class) {
|
||||
@@ -64,16 +64,6 @@ void cql3::statements::index_prop_defs::validate() {
|
||||
sprint("Cannot specify %s as a CUSTOM option",
|
||||
db::index::secondary_index::custom_index_option_name));
|
||||
}
|
||||
|
||||
// Currently, Scylla does not support *any* class of custom index
|
||||
// implementation. If in the future we do (e.g., SASI, or something
|
||||
// new), we'll need to check for valid values here.
|
||||
if (is_custom && custom_class) {
|
||||
throw exceptions::invalid_request_exception(
|
||||
format("Unsupported CUSTOM INDEX class {}. Note that currently, Scylla does not support SASI or any other CUSTOM INDEX class.",
|
||||
*custom_class));
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
index_options_map
|
||||
|
||||
@@ -160,11 +160,11 @@ future<> modification_statement::check_access(const service::client_state& state
|
||||
}
|
||||
|
||||
future<std::vector<mutation>>
|
||||
modification_statement::get_mutations(service::storage_proxy& proxy, const query_options& options, db::timeout_clock::time_point timeout, bool local, int64_t now, tracing::trace_state_ptr trace_state) {
|
||||
modification_statement::get_mutations(service::storage_proxy& proxy, const query_options& options, bool local, int64_t now, tracing::trace_state_ptr trace_state) {
|
||||
auto json_cache = maybe_prepare_json_cache(options);
|
||||
auto keys = make_lw_shared(build_partition_keys(options, json_cache));
|
||||
auto ranges = make_lw_shared(create_clustering_ranges(options, json_cache));
|
||||
return make_update_parameters(proxy, keys, ranges, options, timeout, local, now, std::move(trace_state)).then(
|
||||
return make_update_parameters(proxy, keys, ranges, options, local, now, std::move(trace_state)).then(
|
||||
[this, keys, ranges, now, json_cache = std::move(json_cache)] (auto params_ptr) {
|
||||
std::vector<mutation> mutations;
|
||||
mutations.reserve(keys->size());
|
||||
@@ -186,11 +186,10 @@ modification_statement::make_update_parameters(
|
||||
lw_shared_ptr<dht::partition_range_vector> keys,
|
||||
lw_shared_ptr<query::clustering_row_ranges> ranges,
|
||||
const query_options& options,
|
||||
db::timeout_clock::time_point timeout,
|
||||
bool local,
|
||||
int64_t now,
|
||||
tracing::trace_state_ptr trace_state) {
|
||||
return read_required_rows(proxy, *keys, std::move(ranges), local, options, timeout, std::move(trace_state)).then(
|
||||
return read_required_rows(proxy, *keys, std::move(ranges), local, options, std::move(trace_state)).then(
|
||||
[this, &options, now] (auto rows) {
|
||||
return make_ready_future<std::unique_ptr<update_parameters>>(
|
||||
std::make_unique<update_parameters>(s, options,
|
||||
@@ -276,7 +275,6 @@ modification_statement::read_required_rows(
|
||||
lw_shared_ptr<query::clustering_row_ranges> ranges,
|
||||
bool local,
|
||||
const query_options& options,
|
||||
db::timeout_clock::time_point timeout,
|
||||
tracing::trace_state_ptr trace_state) {
|
||||
if (!requires_read()) {
|
||||
return make_ready_future<update_parameters::prefetched_rows_type>(
|
||||
@@ -310,6 +308,7 @@ modification_statement::read_required_rows(
|
||||
query::partition_slice::option::collections_as_maps>());
|
||||
query::read_command cmd(s->id(), s->version(), ps, std::numeric_limits<uint32_t>::max());
|
||||
// FIXME: ignoring "local"
|
||||
auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
|
||||
return proxy.query(s, make_lw_shared(std::move(cmd)), std::move(keys),
|
||||
cl, {timeout, std::move(trace_state)}).then([this, ps] (auto qr) {
|
||||
return query::result_view::do_with(*qr.query_result, [&] (query::result_view v) {
|
||||
@@ -409,13 +408,12 @@ modification_statement::execute_without_condition(service::storage_proxy& proxy,
|
||||
db::validate_for_write(s->ks_name(), cl);
|
||||
}
|
||||
|
||||
auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
|
||||
return get_mutations(proxy, options, timeout, false, options.get_timestamp(qs), qs.get_trace_state()).then([this, cl, timeout, &proxy, &qs] (auto mutations) {
|
||||
return get_mutations(proxy, options, false, options.get_timestamp(qs), qs.get_trace_state()).then([this, cl, &proxy, &qs] (auto mutations) {
|
||||
if (mutations.empty()) {
|
||||
return now();
|
||||
}
|
||||
|
||||
return proxy.mutate_with_triggers(std::move(mutations), cl, timeout, false, qs.get_trace_state(), this->is_raw_counter_shard_write());
|
||||
return proxy.mutate_with_triggers(std::move(mutations), cl, false, qs.get_trace_state(), this->is_raw_counter_shard_write());
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -206,7 +206,6 @@ protected:
|
||||
lw_shared_ptr<query::clustering_row_ranges> ranges,
|
||||
bool local,
|
||||
const query_options& options,
|
||||
db::timeout_clock::time_point now,
|
||||
tracing::trace_state_ptr trace_state);
|
||||
private:
|
||||
future<::shared_ptr<cql_transport::messages::result_message>>
|
||||
@@ -350,7 +349,7 @@ public:
|
||||
* @return vector of the mutations
|
||||
* @throws invalid_request_exception on invalid requests
|
||||
*/
|
||||
future<std::vector<mutation>> get_mutations(service::storage_proxy& proxy, const query_options& options, db::timeout_clock::time_point timeout, bool local, int64_t now, tracing::trace_state_ptr trace_state);
|
||||
future<std::vector<mutation>> get_mutations(service::storage_proxy& proxy, const query_options& options, bool local, int64_t now, tracing::trace_state_ptr trace_state);
|
||||
|
||||
public:
|
||||
future<std::unique_ptr<update_parameters>> make_update_parameters(
|
||||
@@ -358,7 +357,6 @@ public:
|
||||
lw_shared_ptr<dht::partition_range_vector> keys,
|
||||
lw_shared_ptr<query::clustering_row_ranges> ranges,
|
||||
const query_options& options,
|
||||
db::timeout_clock::time_point timeout,
|
||||
bool local,
|
||||
int64_t now,
|
||||
tracing::trace_state_ptr trace_state);
|
||||
|
||||
@@ -87,7 +87,6 @@ private:
|
||||
::shared_ptr<attributes::raw> _attrs;
|
||||
::shared_ptr<term::raw> _json_value;
|
||||
bool _if_not_exists;
|
||||
bool _default_unset;
|
||||
public:
|
||||
/**
|
||||
* A parsed <code>INSERT JSON</code> statement.
|
||||
@@ -96,7 +95,7 @@ public:
|
||||
* @param json_value JSON string representing names and values
|
||||
* @param attrs additional attributes for statement (CL, timestamp, timeToLive)
|
||||
*/
|
||||
insert_json_statement(::shared_ptr<cf_name> name, ::shared_ptr<attributes::raw> attrs, ::shared_ptr<term::raw> json_value, bool if_not_exists, bool default_unset);
|
||||
insert_json_statement(::shared_ptr<cf_name> name, ::shared_ptr<attributes::raw> attrs, ::shared_ptr<term::raw> json_value, bool if_not_exists);
|
||||
|
||||
virtual ::shared_ptr<cql3::statements::modification_statement> prepare_internal(database& db, schema_ptr schema,
|
||||
::shared_ptr<variable_specifications> bound_names, std::unique_ptr<attributes> attrs, cql_stats& stats) override;
|
||||
|
||||
@@ -118,8 +118,7 @@ private:
|
||||
schema_ptr schema,
|
||||
::shared_ptr<variable_specifications> bound_names,
|
||||
::shared_ptr<selection::selection> selection,
|
||||
bool for_view = false,
|
||||
bool allow_filtering = false);
|
||||
bool for_view = false);
|
||||
|
||||
/** Returns a ::shared_ptr<term> for the limit or null if no limit is set */
|
||||
::shared_ptr<term> prepare_limit(database& db, ::shared_ptr<variable_specifications> bound_names);
|
||||
@@ -141,10 +140,6 @@ private:
|
||||
/** If ALLOW FILTERING was not specified, this verifies that it is not needed */
|
||||
void check_needs_filtering(::shared_ptr<restrictions::statement_restrictions> restrictions);
|
||||
|
||||
void ensure_filtering_columns_retrieval(database& db,
|
||||
::shared_ptr<selection::selection> selection,
|
||||
::shared_ptr<restrictions::statement_restrictions> restrictions);
|
||||
|
||||
bool contains_alias(::shared_ptr<column_identifier> name);
|
||||
|
||||
::shared_ptr<column_specification> limit_receiver();
|
||||
|
||||
@@ -45,7 +45,6 @@
|
||||
#include "transport/messages/result_message.hh"
|
||||
#include "cql3/selection/selection.hh"
|
||||
#include "cql3/util.hh"
|
||||
#include "cql3/restrictions/single_column_primary_key_restrictions.hh"
|
||||
#include "core/shared_ptr.hh"
|
||||
#include "query-result-reader.hh"
|
||||
#include "query_result_merger.hh"
|
||||
@@ -313,14 +312,13 @@ select_statement::make_partition_slice(const query_options& options)
|
||||
if (_is_reversed) {
|
||||
_opts.set(query::partition_slice::option::reversed);
|
||||
std::reverse(bounds.begin(), bounds.end());
|
||||
++_stats.reverse_queries;
|
||||
}
|
||||
return query::partition_slice(std::move(bounds),
|
||||
std::move(static_columns), std::move(regular_columns), _opts, nullptr, options.get_cql_serialization_format());
|
||||
}
|
||||
|
||||
int32_t select_statement::get_limit(const query_options& options) const {
|
||||
if (!_limit || _selection->is_aggregate()) {
|
||||
if (!_limit) {
|
||||
return std::numeric_limits<int32_t>::max();
|
||||
}
|
||||
|
||||
@@ -331,10 +329,9 @@ int32_t select_statement::get_limit(const query_options& options) const {
|
||||
if (val.is_unset_value()) {
|
||||
return std::numeric_limits<int32_t>::max();
|
||||
}
|
||||
return with_linearized(*val, [&] (bytes_view bv) {
|
||||
try {
|
||||
int32_type->validate(bv);
|
||||
auto l = value_cast<int32_t>(int32_type->deserialize(bv));
|
||||
int32_type->validate(*val);
|
||||
auto l = value_cast<int32_t>(int32_type->deserialize(*val));
|
||||
if (l <= 0) {
|
||||
throw exceptions::invalid_request_exception("LIMIT must be strictly positive");
|
||||
}
|
||||
@@ -342,7 +339,6 @@ int32_t select_statement::get_limit(const query_options& options) const {
|
||||
} catch (const marshal_exception& e) {
|
||||
throw exceptions::invalid_request_exception("Invalid limit value");
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
bool select_statement::needs_post_query_ordering() const {
|
||||
@@ -383,54 +379,45 @@ select_statement::do_execute(service::storage_proxy& proxy,
|
||||
int32_t limit = get_limit(options);
|
||||
auto now = gc_clock::now();
|
||||
|
||||
const bool restrictions_need_filtering = _restrictions->need_filtering();
|
||||
++_stats.reads;
|
||||
_stats.filtered_reads += restrictions_need_filtering;
|
||||
|
||||
auto command = ::make_lw_shared<query::read_command>(_schema->id(), _schema->version(),
|
||||
make_partition_slice(options), limit, now, tracing::make_trace_info(state.get_trace_state()), query::max_partitions, utils::UUID(), options.get_timestamp(state));
|
||||
|
||||
int32_t page_size = options.get_page_size();
|
||||
|
||||
_stats.unpaged_select_queries += page_size <= 0;
|
||||
|
||||
// An aggregation query will never be paged for the user, but we always page it internally to avoid OOM.
|
||||
// If the user provided a page_size we'll use that to page internally (because why not), otherwise we use our default
|
||||
// Note that if there are some nodes in the cluster with a version less than 2.0, we can't use paging (CASSANDRA-6707).
|
||||
const bool aggregate = _selection->is_aggregate();
|
||||
const bool nonpaged_filtering = restrictions_need_filtering && page_size <= 0;
|
||||
if (aggregate || nonpaged_filtering) {
|
||||
auto aggregate = _selection->is_aggregate();
|
||||
if (aggregate && page_size <= 0) {
|
||||
page_size = DEFAULT_COUNT_PAGE_SIZE;
|
||||
}
|
||||
|
||||
auto key_ranges = _restrictions->get_partition_key_ranges(options);
|
||||
|
||||
if (!aggregate && !restrictions_need_filtering && (page_size <= 0
|
||||
|| !service::pager::query_pagers::may_need_paging(*_schema, page_size,
|
||||
if (!aggregate && (page_size <= 0
|
||||
|| !service::pager::query_pagers::may_need_paging(page_size,
|
||||
*command, key_ranges))) {
|
||||
return execute(proxy, command, std::move(key_ranges), state, options, now);
|
||||
}
|
||||
|
||||
command->slice.options.set<query::partition_slice::option::allow_short_read>();
|
||||
auto timeout_duration = options.get_timeout_config().*get_timeout_config_selector();
|
||||
auto timeout = options.get_timeout_config().*get_timeout_config_selector();
|
||||
auto p = service::pager::query_pagers::pager(_schema, _selection,
|
||||
state, options, command, std::move(key_ranges), _stats, restrictions_need_filtering ? _restrictions : nullptr);
|
||||
state, options, timeout, command, std::move(key_ranges));
|
||||
|
||||
if (aggregate || nonpaged_filtering) {
|
||||
if (aggregate) {
|
||||
return do_with(
|
||||
cql3::selection::result_set_builder(*_selection, now,
|
||||
options.get_cql_serialization_format()),
|
||||
[this, p, page_size, now, timeout_duration, restrictions_need_filtering](auto& builder) {
|
||||
[this, p, page_size, now](auto& builder) {
|
||||
return do_until([p] {return p->is_exhausted();},
|
||||
[p, &builder, page_size, now, timeout_duration] {
|
||||
auto timeout = db::timeout_clock::now() + timeout_duration;
|
||||
return p->fetch_page(builder, page_size, now, timeout);
|
||||
[p, &builder, page_size, now] {
|
||||
return p->fetch_page(builder, page_size, now);
|
||||
}
|
||||
).then([this, &builder, restrictions_need_filtering] {
|
||||
).then([this, &builder] {
|
||||
auto rs = builder.build();
|
||||
if (restrictions_need_filtering) {
|
||||
_stats.filtered_rows_matched_total += rs->size();
|
||||
}
|
||||
update_stats_rows_read(rs->size());
|
||||
auto msg = ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
|
||||
return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(std::move(msg));
|
||||
@@ -444,18 +431,12 @@ select_statement::do_execute(service::storage_proxy& proxy,
|
||||
" you must either remove the ORDER BY or the IN and sort client side, or disable paging for this query");
|
||||
}
|
||||
|
||||
auto timeout = db::timeout_clock::now() + timeout_duration;
|
||||
if (_selection->is_trivial() && !restrictions_need_filtering) {
|
||||
return p->fetch_page_generator(page_size, now, timeout, _stats).then([this, p, limit] (result_generator generator) {
|
||||
auto meta = [&] () -> shared_ptr<const cql3::metadata> {
|
||||
if (!p->is_exhausted()) {
|
||||
auto meta = make_shared<metadata>(*_selection->get_result_metadata());
|
||||
meta->set_paging_state(p->state());
|
||||
return meta;
|
||||
} else {
|
||||
return _selection->get_result_metadata();
|
||||
}
|
||||
}();
|
||||
if (_selection->is_trivial()) {
|
||||
return p->fetch_page_generator(page_size, now, _stats).then([this, p, limit] (result_generator generator) {
|
||||
auto meta = make_shared<metadata>(*_selection->get_result_metadata());
|
||||
if (!p->is_exhausted()) {
|
||||
meta->set_has_more_pages(p->state());
|
||||
}
|
||||
|
||||
return shared_ptr<cql_transport::messages::result_message>(
|
||||
make_shared<cql_transport::messages::result_message::rows>(result(std::move(generator), std::move(meta)))
|
||||
@@ -463,220 +444,19 @@ select_statement::do_execute(service::storage_proxy& proxy,
|
||||
});
|
||||
}
|
||||
|
||||
return p->fetch_page(page_size, now, timeout).then(
|
||||
[this, p, &options, now, restrictions_need_filtering](std::unique_ptr<cql3::result_set> rs) {
|
||||
return p->fetch_page(page_size, now).then(
|
||||
[this, p, &options, limit, now](std::unique_ptr<cql3::result_set> rs) {
|
||||
|
||||
if (!p->is_exhausted()) {
|
||||
rs->get_metadata().set_paging_state(p->state());
|
||||
rs->get_metadata().set_has_more_pages(p->state());
|
||||
}
|
||||
|
||||
if (restrictions_need_filtering) {
|
||||
_stats.filtered_rows_matched_total += rs->size();
|
||||
}
|
||||
update_stats_rows_read(rs->size());
|
||||
auto msg = ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
|
||||
return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(std::move(msg));
|
||||
});
|
||||
}
|
||||
|
||||
template<typename KeyType>
|
||||
GCC6_CONCEPT(
|
||||
requires (std::is_same_v<KeyType, partition_key> || std::is_same_v<KeyType, clustering_key_prefix>)
|
||||
)
|
||||
static KeyType
|
||||
generate_base_key_from_index_pk(const partition_key& index_pk, const clustering_key& index_ck, const schema& base_schema, const schema& view_schema) {
|
||||
const auto& base_columns = std::is_same_v<KeyType, partition_key> ? base_schema.partition_key_columns() : base_schema.clustering_key_columns();
|
||||
std::vector<bytes_view> exploded_base_key;
|
||||
exploded_base_key.reserve(base_columns.size());
|
||||
|
||||
for (const column_definition& base_col : base_columns) {
|
||||
const column_definition* view_col = view_schema.view_info()->view_column(base_col);
|
||||
if (view_col->is_partition_key()) {
|
||||
exploded_base_key.push_back(index_pk.get_component(view_schema, view_col->id));
|
||||
} else {
|
||||
exploded_base_key.push_back(index_ck.get_component(view_schema, view_col->id));
|
||||
}
|
||||
}
|
||||
return KeyType::from_range(exploded_base_key);
|
||||
}
|
||||
|
||||
lw_shared_ptr<query::read_command>
|
||||
indexed_table_select_statement::prepare_command_for_base_query(const query_options& options, service::query_state& state, gc_clock::time_point now, bool use_paging) {
|
||||
lw_shared_ptr<query::read_command> cmd = ::make_lw_shared<query::read_command>(
|
||||
_schema->id(),
|
||||
_schema->version(),
|
||||
make_partition_slice(options),
|
||||
get_limit(options),
|
||||
now,
|
||||
tracing::make_trace_info(state.get_trace_state()),
|
||||
query::max_partitions,
|
||||
utils::UUID(),
|
||||
options.get_timestamp(state));
|
||||
if (use_paging) {
|
||||
cmd->slice.options.set<query::partition_slice::option::allow_short_read>();
|
||||
cmd->slice.options.set<query::partition_slice::option::send_partition_key>();
|
||||
if (_schema->clustering_key_size() > 0) {
|
||||
cmd->slice.options.set<query::partition_slice::option::send_clustering_key>();
|
||||
}
|
||||
}
|
||||
return cmd;
|
||||
}
|
||||
|
||||
future<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>
|
||||
indexed_table_select_statement::do_execute_base_query(
|
||||
service::storage_proxy& proxy,
|
||||
dht::partition_range_vector&& partition_ranges,
|
||||
service::query_state& state,
|
||||
const query_options& options,
|
||||
gc_clock::time_point now,
|
||||
::shared_ptr<const service::pager::paging_state> paging_state) {
|
||||
auto cmd = prepare_command_for_base_query(options, state, now, bool(paging_state));
|
||||
auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
|
||||
dht::partition_range_vector per_vnode_ranges;
|
||||
per_vnode_ranges.reserve(partition_ranges.size());
|
||||
for (auto& pr : partition_ranges) {
|
||||
auto restricted_ranges = proxy.get_restricted_ranges(*_schema, pr);
|
||||
std::move(restricted_ranges.begin(), restricted_ranges.end(), std::back_inserter(per_vnode_ranges));
|
||||
}
|
||||
|
||||
struct base_query_state {
|
||||
query::result_merger merger;
|
||||
dht::partition_range_vector per_vnode_ranges;
|
||||
dht::partition_range_vector::iterator current_partition_range;
|
||||
base_query_state(uint32_t row_limit, dht::partition_range_vector&& ranges)
|
||||
: merger(row_limit * ranges.size(), query::max_partitions)
|
||||
, per_vnode_ranges(std::move(ranges))
|
||||
, current_partition_range(per_vnode_ranges.begin())
|
||||
{}
|
||||
base_query_state(base_query_state&&) = default;
|
||||
base_query_state(const base_query_state&) = delete;
|
||||
};
|
||||
|
||||
base_query_state query_state{cmd->row_limit, std::move(per_vnode_ranges)};
|
||||
return do_with(std::move(query_state), [this, &proxy, &state, &options, cmd, timeout] (auto&& query_state) {
|
||||
auto &merger = query_state.merger;
|
||||
auto &ranges = query_state.per_vnode_ranges;
|
||||
auto &range_it = query_state.current_partition_range;
|
||||
return repeat([this, &ranges, &range_it, &merger, &proxy, &state, &options, cmd, timeout]() {
|
||||
// Starting with 1 range, we check if the result was a short read, and if not,
|
||||
// we continue exponentially, asking for 2x more ranges than before
|
||||
auto range_it_end = std::min(range_it + std::distance(ranges.begin(), range_it) + 1, ranges.end());
|
||||
dht::partition_range_vector prange(range_it, range_it_end);
|
||||
auto command = ::make_lw_shared<query::read_command>(*cmd);
|
||||
auto old_paging_state = options.get_paging_state();
|
||||
if (old_paging_state && range_it == ranges.begin()) {
|
||||
auto base_pk = generate_base_key_from_index_pk<partition_key>(old_paging_state->get_partition_key(),
|
||||
*old_paging_state->get_clustering_key(), *_schema, *_view_schema);
|
||||
auto base_ck = generate_base_key_from_index_pk<clustering_key>(old_paging_state->get_partition_key(),
|
||||
*old_paging_state->get_clustering_key(), *_schema, *_view_schema);
|
||||
command->slice.set_range(*_schema, base_pk,
|
||||
std::vector<query::clustering_range>{query::clustering_range::make_starting_with(range_bound<clustering_key>(base_ck, false))});
|
||||
}
|
||||
return proxy.query(_schema, command, std::move(prange), options.get_consistency(), {timeout, state.get_trace_state()})
|
||||
.then([&range_it, range_it_end = std::move(range_it_end), &ranges, &merger] (service::storage_proxy::coordinator_query_result qr) {
|
||||
bool is_short_read = qr.query_result->is_short_read();
|
||||
merger(std::move(qr.query_result));
|
||||
range_it = range_it_end;
|
||||
return stop_iteration(is_short_read || range_it == ranges.end());
|
||||
});
|
||||
}).then([&merger]() {
|
||||
return merger.get();
|
||||
});
|
||||
}).then([cmd] (foreign_ptr<lw_shared_ptr<query::result>> result) mutable {
|
||||
return make_ready_future<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>(std::move(result), std::move(cmd));
|
||||
});
|
||||
}
|
||||
|
||||
future<shared_ptr<cql_transport::messages::result_message>>
|
||||
indexed_table_select_statement::execute_base_query(
|
||||
service::storage_proxy& proxy,
|
||||
dht::partition_range_vector&& partition_ranges,
|
||||
service::query_state& state,
|
||||
const query_options& options,
|
||||
gc_clock::time_point now,
|
||||
::shared_ptr<const service::pager::paging_state> paging_state) {
|
||||
return do_execute_base_query(proxy, std::move(partition_ranges), state, options, now, paging_state).then(
|
||||
[this, &proxy, &state, &options, now, paging_state = std::move(paging_state)] (foreign_ptr<lw_shared_ptr<query::result>> result, lw_shared_ptr<query::read_command> cmd) {
|
||||
return process_base_query_results(std::move(result), std::move(cmd), proxy, state, options, now, std::move(paging_state));
|
||||
});
|
||||
}
|
||||
|
||||
future<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>
|
||||
indexed_table_select_statement::do_execute_base_query(
|
||||
service::storage_proxy& proxy,
|
||||
std::vector<primary_key>&& primary_keys,
|
||||
service::query_state& state,
|
||||
const query_options& options,
|
||||
gc_clock::time_point now,
|
||||
::shared_ptr<const service::pager::paging_state> paging_state) {
|
||||
auto cmd = prepare_command_for_base_query(options, state, now, bool(paging_state));
|
||||
auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
|
||||
|
||||
struct base_query_state {
|
||||
query::result_merger merger;
|
||||
std::vector<primary_key> primary_keys;
|
||||
std::vector<primary_key>::iterator current_primary_key;
|
||||
base_query_state(uint32_t row_limit, std::vector<primary_key>&& keys)
|
||||
: merger(row_limit, query::max_partitions)
|
||||
, primary_keys(std::move(keys))
|
||||
, current_primary_key(primary_keys.begin())
|
||||
{}
|
||||
base_query_state(base_query_state&&) = default;
|
||||
base_query_state(const base_query_state&) = delete;
|
||||
};
|
||||
|
||||
base_query_state query_state{cmd->row_limit, std::move(primary_keys)};
|
||||
return do_with(std::move(query_state), [this, &proxy, &state, &options, cmd, timeout] (auto&& query_state) {
|
||||
auto &merger = query_state.merger;
|
||||
auto &keys = query_state.primary_keys;
|
||||
auto &key_it = query_state.current_primary_key;
|
||||
return repeat([this, &keys, &key_it, &merger, &proxy, &state, &options, cmd, timeout]() {
|
||||
// Starting with 1 key, we check if the result was a short read, and if not,
|
||||
// we continue exponentially, asking for 2x more keys than before
|
||||
auto key_it_end = std::min(key_it + std::distance(keys.begin(), key_it) + 1, keys.end());
|
||||
auto command = ::make_lw_shared<query::read_command>(*cmd);
|
||||
|
||||
query::result_merger oneshot_merger(cmd->row_limit, query::max_partitions);
|
||||
return map_reduce(key_it, key_it_end, [this, &proxy, &state, &options, cmd, timeout] (auto& key) {
|
||||
auto command = ::make_lw_shared<query::read_command>(*cmd);
|
||||
// for each partition, read just one clustering row (TODO: can
|
||||
// get all needed rows of one partition at once.)
|
||||
command->slice._row_ranges.clear();
|
||||
if (key.clustering) {
|
||||
command->slice._row_ranges.push_back(query::clustering_range::make_singular(key.clustering));
|
||||
}
|
||||
return proxy.query(_schema, command, {dht::partition_range::make_singular(key.partition)}, options.get_consistency(), {timeout, state.get_trace_state()})
|
||||
.then([] (service::storage_proxy::coordinator_query_result qr) {
|
||||
return std::move(qr.query_result);
|
||||
});
|
||||
}, std::move(oneshot_merger)).then([&key_it, key_it_end = std::move(key_it_end), &keys, &merger] (foreign_ptr<lw_shared_ptr<query::result>> result) {
|
||||
bool is_short_read = result->is_short_read();
|
||||
merger(std::move(result));
|
||||
key_it = key_it_end;
|
||||
return stop_iteration(is_short_read || key_it == keys.end());
|
||||
});
|
||||
}).then([&merger] () {
|
||||
return merger.get();
|
||||
}).then([cmd] (foreign_ptr<lw_shared_ptr<query::result>> result) mutable {
|
||||
return make_ready_future<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>(std::move(result), std::move(cmd));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
future<shared_ptr<cql_transport::messages::result_message>>
|
||||
indexed_table_select_statement::execute_base_query(
|
||||
service::storage_proxy& proxy,
|
||||
std::vector<primary_key>&& primary_keys,
|
||||
service::query_state& state,
|
||||
const query_options& options,
|
||||
gc_clock::time_point now,
|
||||
::shared_ptr<const service::pager::paging_state> paging_state) {
|
||||
return do_execute_base_query(proxy, std::move(primary_keys), state, options, now, paging_state).then(
|
||||
[this, &proxy, &state, &options, now, paging_state = std::move(paging_state)] (foreign_ptr<lw_shared_ptr<query::result>> result, lw_shared_ptr<query::read_command> cmd) {
|
||||
return process_base_query_results(std::move(result), std::move(cmd), proxy, state, options, now, std::move(paging_state));
|
||||
});
|
||||
}
|
||||
|
||||
future<shared_ptr<cql_transport::messages::result_message>>
|
||||
select_statement::execute(service::storage_proxy& proxy,
|
||||
lw_shared_ptr<query::read_command> cmd,
|
||||
@@ -716,21 +496,52 @@ select_statement::execute(service::storage_proxy& proxy,
|
||||
}
|
||||
}
|
||||
|
||||
shared_ptr<cql_transport::messages::result_message>
|
||||
indexed_table_select_statement::process_base_query_results(
|
||||
foreign_ptr<lw_shared_ptr<query::result>> results,
|
||||
lw_shared_ptr<query::read_command> cmd,
|
||||
service::storage_proxy& proxy,
|
||||
service::query_state& state,
|
||||
const query_options& options,
|
||||
gc_clock::time_point now,
|
||||
::shared_ptr<const service::pager::paging_state> paging_state)
|
||||
// Function for fetching the selected columns from a list of clustering rows.
|
||||
// It is currently used only in our Secondary Index implementation - ordinary
|
||||
// CQL SELECT statements do not have the syntax to request a list of rows.
|
||||
// FIXME: The current implementation is very inefficient - it requests each
|
||||
// row separately (and all in parallel). Even multiple rows from a single
|
||||
// partition are requested separately. This last case can be easily improved,
|
||||
// but to implement the general case (multiple rows from multiple partitions)
|
||||
// efficiently, we will need more support from other layers.
|
||||
// Keys are ordered in token order (see #3423)
|
||||
future<shared_ptr<cql_transport::messages::result_message>>
|
||||
select_statement::execute(service::storage_proxy& proxy,
|
||||
lw_shared_ptr<query::read_command> cmd,
|
||||
std::vector<primary_key>&& primary_keys,
|
||||
service::query_state& state,
|
||||
const query_options& options,
|
||||
gc_clock::time_point now)
|
||||
{
|
||||
if (paging_state) {
|
||||
paging_state = generate_view_paging_state_from_base_query_results(paging_state, results, proxy, state, options);
|
||||
_selection->get_result_metadata()->maybe_set_paging_state(std::move(paging_state));
|
||||
}
|
||||
return process_results(std::move(results), std::move(cmd), options, now);
|
||||
// FIXME: pass the timeout from caller. The query has already started
|
||||
// earlier (with read_posting_list()), not now.
|
||||
auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
|
||||
return do_with(std::move(primary_keys), [this, &proxy, &state, &options, cmd, timeout] (auto& keys) {
|
||||
assert(cmd->partition_limit == query::max_partitions);
|
||||
query::result_merger merger(cmd->row_limit, query::max_partitions);
|
||||
// there is no point to produce rows beyond the first row_limit:
|
||||
auto end = keys.size() <= cmd->row_limit ? keys.end() : keys.begin() + cmd->row_limit;
|
||||
return map_reduce(keys.begin(), end, [this, &proxy, &state, &options, cmd, timeout] (auto& key) {
|
||||
auto command = ::make_lw_shared<query::read_command>(*cmd);
|
||||
// for each partition, read just one clustering row (TODO: can
|
||||
// get all needed rows of one partition at once.)
|
||||
command->slice._row_ranges.clear();
|
||||
if (key.clustering) {
|
||||
command->slice._row_ranges.push_back(query::clustering_range::make_singular(key.clustering));
|
||||
}
|
||||
return proxy.query(_schema,
|
||||
command,
|
||||
{dht::partition_range::make_singular(key.partition)},
|
||||
options.get_consistency(),
|
||||
{timeout, state.get_trace_state()}).then([] (service::storage_proxy::coordinator_query_result qr) {
|
||||
return std::move(qr.query_result);
|
||||
});
|
||||
}, std::move(merger));
|
||||
}).then([this, &options, now, cmd] (auto result) {
|
||||
// note that cmd here still has the garbage clustering range in slice,
|
||||
// but process_results() ignores this part of the slice setting.
|
||||
return this->process_results(std::move(result), cmd, options, now);
|
||||
});
|
||||
}
|
||||
|
||||
shared_ptr<cql_transport::messages::result_message>
|
||||
@@ -739,8 +550,7 @@ select_statement::process_results(foreign_ptr<lw_shared_ptr<query::result>> resu
|
||||
const query_options& options,
|
||||
gc_clock::time_point now)
|
||||
{
|
||||
const bool restrictions_need_filtering = _restrictions->need_filtering();
|
||||
const bool fast_path = !needs_post_query_ordering() && _selection->is_trivial() && !restrictions_need_filtering;
|
||||
bool fast_path = !needs_post_query_ordering() && _selection->is_trivial();
|
||||
if (fast_path) {
|
||||
return make_shared<cql_transport::messages::result_message::rows>(result(
|
||||
result_generator(_schema, std::move(results), std::move(cmd), _selection, _stats),
|
||||
@@ -750,17 +560,9 @@ select_statement::process_results(foreign_ptr<lw_shared_ptr<query::result>> resu
|
||||
|
||||
cql3::selection::result_set_builder builder(*_selection, now,
|
||||
options.get_cql_serialization_format());
|
||||
if (restrictions_need_filtering) {
|
||||
results->ensure_counts();
|
||||
_stats.filtered_rows_read_total += *results->row_count();
|
||||
query::result_view::consume(*results, cmd->slice,
|
||||
cql3::selection::result_set_builder::visitor(builder, *_schema,
|
||||
*_selection, cql3::selection::result_set_builder::restrictions_filter(_restrictions, options, cmd->row_limit)));
|
||||
} else {
|
||||
query::result_view::consume(*results, cmd->slice,
|
||||
cql3::selection::result_set_builder::visitor(builder, *_schema,
|
||||
*_selection));
|
||||
}
|
||||
query::result_view::consume(*results, cmd->slice,
|
||||
cql3::selection::result_set_builder::visitor(builder, *_schema,
|
||||
*_selection));
|
||||
auto rs = builder.build();
|
||||
|
||||
if (needs_post_query_ordering()) {
|
||||
@@ -771,7 +573,6 @@ select_statement::process_results(foreign_ptr<lw_shared_ptr<query::result>> resu
|
||||
rs->trim(cmd->row_limit);
|
||||
}
|
||||
update_stats_rows_read(rs->size());
|
||||
_stats.filtered_rows_matched_total += restrictions_need_filtering ? rs->size() : 0;
|
||||
return ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
|
||||
}
|
||||
|
||||
@@ -800,16 +601,10 @@ indexed_table_select_statement::prepare(database& db,
|
||||
ordering_comparator_type ordering_comparator,
|
||||
::shared_ptr<term> limit, cql_stats &stats)
|
||||
{
|
||||
auto& sim = db.find_column_family(schema).get_index_manager();
|
||||
auto index_opt = restrictions->find_idx(sim);
|
||||
auto index_opt = find_idx(db, schema, restrictions);
|
||||
if (!index_opt) {
|
||||
throw std::runtime_error("No index found.");
|
||||
}
|
||||
|
||||
const auto& im = index_opt->metadata();
|
||||
sstring index_table_name = im.name() + "_index";
|
||||
schema_ptr view_schema = db.find_schema(schema->ks_name(), index_table_name);
|
||||
|
||||
return ::make_shared<cql3::statements::indexed_table_select_statement>(
|
||||
schema,
|
||||
bound_terms,
|
||||
@@ -820,11 +615,28 @@ indexed_table_select_statement::prepare(database& db,
|
||||
std::move(ordering_comparator),
|
||||
limit,
|
||||
stats,
|
||||
*index_opt,
|
||||
view_schema);
|
||||
*index_opt);
|
||||
|
||||
}
|
||||
|
||||
|
||||
stdx::optional<secondary_index::index> indexed_table_select_statement::find_idx(database& db,
|
||||
schema_ptr schema,
|
||||
::shared_ptr<restrictions::statement_restrictions> restrictions)
|
||||
{
|
||||
auto& sim = db.find_column_family(schema).get_index_manager();
|
||||
for (::shared_ptr<cql3::restrictions::restrictions> restriction : restrictions->index_restrictions()) {
|
||||
for (const auto& cdef : restriction->get_column_defs()) {
|
||||
for (auto index : sim.list_indexes()) {
|
||||
if (index.depends_on(*cdef)) {
|
||||
return stdx::make_optional<secondary_index::index>(std::move(index));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return stdx::nullopt;
|
||||
}
|
||||
|
||||
indexed_table_select_statement::indexed_table_select_statement(schema_ptr schema, uint32_t bound_terms,
|
||||
::shared_ptr<parameters> parameters,
|
||||
::shared_ptr<selection::selection> selection,
|
||||
@@ -832,74 +644,16 @@ indexed_table_select_statement::indexed_table_select_statement(schema_ptr schema
|
||||
bool is_reversed,
|
||||
ordering_comparator_type ordering_comparator,
|
||||
::shared_ptr<term> limit, cql_stats &stats,
|
||||
const secondary_index::index& index,
|
||||
schema_ptr view_schema)
|
||||
const secondary_index::index& index)
|
||||
: select_statement{schema, bound_terms, parameters, selection, restrictions, is_reversed, ordering_comparator, limit, stats}
|
||||
, _index{index}
|
||||
, _view_schema(view_schema)
|
||||
{}
|
||||
|
||||
template<typename KeyType>
|
||||
GCC6_CONCEPT(
|
||||
requires (std::is_same_v<KeyType, partition_key> || std::is_same_v<KeyType, clustering_key_prefix>)
|
||||
)
|
||||
static void append_base_key_to_index_ck(std::vector<bytes_view>& exploded_index_ck, const KeyType& base_key, const column_definition& index_cdef) {
|
||||
auto key_view = base_key.view();
|
||||
auto begin = key_view.begin();
|
||||
if ((std::is_same_v<KeyType, partition_key> && index_cdef.is_partition_key())
|
||||
|| (std::is_same_v<KeyType, clustering_key_prefix> && index_cdef.is_clustering_key())) {
|
||||
auto key_position = std::next(begin, index_cdef.id);
|
||||
std::move(begin, key_position, std::back_inserter(exploded_index_ck));
|
||||
begin = std::next(key_position);
|
||||
}
|
||||
std::move(begin, key_view.end(), std::back_inserter(exploded_index_ck));
|
||||
}
|
||||
|
||||
::shared_ptr<const service::pager::paging_state> indexed_table_select_statement::generate_view_paging_state_from_base_query_results(::shared_ptr<const service::pager::paging_state> paging_state,
|
||||
const foreign_ptr<lw_shared_ptr<query::result>>& results, service::storage_proxy& proxy, service::query_state& state, const query_options& options) const {
|
||||
const column_definition* cdef = _schema->get_column_definition(to_bytes(_index.target_column()));
|
||||
if (!cdef) {
|
||||
throw exceptions::invalid_request_exception("Indexed column not found in schema");
|
||||
}
|
||||
|
||||
//NOTICE(sarna): Executing indexed_table branch implies there was at least 1 index restriction present
|
||||
bytes_opt index_pk_value = _restrictions->index_restrictions().front()->value_for(*cdef, options);
|
||||
auto index_pk = partition_key::from_single_value(*_view_schema, *index_pk_value);
|
||||
auto result_view = query::result_view(*results);
|
||||
if (!results->row_count() || *results->row_count() == 0) {
|
||||
return std::move(paging_state);
|
||||
}
|
||||
auto [last_base_pk, last_base_ck] = result_view.get_last_partition_and_clustering_key();
|
||||
|
||||
std::vector<bytes_view> exploded_index_ck;
|
||||
exploded_index_ck.reserve(_view_schema->clustering_key_size());
|
||||
|
||||
dht::i_partitioner& partitioner = dht::global_partitioner();
|
||||
bytes token_bytes = partitioner.token_to_bytes(partitioner.get_token(*_schema, last_base_pk));
|
||||
exploded_index_ck.push_back(bytes_view(token_bytes));
|
||||
append_base_key_to_index_ck<partition_key>(exploded_index_ck, last_base_pk, *cdef);
|
||||
if (last_base_ck) {
|
||||
append_base_key_to_index_ck<clustering_key>(exploded_index_ck, *last_base_ck, *cdef);
|
||||
}
|
||||
|
||||
auto index_ck = clustering_key::from_range(std::move(exploded_index_ck));
|
||||
if (partition_key::tri_compare(*_view_schema)(paging_state->get_partition_key(), index_pk) == 0
|
||||
&& (!paging_state->get_clustering_key() || clustering_key::prefix_equal_tri_compare(*_view_schema)(*paging_state->get_clustering_key(), index_ck) == 0)) {
|
||||
return std::move(paging_state);
|
||||
}
|
||||
|
||||
auto paging_state_copy = ::make_shared<service::pager::paging_state>(service::pager::paging_state(*paging_state));
|
||||
paging_state_copy->set_partition_key(std::move(index_pk));
|
||||
paging_state_copy->set_clustering_key(std::move(index_ck));
|
||||
return std::move(paging_state_copy);
|
||||
}
|
||||
|
||||
future<shared_ptr<cql_transport::messages::result_message>>
|
||||
indexed_table_select_statement::do_execute(service::storage_proxy& proxy,
|
||||
service::query_state& state,
|
||||
const query_options& options)
|
||||
{
|
||||
tracing::add_table_name(state.get_trace_state(), _view_schema->ks_name(), _view_schema->cf_name());
|
||||
tracing::add_table_name(state.get_trace_state(), keyspace(), column_family());
|
||||
|
||||
auto cl = options.get_consistency();
|
||||
@@ -914,8 +668,6 @@ indexed_table_select_statement::do_execute(service::storage_proxy& proxy,
|
||||
|
||||
assert(_restrictions->uses_secondary_indexing());
|
||||
|
||||
_stats.unpaged_select_queries += options.get_page_size() <= 0;
|
||||
|
||||
// Secondary index search has two steps: 1. use the index table to find a
|
||||
// list of primary keys matching the query. 2. read the rows matching
|
||||
// these primary keys from the base table and return the selected columns.
|
||||
@@ -948,199 +700,123 @@ indexed_table_select_statement::do_execute(service::storage_proxy& proxy,
|
||||
}
|
||||
}
|
||||
|
||||
// Aggregated and paged filtering needs to aggregate the results from all pages
|
||||
// in order to avoid returning partial per-page results (issue #4540).
|
||||
// It's a little bit more complicated than regular aggregation, because each paging state
|
||||
// needs to be translated between the base table and the underlying view.
|
||||
// The routine below keeps fetching pages from the underlying view, which are then
|
||||
// used to fetch base rows, which go straight to the result set builder.
|
||||
// A local, internal copy of query_options is kept in order to keep updating
|
||||
// the paging state between requesting data from replicas.
|
||||
const bool aggregate = _selection->is_aggregate();
|
||||
if (aggregate) {
|
||||
const bool restrictions_need_filtering = _restrictions->need_filtering();
|
||||
return do_with(cql3::selection::result_set_builder(*_selection, now, options.get_cql_serialization_format()), std::make_unique<cql3::query_options>(cql3::query_options(options)),
|
||||
[this, &options, &proxy, &state, now, whole_partitions, partition_slices, restrictions_need_filtering] (cql3::selection::result_set_builder& builder, std::unique_ptr<cql3::query_options>& internal_options) {
|
||||
// page size is set to the internal count page size, regardless of the user-provided value
|
||||
internal_options.reset(new cql3::query_options(std::move(internal_options), options.get_paging_state(), DEFAULT_COUNT_PAGE_SIZE));
|
||||
return repeat([this, &builder, &options, &internal_options, &proxy, &state, now, whole_partitions, partition_slices, restrictions_need_filtering] () {
|
||||
auto consume_results = [this, &builder, &options, &internal_options, restrictions_need_filtering] (foreign_ptr<lw_shared_ptr<query::result>> results, lw_shared_ptr<query::read_command> cmd) {
|
||||
if (restrictions_need_filtering) {
|
||||
query::result_view::consume(*results, cmd->slice, cql3::selection::result_set_builder::visitor(builder, *_schema, *_selection,
|
||||
cql3::selection::result_set_builder::restrictions_filter(_restrictions, options, cmd->row_limit)));
|
||||
} else {
|
||||
query::result_view::consume(*results, cmd->slice, cql3::selection::result_set_builder::visitor(builder, *_schema, *_selection));
|
||||
}
|
||||
};
|
||||
|
||||
if (whole_partitions || partition_slices) {
|
||||
return find_index_partition_ranges(proxy, state, *internal_options).then(
|
||||
[this, now, &state, &internal_options, &proxy, consume_results = std::move(consume_results)] (dht::partition_range_vector partition_ranges, ::shared_ptr<const service::pager::paging_state> paging_state) {
|
||||
bool has_more_pages = paging_state && paging_state->get_remaining() > 0;
|
||||
internal_options.reset(new cql3::query_options(std::move(internal_options), paging_state ? ::make_shared<service::pager::paging_state>(*paging_state) : nullptr));
|
||||
return do_execute_base_query(proxy, std::move(partition_ranges), state, *internal_options, now, std::move(paging_state)).then(consume_results).then([has_more_pages] {
|
||||
return stop_iteration(!has_more_pages);
|
||||
});
|
||||
});
|
||||
} else {
|
||||
return find_index_clustering_rows(proxy, state, *internal_options).then(
|
||||
[this, now, &state, &internal_options, &proxy, consume_results = std::move(consume_results)] (std::vector<primary_key> primary_keys, ::shared_ptr<const service::pager::paging_state> paging_state) {
|
||||
bool has_more_pages = paging_state && paging_state->get_remaining() > 0;
|
||||
internal_options.reset(new cql3::query_options(std::move(internal_options), paging_state ? ::make_shared<service::pager::paging_state>(*paging_state) : nullptr));
|
||||
return this->do_execute_base_query(proxy, std::move(primary_keys), state, *internal_options, now, std::move(paging_state)).then(consume_results).then([has_more_pages] {
|
||||
return stop_iteration(!has_more_pages);
|
||||
});
|
||||
});
|
||||
}
|
||||
}).then([this, &builder, restrictions_need_filtering] () {
|
||||
auto rs = builder.build();
|
||||
update_stats_rows_read(rs->size());
|
||||
_stats.filtered_rows_matched_total += restrictions_need_filtering ? rs->size() : 0;
|
||||
auto msg = ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
|
||||
return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(std::move(msg));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
if (whole_partitions || partition_slices) {
|
||||
// In this case, can use our normal query machinery, which retrieves
|
||||
// entire partitions or the same slice for many partitions.
|
||||
return find_index_partition_ranges(proxy, state, options).then([limit, now, &state, &options, &proxy, this] (dht::partition_range_vector partition_ranges, ::shared_ptr<const service::pager::paging_state> paging_state) {
|
||||
return this->execute_base_query(proxy, std::move(partition_ranges), state, options, now, std::move(paging_state));
|
||||
return find_index_partition_ranges(proxy, state, options).then([limit, now, &state, &options, &proxy, this] (dht::partition_range_vector partition_ranges) {
|
||||
auto command = ::make_lw_shared<query::read_command>(
|
||||
_schema->id(),
|
||||
_schema->version(),
|
||||
make_partition_slice(options),
|
||||
limit,
|
||||
now,
|
||||
tracing::make_trace_info(state.get_trace_state()),
|
||||
query::max_partitions,
|
||||
utils::UUID(),
|
||||
options.get_timestamp(state));
|
||||
return this->execute(proxy, command, std::move(partition_ranges), state, options, now);
|
||||
});
|
||||
} else {
|
||||
// In this case, we need to retrieve a list of rows (not entire
|
||||
// partitions) and then retrieve those specific rows.
|
||||
return find_index_clustering_rows(proxy, state, options).then([limit, now, &state, &options, &proxy, this] (std::vector<primary_key> primary_keys, ::shared_ptr<const service::pager::paging_state> paging_state) {
|
||||
return this->execute_base_query(proxy, std::move(primary_keys), state, options, now, std::move(paging_state));
|
||||
return find_index_clustering_rows(proxy, state, options).then([limit, now, &state, &options, &proxy, this] (std::vector<primary_key> primary_keys) {
|
||||
auto command = ::make_lw_shared<query::read_command>(
|
||||
_schema->id(),
|
||||
_schema->version(),
|
||||
// Note: the "clustering bounds" set in make_partition_slice()
|
||||
// here is garbage, and will be overridden by execute() anyway
|
||||
make_partition_slice(options),
|
||||
limit,
|
||||
now,
|
||||
tracing::make_trace_info(state.get_trace_state()),
|
||||
query::max_partitions,
|
||||
utils::UUID(),
|
||||
options.get_timestamp(state));
|
||||
return this->execute(proxy, command, std::move(primary_keys), state, options, now);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Utility function for getting the schema of the materialized view used for
|
||||
// the secondary index implementation.
// The view is looked up in the base table's keyspace (schema->ks_name())
// under the name "<index name>_index". The same keyspace/table pair is also
// registered with the supplied trace state before the lookup.
// NOTE(review): what happens when no such table exists is decided by
// find_schema(), which is not visible here -- confirm before relying on it.
|
||||
static schema_ptr
|
||||
get_index_schema(service::storage_proxy& proxy,
|
||||
const secondary_index::index& index,
|
||||
const schema_ptr& schema,
|
||||
tracing::trace_state_ptr& trace_state)
|
||||
{
|
||||
const auto& im = index.metadata();
|
||||
// The backing view's table name is the index name with an "_index" suffix.
sstring index_table_name = im.name() + "_index";
|
||||
// Add the index table to the trace state alongside the lookup below.
tracing::add_table_name(trace_state, schema->ks_name(), index_table_name);
|
||||
// Resolve the view's schema through the proxy's local database instance.
return proxy.get_db().local().find_schema(schema->ks_name(), index_table_name);
|
||||
}
|
||||
|
||||
// Utility function for reading from the index view (get_index_view())
|
||||
// the posting-list for a particular value of the indexed column.
|
||||
// Remember a secondary index can only be created on a single column.
|
||||
template<typename KeyType>
|
||||
GCC6_CONCEPT(
|
||||
requires (std::is_same_v<KeyType, partition_key> || std::is_same_v<KeyType, clustering_key>)
|
||||
)
|
||||
static future<::shared_ptr<cql_transport::messages::result_message::rows>>
|
||||
static future<service::storage_proxy::coordinator_query_result>
|
||||
read_posting_list(service::storage_proxy& proxy,
|
||||
schema_ptr view_schema,
|
||||
schema_ptr base_schema,
|
||||
const secondary_index::index& index,
|
||||
::shared_ptr<restrictions::statement_restrictions> base_restrictions,
|
||||
const std::vector<::shared_ptr<restrictions::restrictions>>& index_restrictions,
|
||||
const query_options& options,
|
||||
int32_t limit,
|
||||
service::query_state& state,
|
||||
gc_clock::time_point now,
|
||||
db::timeout_clock::time_point timeout,
|
||||
cql3::cql_stats& stats)
|
||||
db::timeout_clock::time_point timeout)
|
||||
{
|
||||
dht::partition_range_vector partition_ranges;
|
||||
// FIXME: there should be only one index restriction for this index!
|
||||
// Perhaps even one index restriction entirely (do we support
|
||||
// intersection queries?).
|
||||
for (const auto& restrictions : base_restrictions->index_restrictions()) {
|
||||
const column_definition* cdef = base_schema->get_column_definition(to_bytes(index.target_column()));
|
||||
if (!cdef) {
|
||||
throw exceptions::invalid_request_exception("Indexed column not found in schema");
|
||||
}
|
||||
|
||||
bytes_opt value = restrictions->value_for(*cdef, options);
|
||||
if (value) {
|
||||
auto pk = partition_key::from_single_value(*view_schema, *value);
|
||||
auto dk = dht::global_partitioner().decorate_key(*view_schema, pk);
|
||||
auto range = dht::partition_range::make_singular(dk);
|
||||
partition_ranges.emplace_back(range);
|
||||
}
|
||||
for (const auto& restriction : index_restrictions) {
|
||||
auto pk = partition_key::from_optional_exploded(*view_schema, restriction->values(options));
|
||||
auto dk = dht::global_partitioner().decorate_key(*view_schema, pk);
|
||||
auto range = dht::partition_range::make_singular(dk);
|
||||
partition_ranges.emplace_back(range);
|
||||
}
|
||||
|
||||
partition_slice_builder partition_slice_builder{*view_schema};
|
||||
|
||||
if (!base_restrictions->has_partition_key_unrestricted_components()) {
|
||||
auto single_pk_restrictions = dynamic_pointer_cast<restrictions::single_column_primary_key_restrictions<partition_key>>(base_restrictions->get_partition_key_restrictions());
|
||||
// Only EQ restrictions on base partition key can be used in an index view query
|
||||
if (single_pk_restrictions && single_pk_restrictions->is_all_eq()) {
|
||||
auto clustering_restrictions = ::make_shared<restrictions::single_column_primary_key_restrictions<clustering_key_prefix>>(view_schema, *single_pk_restrictions);
|
||||
// Computed token column needs to be added to index view restrictions
|
||||
const column_definition& token_cdef = *view_schema->clustering_key_columns().begin();
|
||||
auto base_pk = partition_key::from_optional_exploded(*base_schema, base_restrictions->get_partition_key_restrictions()->values(options));
|
||||
bytes token_value = dht::global_partitioner().token_to_bytes(dht::global_partitioner().get_token(*base_schema, base_pk));
|
||||
auto token_restriction = ::make_shared<restrictions::single_column_restriction::EQ>(token_cdef, ::make_shared<cql3::constants::value>(cql3::raw_value::make_value(token_value)));
|
||||
clustering_restrictions->merge_with(token_restriction);
|
||||
|
||||
if (base_restrictions->get_clustering_columns_restrictions()->prefix_size() > 0) {
|
||||
auto single_ck_restrictions = dynamic_pointer_cast<restrictions::single_column_primary_key_restrictions<clustering_key>>(base_restrictions->get_clustering_columns_restrictions());
|
||||
if (single_ck_restrictions) {
|
||||
auto prefix_restrictions = single_ck_restrictions->get_longest_prefix_restrictions();
|
||||
auto clustering_restrictions_from_base = ::make_shared<restrictions::single_column_primary_key_restrictions<clustering_key_prefix>>(view_schema, *prefix_restrictions);
|
||||
for (auto restriction_it : clustering_restrictions_from_base->restrictions()) {
|
||||
clustering_restrictions->merge_with(restriction_it.second);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
partition_slice_builder.with_ranges(clustering_restrictions->bounds_ranges(options));
|
||||
}
|
||||
}
|
||||
|
||||
auto partition_slice = partition_slice_builder.build();
|
||||
auto cmd = ::make_lw_shared<query::read_command>(
|
||||
view_schema->id(),
|
||||
view_schema->version(),
|
||||
partition_slice,
|
||||
partition_slice_builder.build(),
|
||||
limit,
|
||||
now,
|
||||
tracing::make_trace_info(state.get_trace_state()),
|
||||
query::max_partitions,
|
||||
utils::UUID(),
|
||||
options.get_timestamp(state));
|
||||
|
||||
std::vector<const column_definition*> columns;
|
||||
for (const column_definition& cdef : base_schema->partition_key_columns()) {
|
||||
columns.emplace_back(view_schema->get_column_definition(cdef.name()));
|
||||
}
|
||||
if constexpr (std::is_same_v<KeyType, clustering_key>) {
|
||||
for (const column_definition& cdef : base_schema->clustering_key_columns()) {
|
||||
columns.emplace_back(view_schema->get_column_definition(cdef.name()));
|
||||
}
|
||||
}
|
||||
auto selection = selection::selection::for_columns(view_schema, columns);
|
||||
|
||||
int32_t page_size = options.get_page_size();
|
||||
if (page_size <= 0 || !service::pager::query_pagers::may_need_paging(*view_schema, page_size, *cmd, partition_ranges)) {
|
||||
stats.unpaged_select_queries += 1;
|
||||
return proxy.query(view_schema, cmd, std::move(partition_ranges), options.get_consistency(), {timeout, state.get_trace_state()})
|
||||
.then([base_schema, view_schema, now, &options, selection = std::move(selection), partition_slice = std::move(partition_slice)] (service::storage_proxy::coordinator_query_result qr) {
|
||||
cql3::selection::result_set_builder builder(*selection, now, options.get_cql_serialization_format());
|
||||
query::result_view::consume(*qr.query_result,
|
||||
std::move(partition_slice),
|
||||
cql3::selection::result_set_builder::visitor(builder, *view_schema, *selection));
|
||||
return ::make_shared<cql_transport::messages::result_message::rows>(std::move(result(builder.build())));
|
||||
});
|
||||
}
|
||||
|
||||
auto p = service::pager::query_pagers::pager(view_schema, selection,
|
||||
state, options, cmd, std::move(partition_ranges), stats, nullptr);
|
||||
return p->fetch_page(options.get_page_size(), now, timeout).then([p, &options, limit, now] (std::unique_ptr<cql3::result_set> rs) {
|
||||
rs->get_metadata().set_paging_state(p->state());
|
||||
return ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
|
||||
});
|
||||
return proxy.query(view_schema,
|
||||
cmd,
|
||||
std::move(partition_ranges),
|
||||
options.get_consistency(),
|
||||
{timeout, state.get_trace_state()});
|
||||
}
|
||||
|
||||
// Note: the partition keys returned by this function are sorted
|
||||
// in token order. See issue #3423.
|
||||
future<dht::partition_range_vector, ::shared_ptr<const service::pager::paging_state>>
|
||||
future<dht::partition_range_vector>
|
||||
indexed_table_select_statement::find_index_partition_ranges(service::storage_proxy& proxy,
|
||||
service::query_state& state,
|
||||
const query_options& options)
|
||||
{
|
||||
schema_ptr view = get_index_schema(proxy, _index, _schema, state.get_trace_state());
|
||||
auto now = gc_clock::now();
|
||||
auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
|
||||
return read_posting_list<partition_key>(proxy, _view_schema, _schema, _index, _restrictions, options, get_limit(options), state, now, timeout, _stats).then(
|
||||
[this, now, &options] (::shared_ptr<cql_transport::messages::result_message::rows> rows) {
|
||||
auto rs = cql3::untyped_result_set(rows);
|
||||
return read_posting_list(proxy, view, _restrictions->index_restrictions(), options, get_limit(options), state, now, timeout).then(
|
||||
[this, now, &options, view] (service::storage_proxy::coordinator_query_result qr) {
|
||||
std::vector<const column_definition*> columns;
|
||||
for (const column_definition& cdef : _schema->partition_key_columns()) {
|
||||
columns.emplace_back(view->get_column_definition(cdef.name()));
|
||||
}
|
||||
auto selection = selection::selection::for_columns(view, columns);
|
||||
cql3::selection::result_set_builder builder(*selection, now, options.get_cql_serialization_format());
|
||||
// FIXME: read_posting_list already asks to read primary keys only.
|
||||
// why do we need to specify this again?
|
||||
auto slice = partition_slice_builder(*view).build();
|
||||
query::result_view::consume(*qr.query_result,
|
||||
slice,
|
||||
cql3::selection::result_set_builder::visitor(builder, *view, *selection));
|
||||
auto rs = cql3::untyped_result_set(::make_shared<cql_transport::messages::result_message::rows>(std::move(result(builder.build()))));
|
||||
dht::partition_range_vector partition_ranges;
|
||||
partition_ranges.reserve(rs.size());
|
||||
// We are reading the list of primary keys as rows of a single
|
||||
@@ -1166,22 +842,36 @@ indexed_table_select_statement::find_index_partition_ranges(service::storage_pro
|
||||
auto range = dht::partition_range::make_singular(dk);
|
||||
partition_ranges.emplace_back(range);
|
||||
}
|
||||
auto paging_state = rows->rs().get_metadata().paging_state();
|
||||
return make_ready_future<dht::partition_range_vector, ::shared_ptr<const service::pager::paging_state>>(std::move(partition_ranges), std::move(paging_state));
|
||||
return partition_ranges;
|
||||
});
|
||||
}
|
||||
|
||||
// Note: the partition keys returned by this function are sorted
|
||||
// in token order. See issue #3423.
|
||||
future<std::vector<indexed_table_select_statement::primary_key>, ::shared_ptr<const service::pager::paging_state>>
|
||||
future<std::vector<indexed_table_select_statement::primary_key>>
|
||||
indexed_table_select_statement::find_index_clustering_rows(service::storage_proxy& proxy, service::query_state& state, const query_options& options)
|
||||
{
|
||||
schema_ptr view = get_index_schema(proxy, _index, _schema, state.get_trace_state());
|
||||
auto now = gc_clock::now();
|
||||
auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
|
||||
return read_posting_list<clustering_key>(proxy, _view_schema, _schema, _index, _restrictions, options, get_limit(options), state, now, timeout, _stats).then(
|
||||
[this, now, &options] (::shared_ptr<cql_transport::messages::result_message::rows> rows) {
|
||||
|
||||
auto rs = cql3::untyped_result_set(rows);
|
||||
return read_posting_list(proxy, view, _restrictions->index_restrictions(), options, get_limit(options), state, now, timeout).then(
|
||||
[this, now, &options, view] (service::storage_proxy::coordinator_query_result qr) {
|
||||
std::vector<const column_definition*> columns;
|
||||
for (const column_definition& cdef : _schema->partition_key_columns()) {
|
||||
columns.emplace_back(view->get_column_definition(cdef.name()));
|
||||
}
|
||||
for (const column_definition& cdef : _schema->clustering_key_columns()) {
|
||||
columns.emplace_back(view->get_column_definition(cdef.name()));
|
||||
}
|
||||
auto selection = selection::selection::for_columns(view, columns);
|
||||
cql3::selection::result_set_builder builder(*selection, now, options.get_cql_serialization_format());
|
||||
// FIXME: read_posting_list already asks to read primary keys only.
|
||||
// why do we need to specify this again?
|
||||
auto slice = partition_slice_builder(*view).build();
|
||||
query::result_view::consume(*qr.query_result,
|
||||
slice,
|
||||
cql3::selection::result_set_builder::visitor(builder, *view, *selection));
|
||||
auto rs = cql3::untyped_result_set(::make_shared<cql_transport::messages::result_message::rows>(result(builder.build())));
|
||||
std::vector<primary_key> primary_keys;
|
||||
primary_keys.reserve(rs.size());
|
||||
for (size_t i = 0; i < rs.size(); i++) {
|
||||
@@ -1197,8 +887,7 @@ indexed_table_select_statement::find_index_clustering_rows(service::storage_prox
|
||||
auto ck = clustering_key::from_range(ck_columns);
|
||||
primary_keys.emplace_back(primary_key{std::move(dk), std::move(ck)});
|
||||
}
|
||||
auto paging_state = rows->rs().get_metadata().paging_state();
|
||||
return make_ready_future<std::vector<indexed_table_select_statement::primary_key>, ::shared_ptr<const service::pager::paging_state>>(std::move(primary_keys), std::move(paging_state));
|
||||
return primary_keys;
|
||||
});
|
||||
}
|
||||
|
||||
@@ -1264,7 +953,7 @@ std::unique_ptr<prepared_statement> select_statement::prepare(database& db, cql_
|
||||
? selection::selection::wildcard(schema)
|
||||
: selection::selection::from_selectors(db, schema, _select_clause);
|
||||
|
||||
auto restrictions = prepare_restrictions(db, schema, bound_names, selection, for_view, _parameters->allow_filtering());
|
||||
auto restrictions = prepare_restrictions(db, schema, bound_names, selection, for_view);
|
||||
|
||||
if (_parameters->is_distinct()) {
|
||||
validate_distinct_selection(schema, selection, restrictions);
|
||||
@@ -1281,7 +970,10 @@ std::unique_ptr<prepared_statement> select_statement::prepare(database& db, cql_
|
||||
}
|
||||
|
||||
check_needs_filtering(restrictions);
|
||||
ensure_filtering_columns_retrieval(db, selection, restrictions);
|
||||
size_t restrictions_size = restrictions->get_partition_key_restrictions()->size() + restrictions->get_clustering_columns_restrictions()->size() + restrictions->get_non_pk_restriction().size();
|
||||
if (restrictions->uses_secondary_indexing() && restrictions_size > 1) {
|
||||
throw exceptions::invalid_request_exception("Indexed query may not contain multiple restrictions in 2.3");
|
||||
}
|
||||
|
||||
::shared_ptr<cql3::statements::select_statement> stmt;
|
||||
if (restrictions->uses_secondary_indexing()) {
|
||||
@@ -1319,14 +1011,13 @@ select_statement::prepare_restrictions(database& db,
|
||||
schema_ptr schema,
|
||||
::shared_ptr<variable_specifications> bound_names,
|
||||
::shared_ptr<selection::selection> selection,
|
||||
bool for_view,
|
||||
bool allow_filtering)
|
||||
bool for_view)
|
||||
{
|
||||
try {
|
||||
// FIXME: this method should take a separate allow_filtering parameter
|
||||
// and pass it on. Currently we pass "for_view" as allow_filtering.
|
||||
return ::make_shared<restrictions::statement_restrictions>(db, schema, statement_type::SELECT, std::move(_where_clause), bound_names,
|
||||
selection->contains_only_static_columns(), selection->contains_a_collection(), for_view, allow_filtering);
|
||||
selection->contains_only_static_columns(), selection->contains_a_collection(), for_view, for_view);
|
||||
} catch (const exceptions::unrecognized_entity_exception& e) {
|
||||
if (contains_alias(e.entity)) {
|
||||
throw exceptions::invalid_request_exception(sprint("Aliases aren't allowed in the where clause ('%s')", e.relation->to_string()));
|
||||
@@ -1420,7 +1111,7 @@ select_statement::get_ordering_comparator(schema_ptr schema,
|
||||
}
|
||||
auto index = selection->index_of(*def);
|
||||
if (index < 0) {
|
||||
index = selection->add_column_for_post_processing(*def);
|
||||
index = selection->add_column_for_ordering(*def);
|
||||
}
|
||||
|
||||
sorters.emplace_back(index, def->type);
|
||||
@@ -1507,23 +1198,6 @@ void select_statement::check_needs_filtering(::shared_ptr<restrictions::statemen
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds columns that are needed for the purpose of filtering to the selection.
|
||||
* The columns that are added to the selection are columns that
|
||||
* are needed for filtering on the coordinator but are not part of the selection.
|
||||
* The columns are added with a meta-data indicating they are not to be returned
|
||||
* to the user.
|
||||
*/
|
||||
void select_statement::ensure_filtering_columns_retrieval(database& db,
|
||||
::shared_ptr<selection::selection> selection,
|
||||
::shared_ptr<restrictions::statement_restrictions> restrictions) {
|
||||
for (auto&& cdef : restrictions->get_column_defs_for_filtering(db)) {
|
||||
if (!selection->has_column(*cdef)) {
|
||||
selection->add_column_for_post_processing(*cdef);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool select_statement::contains_alias(::shared_ptr<column_identifier> name) {
|
||||
return std::any_of(_select_clause.begin(), _select_clause.end(), [name] (auto raw) {
|
||||
return raw->alias && *name == *raw->alias;
|
||||
|
||||
@@ -67,8 +67,8 @@ class select_statement : public cql_statement {
|
||||
public:
|
||||
using parameters = raw::select_statement::parameters;
|
||||
using ordering_comparator_type = raw::select_statement::ordering_comparator_type;
|
||||
static constexpr int DEFAULT_COUNT_PAGE_SIZE = 10000;
|
||||
protected:
|
||||
static constexpr int DEFAULT_COUNT_PAGE_SIZE = 10000;
|
||||
static thread_local const ::shared_ptr<parameters> _default_parameters;
|
||||
schema_ptr _schema;
|
||||
uint32_t _bound_terms;
|
||||
@@ -126,6 +126,14 @@ public:
|
||||
clustering_key_prefix clustering;
|
||||
};
|
||||
|
||||
future<::shared_ptr<cql_transport::messages::result_message>> execute(
|
||||
service::storage_proxy& proxy,
|
||||
lw_shared_ptr<query::read_command> cmd,
|
||||
std::vector<primary_key>&& primary_keys,
|
||||
service::query_state& state,
|
||||
const query_options& options,
|
||||
gc_clock::time_point now);
|
||||
|
||||
shared_ptr<cql_transport::messages::result_message> process_results(foreign_ptr<lw_shared_ptr<query::result>> results,
|
||||
lw_shared_ptr<query::read_command> cmd, const query_options& options, gc_clock::time_point now);
|
||||
|
||||
@@ -160,7 +168,6 @@ public:
|
||||
|
||||
class indexed_table_select_statement : public select_statement {
|
||||
secondary_index::index _index;
|
||||
schema_ptr _view_schema;
|
||||
public:
|
||||
static ::shared_ptr<cql3::statements::select_statement> prepare(database& db,
|
||||
schema_ptr schema,
|
||||
@@ -182,80 +189,24 @@ public:
|
||||
ordering_comparator_type ordering_comparator,
|
||||
::shared_ptr<term> limit,
|
||||
cql_stats &stats,
|
||||
const secondary_index::index& index,
|
||||
schema_ptr view_schema);
|
||||
const secondary_index::index& index);
|
||||
|
||||
private:
|
||||
static stdx::optional<secondary_index::index> find_idx(database& db,
|
||||
schema_ptr schema,
|
||||
::shared_ptr<restrictions::statement_restrictions> restrictions);
|
||||
|
||||
virtual future<::shared_ptr<cql_transport::messages::result_message>> do_execute(service::storage_proxy& proxy,
|
||||
service::query_state& state, const query_options& options) override;
|
||||
|
||||
::shared_ptr<const service::pager::paging_state> generate_view_paging_state_from_base_query_results(::shared_ptr<const service::pager::paging_state> paging_state,
|
||||
const foreign_ptr<lw_shared_ptr<query::result>>& results, service::storage_proxy& proxy, service::query_state& state, const query_options& options) const;
|
||||
|
||||
future<dht::partition_range_vector, ::shared_ptr<const service::pager::paging_state>> find_index_partition_ranges(service::storage_proxy& proxy,
|
||||
future<dht::partition_range_vector> find_index_partition_ranges(service::storage_proxy& proxy,
|
||||
service::query_state& state,
|
||||
const query_options& options);
|
||||
|
||||
future<std::vector<primary_key>, ::shared_ptr<const service::pager::paging_state>> find_index_clustering_rows(service::storage_proxy& proxy,
|
||||
future<std::vector<primary_key>> find_index_clustering_rows(service::storage_proxy& proxy,
|
||||
service::query_state& state,
|
||||
const query_options& options);
|
||||
|
||||
shared_ptr<cql_transport::messages::result_message>
|
||||
process_base_query_results(
|
||||
foreign_ptr<lw_shared_ptr<query::result>> results,
|
||||
lw_shared_ptr<query::read_command> cmd,
|
||||
service::storage_proxy& proxy,
|
||||
service::query_state& state,
|
||||
const query_options& options,
|
||||
gc_clock::time_point now,
|
||||
::shared_ptr<const service::pager::paging_state> paging_state);
|
||||
|
||||
lw_shared_ptr<query::read_command>
|
||||
prepare_command_for_base_query(const query_options& options, service::query_state& state, gc_clock::time_point now, bool use_paging);
|
||||
|
||||
future<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>
|
||||
do_execute_base_query(
|
||||
service::storage_proxy& proxy,
|
||||
dht::partition_range_vector&& partition_ranges,
|
||||
service::query_state& state,
|
||||
const query_options& options,
|
||||
gc_clock::time_point now,
|
||||
::shared_ptr<const service::pager::paging_state> paging_state);
|
||||
future<shared_ptr<cql_transport::messages::result_message>>
|
||||
execute_base_query(
|
||||
service::storage_proxy& proxy,
|
||||
dht::partition_range_vector&& partition_ranges,
|
||||
service::query_state& state,
|
||||
const query_options& options,
|
||||
gc_clock::time_point now,
|
||||
::shared_ptr<const service::pager::paging_state> paging_state);
|
||||
|
||||
// Function for fetching the selected columns from a list of clustering rows.
|
||||
// It is currently used only in our Secondary Index implementation - ordinary
|
||||
// CQL SELECT statements do not have the syntax to request a list of rows.
|
||||
// FIXME: The current implementation is very inefficient - it requests each
|
||||
// row separately (and, incrementally, in parallel). Even multiple rows from a single
|
||||
// partition are requested separately. This last case can be easily improved,
|
||||
// but to implement the general case (multiple rows from multiple partitions)
|
||||
// efficiently, we will need more support from other layers.
|
||||
// Keys are ordered in token order (see #3423)
|
||||
future<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>
|
||||
do_execute_base_query(
|
||||
service::storage_proxy& proxy,
|
||||
std::vector<primary_key>&& primary_keys,
|
||||
service::query_state& state,
|
||||
const query_options& options,
|
||||
gc_clock::time_point now,
|
||||
::shared_ptr<const service::pager::paging_state> paging_state);
|
||||
future<shared_ptr<cql_transport::messages::result_message>>
|
||||
execute_base_query(
|
||||
service::storage_proxy& proxy,
|
||||
std::vector<primary_key>&& primary_keys,
|
||||
service::query_state& state,
|
||||
const query_options& options,
|
||||
gc_clock::time_point now,
|
||||
::shared_ptr<const service::pager::paging_state> paging_state);
|
||||
|
||||
virtual void update_stats_rows_read(int64_t rows_read) override {
|
||||
_stats.rows_read += rows_read;
|
||||
_stats.secondary_index_rows_read += rows_read;
|
||||
|
||||
@@ -84,11 +84,8 @@ parse(const sstring& json_string, const std::vector<column_definition>& expected
|
||||
for (const auto& def : expected_receivers) {
|
||||
sstring cql_name = def.name_as_text();
|
||||
auto value_it = prepared_map.find(cql_name);
|
||||
if (value_it == prepared_map.end()) {
|
||||
continue;
|
||||
} else if (value_it->second.isNull()) {
|
||||
if (value_it == prepared_map.end() || value_it->second.isNull()) {
|
||||
json_map.emplace(std::move(cql_name), bytes_opt{});
|
||||
prepared_map.erase(value_it);
|
||||
} else {
|
||||
json_map.emplace(std::move(cql_name), def.type->from_json_object(value_it->second, sf));
|
||||
prepared_map.erase(value_it);
|
||||
@@ -175,9 +172,7 @@ void update_statement::add_update_for_key(mutation& m, const query::clustering_r
|
||||
}
|
||||
|
||||
modification_statement::json_cache_opt insert_prepared_json_statement::maybe_prepare_json_cache(const query_options& options) {
|
||||
sstring json_string = with_linearized(_term->bind_and_get(options).data().value(), [&] (bytes_view value) {
|
||||
return utf8_type->to_string(value.to_string());
|
||||
});
|
||||
sstring json_string = utf8_type->to_string(_term->bind_and_get(options).data().value().to_string());
|
||||
return json_helpers::parse(std::move(json_string), s->all_columns(), options.get_cql_serialization_format());
|
||||
}
|
||||
|
||||
@@ -200,20 +195,20 @@ insert_prepared_json_statement::execute_set_value(mutation& m, const clustering_
|
||||
m.set_cell(prefix, column, std::move(operation::make_dead_cell(params)));
|
||||
return;
|
||||
} else if (!column.type->is_collection()) {
|
||||
constants::setter::execute(m, prefix, params, column, raw_value_view::make_value(fragmented_temporary_buffer::view(*value)));
|
||||
constants::setter::execute(m, prefix, params, column, raw_value_view::make_value(bytes_view(*value)));
|
||||
return;
|
||||
}
|
||||
|
||||
auto& k = static_pointer_cast<const collection_type_impl>(column.type)->_kind;
|
||||
cql_serialization_format sf = params._options.get_cql_serialization_format();
|
||||
if (&k == &collection_type_impl::kind::list) {
|
||||
auto list_terminal = make_shared<lists::value>(lists::value::from_serialized(fragmented_temporary_buffer::view(*value), dynamic_pointer_cast<const list_type_impl>(column.type), sf));
|
||||
auto list_terminal = make_shared<lists::value>(lists::value::from_serialized(*value, dynamic_pointer_cast<const list_type_impl>(column.type), sf));
|
||||
lists::setter::execute(m, prefix, params, column, std::move(list_terminal));
|
||||
} else if (&k == &collection_type_impl::kind::set) {
|
||||
auto set_terminal = make_shared<sets::value>(sets::value::from_serialized(fragmented_temporary_buffer::view(*value), dynamic_pointer_cast<const set_type_impl>(column.type), sf));
|
||||
auto set_terminal = make_shared<sets::value>(sets::value::from_serialized(*value, dynamic_pointer_cast<const set_type_impl>(column.type), sf));
|
||||
sets::setter::execute(m, prefix, params, column, std::move(set_terminal));
|
||||
} else if (&k == &collection_type_impl::kind::map) {
|
||||
auto map_terminal = make_shared<maps::value>(maps::value::from_serialized(fragmented_temporary_buffer::view(*value), dynamic_pointer_cast<const map_type_impl>(column.type), sf));
|
||||
auto map_terminal = make_shared<maps::value>(maps::value::from_serialized(*value, dynamic_pointer_cast<const map_type_impl>(column.type), sf));
|
||||
maps::setter::execute(m, prefix, params, column, std::move(map_terminal));
|
||||
} else {
|
||||
throw exceptions::invalid_request_exception("Incorrect value kind in JSON INSERT statement");
|
||||
@@ -258,12 +253,8 @@ void insert_prepared_json_statement::execute_operations_for_key(mutation& m, con
|
||||
throw exceptions::invalid_request_exception(sprint("Cannot set the value of counter column %s in JSON", def.name_as_text()));
|
||||
}
|
||||
|
||||
auto it = json_cache->find(def.name_as_text());
|
||||
if (it != json_cache->end()) {
|
||||
execute_set_value(m, prefix, params, def, it->second);
|
||||
} else if (!_default_unset) {
|
||||
execute_set_value(m, prefix, params, def, bytes_opt{});
|
||||
}
|
||||
auto value = json_cache->at(def.name_as_text());
|
||||
execute_set_value(m, prefix, params, def, value);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -329,14 +320,12 @@ insert_statement::prepare_internal(database& db, schema_ptr schema,
|
||||
insert_json_statement::insert_json_statement( ::shared_ptr<cf_name> name,
|
||||
::shared_ptr<attributes::raw> attrs,
|
||||
::shared_ptr<term::raw> json_value,
|
||||
bool if_not_exists,
|
||||
bool default_unset)
|
||||
bool if_not_exists)
|
||||
: raw::modification_statement{name, attrs, conditions_vector{}, if_not_exists, false}
|
||||
, _name(name)
|
||||
, _attrs(attrs)
|
||||
, _json_value(json_value)
|
||||
, _if_not_exists(if_not_exists)
|
||||
, _default_unset(default_unset) { }
|
||||
, _if_not_exists(if_not_exists) { }
|
||||
|
||||
::shared_ptr<cql3::statements::modification_statement>
|
||||
insert_json_statement::prepare_internal(database& db, schema_ptr schema,
|
||||
@@ -346,7 +335,7 @@ insert_json_statement::prepare_internal(database& db, schema_ptr schema,
|
||||
auto json_column_placeholder = ::make_shared<column_identifier>("", true);
|
||||
auto prepared_json_value = _json_value->prepare(db, "", ::make_shared<column_specification>("", "", json_column_placeholder, utf8_type));
|
||||
prepared_json_value->collect_marker_specification(bound_names);
|
||||
return ::make_shared<cql3::statements::insert_prepared_json_statement>(bound_names->size(), schema, std::move(attrs), &stats.inserts, std::move(prepared_json_value), _default_unset);
|
||||
return ::make_shared<cql3::statements::insert_prepared_json_statement>(bound_names->size(), schema, std::move(attrs), &stats.inserts, std::move(prepared_json_value));
|
||||
}
|
||||
|
||||
update_statement::update_statement( ::shared_ptr<cf_name> name,
|
||||
|
||||
@@ -82,10 +82,9 @@ private:
|
||||
*/
|
||||
class insert_prepared_json_statement : public update_statement {
|
||||
::shared_ptr<term> _term;
|
||||
bool _default_unset;
|
||||
public:
|
||||
insert_prepared_json_statement(uint32_t bound_terms, schema_ptr s, std::unique_ptr<attributes> attrs, uint64_t* cql_stats_counter_ptr, ::shared_ptr<term> t, bool default_unset)
|
||||
: update_statement(statement_type::INSERT, bound_terms, s, std::move(attrs), cql_stats_counter_ptr), _term(t), _default_unset(default_unset) {
|
||||
insert_prepared_json_statement(uint32_t bound_terms, schema_ptr s, std::unique_ptr<attributes> attrs, uint64_t* cql_stats_counter_ptr, ::shared_ptr<term> t)
|
||||
: update_statement(statement_type::INSERT, bound_terms, s, std::move(attrs), cql_stats_counter_ptr), _term(t) {
|
||||
_restrictions = ::make_shared<restrictions::statement_restrictions>(s, false);
|
||||
}
|
||||
private:
|
||||
|
||||
@@ -36,17 +36,11 @@ struct cql_stats {
|
||||
uint64_t batches_pure_unlogged = 0;
|
||||
uint64_t batches_unlogged_from_logged = 0;
|
||||
uint64_t rows_read = 0;
|
||||
uint64_t reverse_queries = 0;
|
||||
uint64_t unpaged_select_queries = 0;
|
||||
|
||||
int64_t secondary_index_creates = 0;
|
||||
int64_t secondary_index_drops = 0;
|
||||
int64_t secondary_index_reads = 0;
|
||||
int64_t secondary_index_rows_read = 0;
|
||||
|
||||
int64_t filtered_reads = 0;
|
||||
int64_t filtered_rows_matched_total = 0;
|
||||
int64_t filtered_rows_read_total = 0;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@@ -54,7 +54,7 @@ public:
|
||||
column->ks_name,
|
||||
column->cf_name,
|
||||
::make_shared<column_identifier>(sprint("%s[%d]", column->name, component), true),
|
||||
static_pointer_cast<const tuple_type_impl>(column->type->underlying_type())->type(component));
|
||||
static_pointer_cast<const tuple_type_impl>(column->type)->type(component));
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -112,7 +112,7 @@ public:
|
||||
|
||||
private:
|
||||
void validate_assignable_to(database& db, const sstring& keyspace, shared_ptr<column_specification> receiver) {
|
||||
auto tt = dynamic_pointer_cast<const tuple_type_impl>(receiver->type->underlying_type());
|
||||
auto tt = dynamic_pointer_cast<const tuple_type_impl>(receiver->type);
|
||||
if (!tt) {
|
||||
throw exceptions::invalid_request_exception(sprint("Invalid tuple type literal for %s of type %s", receiver->name, receiver->type->as_cql3_type()));
|
||||
}
|
||||
@@ -159,10 +159,8 @@ public:
|
||||
_elements.push_back(e ? bytes_opt(bytes(e->begin(), e->size())) : bytes_opt());
|
||||
}
|
||||
}
|
||||
static value from_serialized(const fragmented_temporary_buffer::view& buffer, tuple_type type) {
|
||||
return with_linearized(buffer, [&] (bytes_view view) {
|
||||
return value(type->split(view));
|
||||
});
|
||||
static value from_serialized(bytes_view buffer, tuple_type type) {
|
||||
return value(type->split(buffer));
|
||||
}
|
||||
virtual cql3::raw_value get(const query_options& options) override {
|
||||
return cql3::raw_value::make_value(tuple_type_impl::build_value(_elements));
|
||||
@@ -253,29 +251,20 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
static in_value from_serialized(const fragmented_temporary_buffer::view& value_view, list_type type, const query_options& options) {
|
||||
static in_value from_serialized(bytes_view value, list_type type, const query_options& options) {
|
||||
try {
|
||||
// Collections have this small hack that validate cannot be called on a serialized object,
|
||||
// but the deserialization does the validation (so we're fine).
|
||||
return with_linearized(value_view, [&] (bytes_view value) {
|
||||
auto l = value_cast<list_type_impl::native_type>(type->deserialize(value, options.get_cql_serialization_format()));
|
||||
auto ttype = dynamic_pointer_cast<const tuple_type_impl>(type->get_elements_type());
|
||||
assert(ttype);
|
||||
|
||||
std::vector<std::vector<bytes_opt>> elements;
|
||||
std::vector<std::vector<bytes_view_opt>> elements;
|
||||
elements.reserve(l.size());
|
||||
for (auto&& element : l) {
|
||||
auto tuple_buff = ttype->decompose(element);
|
||||
auto tuple = ttype->split(tuple_buff);
|
||||
std::vector<bytes_opt> elems;
|
||||
elems.reserve(tuple.size());
|
||||
for (auto&& e : tuple) {
|
||||
elems.emplace_back(to_bytes_opt(e));
|
||||
}
|
||||
elements.emplace_back(std::move(elems));
|
||||
elements.emplace_back(ttype->split(ttype->decompose(element)));
|
||||
}
|
||||
return in_value(elements);
|
||||
});
|
||||
} catch (marshal_exception& e) {
|
||||
throw exceptions::invalid_request_exception(e.what());
|
||||
}
|
||||
|
||||
@@ -142,7 +142,7 @@ public:
|
||||
return atomic_cell::make_dead(_timestamp, _local_deletion_time);
|
||||
}
|
||||
|
||||
atomic_cell make_cell(const abstract_type& type, const fragmented_temporary_buffer::view& value, atomic_cell::collection_member cm = atomic_cell::collection_member::no) const {
|
||||
atomic_cell make_cell(const abstract_type& type, bytes_view value, atomic_cell::collection_member cm = atomic_cell::collection_member::no) const {
|
||||
auto ttl = _ttl;
|
||||
|
||||
if (ttl.count() <= 0) {
|
||||
@@ -156,10 +156,6 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
atomic_cell make_cell(const abstract_type& type, bytes_view value, atomic_cell::collection_member cm = atomic_cell::collection_member::no) const {
|
||||
return make_cell(type, fragmented_temporary_buffer::view(value), cm);
|
||||
}
|
||||
|
||||
atomic_cell make_counter_update_cell(int64_t delta) const {
|
||||
return atomic_cell::make_live_counter_update(_timestamp, delta);
|
||||
}
|
||||
|
||||
@@ -28,10 +28,6 @@
|
||||
|
||||
#include <experimental/optional>
|
||||
|
||||
#include <seastar/util/variant_utils.hh>
|
||||
|
||||
#include "utils/fragmented_temporary_buffer.hh"
|
||||
|
||||
namespace cql3 {
|
||||
|
||||
struct null_value {
|
||||
@@ -44,7 +40,7 @@ struct unset_value {
|
||||
///
|
||||
/// \see raw_value
|
||||
struct raw_value_view {
|
||||
boost::variant<fragmented_temporary_buffer::view, null_value, unset_value> _data;
|
||||
boost::variant<bytes_view, null_value, unset_value> _data;
|
||||
|
||||
raw_value_view(null_value&& data)
|
||||
: _data{std::move(data)}
|
||||
@@ -52,7 +48,10 @@ struct raw_value_view {
|
||||
raw_value_view(unset_value&& data)
|
||||
: _data{std::move(data)}
|
||||
{}
|
||||
raw_value_view(fragmented_temporary_buffer::view data)
|
||||
raw_value_view(bytes_view&& data)
|
||||
: _data{std::move(data)}
|
||||
{}
|
||||
raw_value_view(const bytes_view& data)
|
||||
: _data{data}
|
||||
{}
|
||||
public:
|
||||
@@ -62,7 +61,10 @@ public:
|
||||
static raw_value_view make_unset_value() {
|
||||
return raw_value_view{std::move(unset_value{})};
|
||||
}
|
||||
static raw_value_view make_value(fragmented_temporary_buffer::view view) {
|
||||
static raw_value_view make_value(bytes_view &&view) {
|
||||
return raw_value_view{std::move(view)};
|
||||
}
|
||||
static raw_value_view make_value(const bytes_view& view) {
|
||||
return raw_value_view{view};
|
||||
}
|
||||
bool is_null() const {
|
||||
@@ -74,47 +76,20 @@ public:
|
||||
bool is_value() const {
|
||||
return _data.which() == 0;
|
||||
}
|
||||
std::optional<fragmented_temporary_buffer::view> data() const {
|
||||
bytes_view_opt data() const {
|
||||
if (_data.which() == 0) {
|
||||
return boost::get<fragmented_temporary_buffer::view>(_data);
|
||||
return boost::get<bytes_view>(_data);
|
||||
}
|
||||
return {};
|
||||
}
|
||||
explicit operator bool() const {
|
||||
return _data.which() == 0;
|
||||
}
|
||||
const fragmented_temporary_buffer::view* operator->() const {
|
||||
return &boost::get<fragmented_temporary_buffer::view>(_data);
|
||||
const bytes_view* operator->() const {
|
||||
return &boost::get<bytes_view>(_data);
|
||||
}
|
||||
const fragmented_temporary_buffer::view& operator*() const {
|
||||
return boost::get<fragmented_temporary_buffer::view>(_data);
|
||||
}
|
||||
|
||||
bool operator==(const raw_value_view& other) const {
|
||||
if (_data.which() != other._data.which()) {
|
||||
return false;
|
||||
}
|
||||
if (is_value() && **this != *other) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
bool operator!=(const raw_value_view& other) const {
|
||||
return !(*this == other);
|
||||
}
|
||||
|
||||
friend std::ostream& operator<<(std::ostream& os, const raw_value_view& value) {
|
||||
seastar::visit(value._data, [&] (fragmented_temporary_buffer::view v) {
|
||||
os << "{ value: ";
|
||||
using boost::range::for_each;
|
||||
for_each(v, [&os] (bytes_view bv) { os << bv; });
|
||||
os << " }";
|
||||
}, [&] (null_value) {
|
||||
os << "{ null }";
|
||||
}, [&] (unset_value) {
|
||||
os << "{ unset }";
|
||||
});
|
||||
return os;
|
||||
const bytes_view& operator*() const {
|
||||
return boost::get<bytes_view>(_data);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -152,7 +127,7 @@ public:
|
||||
if (view.is_unset_value()) {
|
||||
return make_unset_value();
|
||||
}
|
||||
return make_value(linearized(*view));
|
||||
return make_value(to_bytes(*view));
|
||||
}
|
||||
static raw_value make_value(bytes&& bytes) {
|
||||
return raw_value{std::move(bytes)};
|
||||
@@ -192,7 +167,7 @@ public:
|
||||
}
|
||||
raw_value_view to_view() const {
|
||||
switch (_data.which()) {
|
||||
case 0: return raw_value_view::make_value(fragmented_temporary_buffer::view(bytes_view{boost::get<bytes>(_data)}));
|
||||
case 0: return raw_value_view::make_value(bytes_view{boost::get<bytes>(_data)});
|
||||
case 1: return raw_value_view::make_null();
|
||||
default: return raw_value_view::make_unset_value();
|
||||
}
|
||||
@@ -201,19 +176,10 @@ public:
|
||||
|
||||
}
|
||||
|
||||
inline bytes to_bytes(const cql3::raw_value_view& view)
|
||||
{
|
||||
return linearized(*view);
|
||||
}
|
||||
|
||||
inline bytes_opt to_bytes_opt(const cql3::raw_value_view& view) {
|
||||
auto buffer_view = view.data();
|
||||
if (buffer_view) {
|
||||
return bytes_opt(linearized(*buffer_view));
|
||||
}
|
||||
return bytes_opt();
|
||||
return to_bytes_opt(view.data());
|
||||
}
|
||||
|
||||
inline bytes_opt to_bytes_opt(const cql3::raw_value& value) {
|
||||
return to_bytes_opt(value.to_view());
|
||||
return value.data();
|
||||
}
|
||||
|
||||
703
database.cc
703
database.cc
File diff suppressed because it is too large
Load Diff
114
database.hh
114
database.hh
@@ -77,7 +77,6 @@
|
||||
#include <seastar/core/metrics_registration.hh>
|
||||
#include "tracing/trace_state.hh"
|
||||
#include "db/view/view.hh"
|
||||
#include "db/view/view_update_backlog.hh"
|
||||
#include "db/view/row_locking.hh"
|
||||
#include "lister.hh"
|
||||
#include "utils/phased_barrier.hh"
|
||||
@@ -165,33 +164,29 @@ private:
|
||||
std::function<schema_ptr()> _current_schema;
|
||||
dirty_memory_manager* _dirty_memory_manager;
|
||||
std::experimental::optional<shared_promise<>> _flush_coalescing;
|
||||
seastar::scheduling_group _compaction_scheduling_group;
|
||||
public:
|
||||
memtable_list(
|
||||
seal_immediate_fn_type seal_immediate_fn,
|
||||
seal_delayed_fn_type seal_delayed_fn,
|
||||
std::function<schema_ptr()> cs,
|
||||
dirty_memory_manager* dirty_memory_manager,
|
||||
seastar::scheduling_group compaction_scheduling_group = seastar::current_scheduling_group())
|
||||
dirty_memory_manager* dirty_memory_manager)
|
||||
: _memtables({})
|
||||
, _seal_immediate_fn(seal_immediate_fn)
|
||||
, _seal_delayed_fn(seal_delayed_fn)
|
||||
, _current_schema(cs)
|
||||
, _dirty_memory_manager(dirty_memory_manager)
|
||||
, _compaction_scheduling_group(compaction_scheduling_group) {
|
||||
, _dirty_memory_manager(dirty_memory_manager) {
|
||||
add_memtable();
|
||||
}
|
||||
|
||||
memtable_list(
|
||||
seal_immediate_fn_type seal_immediate_fn,
|
||||
std::function<schema_ptr()> cs,
|
||||
dirty_memory_manager* dirty_memory_manager,
|
||||
seastar::scheduling_group compaction_scheduling_group = seastar::current_scheduling_group())
|
||||
: memtable_list(std::move(seal_immediate_fn), {}, std::move(cs), dirty_memory_manager, compaction_scheduling_group) {
|
||||
dirty_memory_manager* dirty_memory_manager)
|
||||
: memtable_list(std::move(seal_immediate_fn), {}, std::move(cs), dirty_memory_manager) {
|
||||
}
|
||||
|
||||
memtable_list(std::function<schema_ptr()> cs, dirty_memory_manager* dirty_memory_manager, seastar::scheduling_group compaction_scheduling_group = seastar::current_scheduling_group())
|
||||
: memtable_list({}, {}, std::move(cs), dirty_memory_manager, compaction_scheduling_group) {
|
||||
memtable_list(std::function<schema_ptr()> cs, dirty_memory_manager* dirty_memory_manager)
|
||||
: memtable_list({}, {}, std::move(cs), dirty_memory_manager) {
|
||||
}
|
||||
|
||||
bool may_flush() const {
|
||||
@@ -280,9 +275,6 @@ struct cf_stats {
|
||||
int64_t clustering_filter_fast_path_count = 0;
|
||||
// how many sstables survived the clustering key checks
|
||||
int64_t surviving_sstables_after_clustering_filter = 0;
|
||||
|
||||
// How many view updates were dropped due to overload.
|
||||
int64_t dropped_view_updates = 0;
|
||||
};
|
||||
|
||||
class cache_temperature {
|
||||
@@ -307,7 +299,6 @@ class database_sstable_write_monitor;
|
||||
class table : public enable_lw_shared_from_this<table> {
|
||||
public:
|
||||
struct config {
|
||||
std::vector<sstring> all_datadirs;
|
||||
sstring datadir;
|
||||
bool enable_disk_writes = true;
|
||||
bool enable_disk_reads = true;
|
||||
@@ -323,13 +314,11 @@ public:
|
||||
seastar::scheduling_group memtable_scheduling_group;
|
||||
seastar::scheduling_group memtable_to_cache_scheduling_group;
|
||||
seastar::scheduling_group compaction_scheduling_group;
|
||||
seastar::scheduling_group memory_compaction_scheduling_group;
|
||||
seastar::scheduling_group statement_scheduling_group;
|
||||
seastar::scheduling_group streaming_scheduling_group;
|
||||
bool enable_metrics_reporting = false;
|
||||
db::large_partition_handler* large_partition_handler;
|
||||
db::timeout_semaphore* view_update_concurrency_semaphore;
|
||||
size_t view_update_concurrency_semaphore_limit;
|
||||
};
|
||||
struct no_commitlog {};
|
||||
struct stats {
|
||||
@@ -439,15 +428,11 @@ private:
|
||||
// but for correct compaction we need to start the compaction only after
|
||||
// reading all sstables.
|
||||
std::unordered_map<uint64_t, sstables::shared_sstable> _sstables_need_rewrite;
|
||||
// sstables that should not be compacted (e.g. because they need to be used
|
||||
// to generate view updates later)
|
||||
std::unordered_map<uint64_t, sstables::shared_sstable> _sstables_staging;
|
||||
// Control background fibers waiting for sstables to be deleted
|
||||
seastar::gate _sstable_deletion_gate;
|
||||
// This semaphore ensures that an operation like snapshot won't have its selected
|
||||
// sstables deleted by compaction in parallel, a race condition which could
|
||||
// easily result in failure.
|
||||
// Locking order: must be acquired either independently or after _sstables_lock
|
||||
seastar::semaphore _sstable_deletion_sem = {1};
|
||||
// There are situations in which we need to stop writing sstables. Flushers will take
|
||||
// the read lock, and the ones that wish to stop that process will take the write lock.
|
||||
@@ -498,13 +483,6 @@ private:
|
||||
utils::phased_barrier _pending_writes_phaser;
|
||||
// Corresponding phaser for in-progress reads.
|
||||
utils::phased_barrier _pending_reads_phaser;
|
||||
public:
|
||||
future<> add_sstable_and_update_cache(sstables::shared_sstable sst);
|
||||
void move_sstable_from_staging_in_thread(sstables::shared_sstable sst);
|
||||
sstables::shared_sstable get_staging_sstable(uint64_t generation) {
|
||||
auto it = _sstables_staging.find(generation);
|
||||
return it != _sstables_staging.end() ? it->second : nullptr;
|
||||
}
|
||||
private:
|
||||
void update_stats_for_new_sstable(uint64_t disk_space_used_by_sstable, const std::vector<unsigned>& shards_for_the_sstable) noexcept;
|
||||
// Adds new sstable to the set of sstables
|
||||
@@ -557,7 +535,7 @@ private:
|
||||
void rebuild_statistics();
|
||||
|
||||
// This function replaces new sstables by their ancestors, which are sstables that needed resharding.
|
||||
void replace_ancestors_needed_rewrite(std::unordered_set<uint64_t> ancestors, std::vector<sstables::shared_sstable> new_sstables);
|
||||
void replace_ancestors_needed_rewrite(std::vector<sstables::shared_sstable> new_sstables);
|
||||
void remove_ancestors_needed_rewrite(std::unordered_set<uint64_t> ancestors);
|
||||
private:
|
||||
mutation_source_opt _virtual_reader;
|
||||
@@ -638,14 +616,6 @@ public:
|
||||
tracing::trace_state_ptr trace_state = nullptr,
|
||||
streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no,
|
||||
mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes) const;
|
||||
flat_mutation_reader make_reader_excluding_sstable(schema_ptr schema,
|
||||
sstables::shared_sstable sst,
|
||||
const dht::partition_range& range,
|
||||
const query::partition_slice& slice,
|
||||
const io_priority_class& pc = default_priority_class(),
|
||||
tracing::trace_state_ptr trace_state = nullptr,
|
||||
streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no,
|
||||
mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes) const;
|
||||
|
||||
flat_mutation_reader make_reader(schema_ptr schema, const dht::partition_range& range = query::full_partition_range) const {
|
||||
auto& full_slice = schema->full_slice();
|
||||
@@ -660,13 +630,7 @@ public:
|
||||
flat_mutation_reader make_streaming_reader(schema_ptr schema,
|
||||
const dht::partition_range_vector& ranges) const;
|
||||
|
||||
sstables::shared_sstable make_streaming_sstable_for_write(std::optional<sstring> subdir = {});
|
||||
sstables::shared_sstable make_streaming_staging_sstable() {
|
||||
return make_streaming_sstable_for_write("staging");
|
||||
}
|
||||
|
||||
mutation_source as_mutation_source() const;
|
||||
mutation_source as_mutation_source_excluding(sstables::shared_sstable sst) const;
|
||||
|
||||
void set_virtual_reader(mutation_source virtual_reader) {
|
||||
_virtual_reader = std::move(virtual_reader);
|
||||
@@ -720,7 +684,7 @@ public:
|
||||
query::result_memory_limiter& memory_limiter,
|
||||
uint64_t max_result_size,
|
||||
db::timeout_clock::time_point timeout = db::no_timeout,
|
||||
query::querier_cache_context cache_ctx = { });
|
||||
querier_cache_context cache_ctx = { });
|
||||
|
||||
void start();
|
||||
future<> stop();
|
||||
@@ -738,7 +702,13 @@ public:
|
||||
|
||||
// SSTable writes are now allowed again, and generation is updated to new_generation if != -1
|
||||
// returns the amount of microseconds elapsed since we disabled writes.
|
||||
std::chrono::steady_clock::duration enable_sstable_write(int64_t new_generation);
|
||||
std::chrono::steady_clock::duration enable_sstable_write(int64_t new_generation) {
|
||||
if (new_generation != -1) {
|
||||
update_sstables_known_generation(new_generation);
|
||||
}
|
||||
_sstables_lock.write_unlock();
|
||||
return std::chrono::steady_clock::now() - _sstable_writes_disabled_at;
|
||||
}
|
||||
|
||||
// Make sure the generation numbers are sequential, starting from "start".
|
||||
// Generations before "start" are left untouched.
|
||||
@@ -868,8 +838,6 @@ public:
|
||||
void clear_views();
|
||||
const std::vector<view_ptr>& views() const;
|
||||
future<row_locker::lock_holder> push_view_replica_updates(const schema_ptr& s, const frozen_mutation& fm, db::timeout_clock::time_point timeout) const;
|
||||
future<row_locker::lock_holder> push_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout) const;
|
||||
future<row_locker::lock_holder> stream_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout, sstables::shared_sstable excluded_sstable) const;
|
||||
void add_coordinator_read_latency(utils::estimated_histogram::duration latency);
|
||||
std::chrono::milliseconds get_coordinator_read_latency_percentile(double percentile);
|
||||
|
||||
@@ -887,17 +855,13 @@ public:
|
||||
dht::token base_token,
|
||||
flat_mutation_reader&&);
|
||||
|
||||
reader_concurrency_semaphore& read_concurrency_semaphore() {
|
||||
return *_config.read_concurrency_semaphore;
|
||||
}
|
||||
|
||||
private:
|
||||
future<row_locker::lock_holder> do_push_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout, mutation_source&& source, const io_priority_class& io_priority) const;
|
||||
std::vector<view_ptr> affected_views(const schema_ptr& base, const mutation& update) const;
|
||||
future<> generate_and_propagate_view_updates(const schema_ptr& base,
|
||||
std::vector<view_ptr>&& views,
|
||||
mutation&& m,
|
||||
flat_mutation_reader_opt existings) const;
|
||||
flat_mutation_reader_opt existings,
|
||||
db::timeout_clock::time_point timeout) const;
|
||||
|
||||
mutable row_locker _row_locker;
|
||||
future<row_locker::lock_holder> local_base_lock(
|
||||
@@ -1066,7 +1030,6 @@ public:
|
||||
class keyspace {
|
||||
public:
|
||||
struct config {
|
||||
std::vector<sstring> all_datadirs;
|
||||
sstring datadir;
|
||||
bool enable_commitlog = true;
|
||||
bool enable_disk_reads = true;
|
||||
@@ -1082,12 +1045,10 @@ public:
|
||||
seastar::scheduling_group memtable_scheduling_group;
|
||||
seastar::scheduling_group memtable_to_cache_scheduling_group;
|
||||
seastar::scheduling_group compaction_scheduling_group;
|
||||
seastar::scheduling_group memory_compaction_scheduling_group;
|
||||
seastar::scheduling_group statement_scheduling_group;
|
||||
seastar::scheduling_group streaming_scheduling_group;
|
||||
bool enable_metrics_reporting = false;
|
||||
db::timeout_semaphore* view_update_concurrency_semaphore = nullptr;
|
||||
size_t view_update_concurrency_semaphore_limit;
|
||||
};
|
||||
private:
|
||||
std::unique_ptr<locator::abstract_replication_strategy> _replication_strategy;
|
||||
@@ -1145,7 +1106,6 @@ public:
|
||||
return _config.datadir;
|
||||
}
|
||||
|
||||
sstring column_family_directory(const sstring& base_path, const sstring& name, utils::UUID uuid) const;
|
||||
sstring column_family_directory(const sstring& name, utils::UUID uuid) const;
|
||||
};
|
||||
|
||||
@@ -1165,7 +1125,6 @@ struct database_config {
|
||||
seastar::scheduling_group memtable_scheduling_group;
|
||||
seastar::scheduling_group memtable_to_cache_scheduling_group; // FIXME: merge with memtable_scheduling_group
|
||||
seastar::scheduling_group compaction_scheduling_group;
|
||||
seastar::scheduling_group memory_compaction_scheduling_group;
|
||||
seastar::scheduling_group statement_scheduling_group;
|
||||
seastar::scheduling_group streaming_scheduling_group;
|
||||
size_t available_memory;
|
||||
@@ -1189,7 +1148,6 @@ private:
|
||||
static const size_t max_count_system_concurrent_reads{10};
|
||||
size_t max_memory_system_concurrent_reads() { return _dbcfg.available_memory * 0.02; };
|
||||
static constexpr size_t max_concurrent_sstable_loads() { return 3; }
|
||||
size_t max_memory_pending_view_updates() const { return _dbcfg.available_memory * 0.1; }
|
||||
|
||||
struct db_stats {
|
||||
uint64_t total_writes = 0;
|
||||
@@ -1201,11 +1159,6 @@ private:
|
||||
|
||||
uint64_t short_data_queries = 0;
|
||||
uint64_t short_mutation_queries = 0;
|
||||
|
||||
uint64_t multishard_query_unpopped_fragments = 0;
|
||||
uint64_t multishard_query_unpopped_bytes = 0;
|
||||
uint64_t multishard_query_failed_reader_stops = 0;
|
||||
uint64_t multishard_query_failed_reader_saves = 0;
|
||||
};
|
||||
|
||||
lw_shared_ptr<db_stats> _stats;
|
||||
@@ -1226,11 +1179,11 @@ private:
|
||||
|
||||
semaphore _sstable_load_concurrency_sem{max_concurrent_sstable_loads()};
|
||||
|
||||
db::timeout_semaphore _view_update_concurrency_sem{max_memory_pending_view_updates()};
|
||||
db::timeout_semaphore _view_update_concurrency_sem{100}; // Stand-in hack for #2538
|
||||
|
||||
cache_tracker _row_cache_tracker;
|
||||
|
||||
inheriting_concrete_execution_stage<future<lw_shared_ptr<query::result>>,
|
||||
concrete_execution_stage<future<lw_shared_ptr<query::result>>,
|
||||
column_family*,
|
||||
schema_ptr,
|
||||
const query::read_command&,
|
||||
@@ -1240,17 +1193,10 @@ private:
|
||||
query::result_memory_limiter&,
|
||||
uint64_t,
|
||||
db::timeout_clock::time_point,
|
||||
query::querier_cache_context> _data_query_stage;
|
||||
querier_cache_context> _data_query_stage;
|
||||
|
||||
mutation_query_stage _mutation_query_stage;
|
||||
|
||||
inheriting_concrete_execution_stage<
|
||||
future<>,
|
||||
database*,
|
||||
schema_ptr,
|
||||
const frozen_mutation&,
|
||||
db::timeout_clock::time_point> _apply_stage;
|
||||
|
||||
std::unordered_map<sstring, keyspace> _keyspaces;
|
||||
std::unordered_map<utils::UUID, lw_shared_ptr<column_family>> _column_families;
|
||||
std::unordered_map<std::pair<sstring, sstring>, utils::UUID, utils::tuple_hash> _ks_cf_to_uuid;
|
||||
@@ -1261,7 +1207,7 @@ private:
|
||||
seastar::metrics::metric_groups _metrics;
|
||||
bool _enable_incremental_backups = false;
|
||||
|
||||
query::querier_cache _querier_cache;
|
||||
querier_cache _querier_cache;
|
||||
|
||||
std::unique_ptr<db::large_partition_handler> _large_partition_handler;
|
||||
|
||||
@@ -1433,12 +1379,6 @@ public:
|
||||
std::unordered_set<sstring> get_initial_tokens();
|
||||
std::experimental::optional<gms::inet_address> get_replace_address();
|
||||
bool is_replacing();
|
||||
reader_concurrency_semaphore& user_read_concurrency_sem() {
|
||||
return _read_concurrency_sem;
|
||||
}
|
||||
reader_concurrency_semaphore& streaming_read_concurrency_sem() {
|
||||
return _streaming_concurrency_sem;
|
||||
}
|
||||
reader_concurrency_semaphore& system_keyspace_read_concurrency_sem() {
|
||||
return _system_read_concurrency_sem;
|
||||
}
|
||||
@@ -1455,25 +1395,15 @@ public:
|
||||
_querier_cache.set_entry_ttl(entry_ttl);
|
||||
}
|
||||
|
||||
const query::querier_cache::stats& get_querier_cache_stats() const {
|
||||
const querier_cache::stats& get_querier_cache_stats() const {
|
||||
return _querier_cache.get_stats();
|
||||
}
|
||||
|
||||
query::querier_cache& get_querier_cache() {
|
||||
return _querier_cache;
|
||||
}
|
||||
|
||||
db::view::update_backlog get_view_update_backlog() const {
|
||||
return {max_memory_pending_view_updates() - _view_update_concurrency_sem.current(), max_memory_pending_view_updates()};
|
||||
}
|
||||
|
||||
friend class distributed_loader;
|
||||
};
|
||||
|
||||
future<> update_schema_version_and_announce(distributed<service::storage_proxy>& proxy);
|
||||
|
||||
bool is_internal_keyspace(const sstring& name);
|
||||
|
||||
class distributed_loader {
|
||||
public:
|
||||
static void reshard(distributed<database>& db, sstring ks_name, sstring cf_name);
|
||||
|
||||
@@ -76,7 +76,8 @@ const uint32_t db::batchlog_manager::replay_interval;
|
||||
const uint32_t db::batchlog_manager::page_size;
|
||||
|
||||
db::batchlog_manager::batchlog_manager(cql3::query_processor& qp)
|
||||
: _qp(qp) {
|
||||
: _qp(qp)
|
||||
, _e1(_rd()) {
|
||||
namespace sm = seastar::metrics;
|
||||
|
||||
_metrics.add_group("batchlog_manager", {
|
||||
@@ -116,7 +117,7 @@ future<> db::batchlog_manager::start() {
|
||||
// round-robin scheduling.
|
||||
if (engine().cpu_id() == 0) {
|
||||
_timer.set_callback([this] {
|
||||
do_batch_log_replay().handle_exception([] (auto ep) {
|
||||
return do_batch_log_replay().handle_exception([] (auto ep) {
|
||||
blogger.error("Exception in batch replay: {}", ep);
|
||||
}).finally([this] {
|
||||
_timer.arm(lowres_clock::now() + std::chrono::milliseconds(replay_interval));
|
||||
@@ -267,7 +268,7 @@ future<> db::batchlog_manager::replay_all_failed_batches() {
|
||||
// send to partially or wholly fail in actually sending stuff. Since we don't
|
||||
// have hints (yet), send with CL=ALL, and hope we can re-do this soon.
|
||||
// See below, we use retry on write failure.
|
||||
return _qp.proxy().mutate(mutations, db::consistency_level::ALL, db::no_timeout, nullptr);
|
||||
return _qp.proxy().mutate(mutations, db::consistency_level::ALL, nullptr);
|
||||
});
|
||||
}).then_wrapped([this, id](future<> batch_result) {
|
||||
try {
|
||||
@@ -395,8 +396,10 @@ std::unordered_set<gms::inet_address> db::batchlog_manager::endpoint_filter(cons
|
||||
|
||||
// grab a random member of up to two racks
|
||||
for (auto& rack : racks) {
|
||||
auto rack_members = validated.bucket(rack);
|
||||
auto n = validated.bucket_size(rack_members);
|
||||
auto cpy = boost::copy_range<std::vector<gms::inet_address>>(validated.equal_range(rack) | boost::adaptors::map_values);
|
||||
std::uniform_int_distribution<size_t> rdist(0, cpy.size() - 1);
|
||||
std::uniform_int_distribution<size_t> rdist(0, n - 1);
|
||||
result.emplace(cpy[rdist(_e1)]);
|
||||
}
|
||||
|
||||
|
||||
@@ -75,7 +75,8 @@ private:
|
||||
unsigned _cpu = 0;
|
||||
bool _stop = false;
|
||||
|
||||
std::default_random_engine _e1{std::random_device{}()};
|
||||
std::random_device _rd;
|
||||
std::default_random_engine _e1;
|
||||
|
||||
future<> replay_all_failed_batches();
|
||||
public:
|
||||
|
||||
@@ -107,11 +107,6 @@ public:
|
||||
void process_bytes(const char* data, size_t size) {
|
||||
return _c.process(reinterpret_cast<const uint8_t*>(data), size);
|
||||
}
|
||||
template<typename FragmentedBuffer>
|
||||
GCC6_CONCEPT(requires FragmentRange<FragmentedBuffer>)
|
||||
void process_fragmented(const FragmentedBuffer& buffer) {
|
||||
return _c.process_fragmented(buffer);
|
||||
}
|
||||
};
|
||||
|
||||
class db::cf_holder {
|
||||
@@ -313,9 +308,10 @@ public:
|
||||
uint64_t get_num_dirty_segments() const;
|
||||
uint64_t get_num_active_segments() const;
|
||||
|
||||
using buffer_type = fragmented_temporary_buffer;
|
||||
using buffer_type = temporary_buffer<char>;
|
||||
|
||||
buffer_type acquire_buffer(size_t s);
|
||||
void release_buffer(buffer_type&&);
|
||||
|
||||
future<std::vector<descriptor>> list_descriptors(sstring dir);
|
||||
|
||||
@@ -337,6 +333,7 @@ private:
|
||||
segment_id_type _ids = 0;
|
||||
std::vector<sseg_ptr> _segments;
|
||||
queue<sseg_ptr> _reserve_segments;
|
||||
std::vector<buffer_type> _temp_buffers;
|
||||
std::unordered_map<flush_handler_id, flush_handler> _flush_handlers;
|
||||
flush_handler_id _flush_ids = 0;
|
||||
replay_position _flush_position;
|
||||
@@ -347,12 +344,6 @@ private:
|
||||
uint64_t _new_counter = 0;
|
||||
};
|
||||
|
||||
// Serializes a single value into an output stream.
//
// \tparam T      Type of the value; it is passed through net::hton() first.
// \tparam Output Any type providing write(const char*, size_t).
// \param out     Stream that receives sizeof(T) raw bytes.
// \param value   Value to serialize (taken by copy).
//
// NOTE(review): net::hton is presumably the host-to-network byte order
// conversion, so the bytes written would be big-endian - confirm.
template<typename T, typename Output>
|
||||
static void write(Output& out, T value) {
|
||||
auto v = net::hton(value);
|
||||
// Emit the converted value as its raw object representation.
out.write(reinterpret_cast<const char*>(&v), sizeof(v));
|
||||
}
|
||||
|
||||
/*
|
||||
* A single commit log file on disk. Manages creation of the file and writing mutations to disk,
|
||||
* as well as tracking the last mutation position of any "dirty" CFs covered by the segment file. Segment
|
||||
@@ -407,6 +398,7 @@ class db::commitlog::segment : public enable_shared_from_this<segment>, public c
|
||||
|
||||
uint64_t _file_pos = 0;
|
||||
uint64_t _flush_pos = 0;
|
||||
uint64_t _buf_pos = 0;
|
||||
bool _closed = false;
|
||||
|
||||
using buffer_type = segment_manager::buffer_type;
|
||||
@@ -415,7 +407,6 @@ class db::commitlog::segment : public enable_shared_from_this<segment>, public c
|
||||
using time_point = segment_manager::time_point;
|
||||
|
||||
buffer_type _buffer;
|
||||
fragmented_temporary_buffer::ostream _buffer_ostream;
|
||||
std::unordered_map<cf_id_type, uint64_t> _cf_dirty;
|
||||
time_point _sync_time;
|
||||
seastar::gate _gate;
|
||||
@@ -429,10 +420,6 @@ class db::commitlog::segment : public enable_shared_from_this<segment>, public c
|
||||
friend std::ostream& operator<<(std::ostream&, const segment&);
|
||||
friend class segment_manager;
|
||||
|
||||
// Returns _buffer.size_bytes() minus _buffer_ostream.size().
// NOTE(review): presumably _buffer_ostream.size() is the space still left to
// write in the current buffer, which would make the result the number of
// bytes already written into it - confirm against the ostream type.
size_t buffer_position() const {
|
||||
return _buffer.size_bytes() - _buffer_ostream.size();
|
||||
}
|
||||
|
||||
future<> begin_flush() {
|
||||
// This is maintaining the semantics of only using the write-lock
|
||||
// as a gate for flushing, i.e. once we've begun a flush for position X
|
||||
@@ -479,7 +466,7 @@ public:
|
||||
clogger.debug("Segment {} is no longer active and will submitted for delete now", *this);
|
||||
++_segment_manager->totals.segments_destroyed;
|
||||
_segment_manager->totals.total_size_on_disk -= size_on_disk();
|
||||
_segment_manager->totals.total_size -= (size_on_disk() + _buffer.size_bytes());
|
||||
_segment_manager->totals.total_size -= (size_on_disk() + _buffer.size());
|
||||
_segment_manager->add_file_to_delete(_file_name, _desc);
|
||||
} else {
|
||||
clogger.warn("Segment {} is dirty and is left on disk.", *this);
|
||||
@@ -620,16 +607,29 @@ public:
|
||||
auto a = align_up(s + overhead, alignment);
|
||||
auto k = std::max(a, default_size);
|
||||
|
||||
_buffer = _segment_manager->acquire_buffer(k);
|
||||
_buffer_ostream = _buffer.get_ostream();
|
||||
auto out = _buffer_ostream.write_substream(overhead);
|
||||
out.fill('\0', overhead);
|
||||
for (;;) {
|
||||
try {
|
||||
_buffer = _segment_manager->acquire_buffer(k);
|
||||
break;
|
||||
} catch (std::bad_alloc&) {
|
||||
clogger.warn("Could not allocate {} k bytes output buffer ({} k required)", k / 1024, a / 1024);
|
||||
if (k > a) {
|
||||
k = std::max(a, k / 2);
|
||||
clogger.debug("Trying reduced size: {} k", k / 1024);
|
||||
continue;
|
||||
}
|
||||
throw;
|
||||
}
|
||||
}
|
||||
_buf_pos = overhead;
|
||||
auto * p = reinterpret_cast<uint32_t *>(_buffer.get_write());
|
||||
std::fill(p, p + overhead, 0);
|
||||
_segment_manager->totals.total_size += k;
|
||||
}
|
||||
|
||||
bool buffer_is_empty() const {
|
||||
return buffer_position() <= segment_overhead_size
|
||||
|| (_file_pos == 0 && buffer_position() <= (segment_overhead_size + descriptor_header_size));
|
||||
return _buf_pos <= segment_overhead_size
|
||||
|| (_file_pos == 0 && _buf_pos <= (segment_overhead_size + descriptor_header_size));
|
||||
}
|
||||
/**
|
||||
* Send any buffer contents to disk and get a new tmp buffer
|
||||
@@ -641,32 +641,35 @@ public:
|
||||
}
|
||||
|
||||
auto size = clear_buffer_slack();
|
||||
auto buf = std::exchange(_buffer, { });
|
||||
auto buf = std::move(_buffer);
|
||||
auto off = _file_pos;
|
||||
auto top = off + size;
|
||||
auto num = _num_allocs;
|
||||
|
||||
_file_pos = top;
|
||||
_buffer_ostream = { };
|
||||
_buf_pos = 0;
|
||||
_num_allocs = 0;
|
||||
|
||||
auto me = shared_from_this();
|
||||
assert(me.use_count() > 1);
|
||||
|
||||
auto out = buf.get_ostream();
|
||||
auto * p = buf.get_write();
|
||||
assert(std::count(p, p + 2 * sizeof(uint32_t), 0) == 2 * sizeof(uint32_t));
|
||||
|
||||
data_output out(p, p + buf.size());
|
||||
|
||||
auto header_size = 0;
|
||||
|
||||
if (off == 0) {
|
||||
// first block. write file header.
|
||||
write(out, segment_magic);
|
||||
write(out, _desc.ver);
|
||||
write(out, _desc.id);
|
||||
out.write(segment_magic);
|
||||
out.write(_desc.ver);
|
||||
out.write(_desc.id);
|
||||
crc32_nbo crc;
|
||||
crc.process(_desc.ver);
|
||||
crc.process<int32_t>(_desc.id & 0xffffffff);
|
||||
crc.process<int32_t>(_desc.id >> 32);
|
||||
write(out, crc.checksum());
|
||||
out.write(crc.checksum());
|
||||
header_size = descriptor_header_size;
|
||||
}
|
||||
|
||||
@@ -676,8 +679,8 @@ public:
|
||||
crc.process<int32_t>(_desc.id >> 32);
|
||||
crc.process(uint32_t(off + header_size));
|
||||
|
||||
write(out, uint32_t(_file_pos));
|
||||
write(out, crc.checksum());
|
||||
out.write(uint32_t(_file_pos));
|
||||
out.write(crc.checksum());
|
||||
|
||||
forget_schema_versions();
|
||||
|
||||
@@ -687,32 +690,25 @@ public:
|
||||
|
||||
// The write will be allowed to start now, but flush (below) must wait for not only this,
|
||||
// but all previous write/flush pairs.
|
||||
return _pending_ops.run_with_ordered_post_op(rp, [this, size, off, buf = std::move(buf)]() mutable { ///////////////////////////////////////////////////
|
||||
auto view = fragmented_temporary_buffer::view(buf);
|
||||
view.remove_suffix(buf.size_bytes() - size);
|
||||
assert(size == view.size_bytes());
|
||||
return do_with(off, view, [&] (uint64_t& off, fragmented_temporary_buffer::view& view) {
|
||||
if (view.empty()) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
return repeat([this, size, &off, &view] {
|
||||
return _pending_ops.run_with_ordered_post_op(rp, [this, size, off, buf = std::move(buf)]() mutable {
|
||||
auto written = make_lw_shared<size_t>(0);
|
||||
auto p = buf.get();
|
||||
return repeat([this, size, off, written, p]() mutable {
|
||||
auto&& priority_class = service::get_local_commitlog_priority();
|
||||
auto current = *view.begin();
|
||||
return _file.dma_write(off, current.data(), current.size(), priority_class).then_wrapped([this, size, &off, &view](future<size_t>&& f) {
|
||||
return _file.dma_write(off + *written, p + *written, size - *written, priority_class).then_wrapped([this, size, written](future<size_t>&& f) {
|
||||
try {
|
||||
auto bytes = std::get<0>(f.get());
|
||||
*written += bytes;
|
||||
_segment_manager->totals.bytes_written += bytes;
|
||||
_segment_manager->totals.total_size_on_disk += bytes;
|
||||
++_segment_manager->totals.cycle_count;
|
||||
if (bytes == view.size_bytes()) {
|
||||
if (*written == size) {
|
||||
return make_ready_future<stop_iteration>(stop_iteration::yes);
|
||||
}
|
||||
// gah, partial write. should always get here with dma chunk sized
|
||||
// "bytes", but lets make sure...
|
||||
bytes = align_down(bytes, alignment);
|
||||
off += bytes;
|
||||
view.remove_prefix(bytes);
|
||||
clogger.debug("Partial write {}: {}/{} bytes", *this, size - view.size_bytes(), size);
|
||||
clogger.debug("Partial write {}: {}/{} bytes", *this, *written, size);
|
||||
*written = align_down(*written, alignment);
|
||||
return make_ready_future<stop_iteration>(stop_iteration::no);
|
||||
// TODO: retry/ignore/fail/stop - optional behaviour in origin.
|
||||
// we fast-fail the whole commit.
|
||||
@@ -721,10 +717,10 @@ public:
|
||||
throw;
|
||||
}
|
||||
});
|
||||
});
|
||||
}).finally([this, buf = std::move(buf), size] {
|
||||
}).finally([this, buf = std::move(buf), size]() mutable {
|
||||
_segment_manager->release_buffer(std::move(buf));
|
||||
_segment_manager->notify_memory_written(size);
|
||||
});
|
||||
});
|
||||
}, [me, flush_after, top, rp] { // lambda instead of bind, so we keep "me" alive.
|
||||
assert(me->_pending_ops.has_operation(rp));
|
||||
return flush_after ? me->do_flush(top) : make_ready_future<sseg_ptr>(me);
|
||||
@@ -790,7 +786,7 @@ public:
|
||||
return finish_and_get_new(timeout).then([id, writer = std::move(writer), permit = std::move(permit), timeout] (auto new_seg) mutable {
|
||||
return new_seg->allocate(id, std::move(writer), std::move(permit), timeout);
|
||||
});
|
||||
} else if (!_buffer.empty() && (s > _buffer_ostream.size())) { // enough data?
|
||||
} else if (!_buffer.empty() && (s > (_buffer.size() - _buf_pos))) { // enough data?
|
||||
if (_segment_manager->cfg.mode == sync_mode::BATCH) {
|
||||
// TODO: this could cause starvation if we're really unlucky.
|
||||
// If we run batch mode and find ourselves not fit in a non-empty
|
||||
@@ -809,7 +805,7 @@ public:
|
||||
size_t buf_memory = s;
|
||||
if (_buffer.empty()) {
|
||||
new_buffer(s);
|
||||
buf_memory += buffer_position();
|
||||
buf_memory += _buf_pos;
|
||||
}
|
||||
|
||||
_gate.enter(); // this might throw. I guess we accept this?
|
||||
@@ -817,26 +813,29 @@ public:
|
||||
_segment_manager->account_memory_usage(buf_memory);
|
||||
|
||||
replay_position rp(_desc.id, position());
|
||||
auto pos = _buf_pos;
|
||||
_buf_pos += s;
|
||||
_cf_dirty[id]++; // increase use count for cf.
|
||||
|
||||
rp_handle h(static_pointer_cast<cf_holder>(shared_from_this()), std::move(id), rp);
|
||||
|
||||
auto out = _buffer_ostream.write_substream(s);
|
||||
auto * p = _buffer.get_write() + pos;
|
||||
auto * e = _buffer.get_write() + pos + s - sizeof(uint32_t);
|
||||
|
||||
data_output out(p, e);
|
||||
crc32_nbo crc;
|
||||
|
||||
write<uint32_t>(out, s);
|
||||
out.write(uint32_t(s));
|
||||
crc.process(uint32_t(s));
|
||||
write<uint32_t>(out, crc.checksum());
|
||||
out.write(crc.checksum());
|
||||
|
||||
// actual data
|
||||
auto entry_out = out.write_substream(size);
|
||||
auto entry_data = entry_out.to_input_stream();
|
||||
writer->write(*this, entry_out);
|
||||
entry_data.with_stream([&] (auto data_str) {
|
||||
crc.process_fragmented(ser::buffer_view<typename std::vector<temporary_buffer<char>>::iterator>(data_str));
|
||||
});
|
||||
writer->write(*this, out);
|
||||
|
||||
write<uint32_t>(out, crc.checksum());
|
||||
crc.process_bytes(p + 2 * sizeof(uint32_t), size);
|
||||
|
||||
out = data_output(e, sizeof(uint32_t));
|
||||
out.write(crc.checksum());
|
||||
|
||||
++_segment_manager->totals.allocation_count;
|
||||
++_num_allocs;
|
||||
@@ -851,7 +850,7 @@ public:
|
||||
// If this buffer alone is too big, potentially bigger than the maximum allowed size,
|
||||
// then no other request will be allowed in to force the cycle()ing of this buffer. We
|
||||
// have to do it ourselves.
|
||||
if ((buffer_position() >= (db::commitlog::segment::default_size))) {
|
||||
if ((_buf_pos >= (db::commitlog::segment::default_size))) {
|
||||
cycle().discard_result().handle_exception([] (auto ex) {
|
||||
clogger.error("Failed to flush commits to disk: {}", ex);
|
||||
});
|
||||
@@ -861,7 +860,7 @@ public:
|
||||
}
|
||||
|
||||
position_type position() const {
|
||||
return position_type(_file_pos + buffer_position());
|
||||
return position_type(_file_pos + _buf_pos);
|
||||
}
|
||||
|
||||
size_t size_on_disk() const {
|
||||
@@ -871,12 +870,11 @@ public:
|
||||
// ensures no more of this segment is writeable, by allocating any unused section at the end and marking it discarded
|
||||
// a.k.a. zero the tail.
|
||||
size_t clear_buffer_slack() {
|
||||
auto buf_pos = buffer_position();
|
||||
auto size = align_up(buf_pos, alignment);
|
||||
auto fill_size = size - buf_pos;
|
||||
_buffer_ostream.fill('\0', fill_size);
|
||||
_segment_manager->totals.bytes_slack += fill_size;
|
||||
_segment_manager->account_memory_usage(fill_size);
|
||||
auto size = align_up(_buf_pos, alignment);
|
||||
std::fill(_buffer.get_write() + _buf_pos, _buffer.get_write() + size,
|
||||
0);
|
||||
_segment_manager->totals.bytes_slack += (size - _buf_pos);
|
||||
_segment_manager->account_memory_usage(size - _buf_pos);
|
||||
return size;
|
||||
}
|
||||
void mark_clean(const cf_id_type& id, uint64_t count) {
|
||||
@@ -1189,34 +1187,6 @@ void db::commitlog::segment_manager::flush_segments(bool force) {
|
||||
}
|
||||
}
|
||||
|
||||
/// \brief Helper for ensuring a file is closed if an exception is thrown.
|
||||
///
|
||||
/// The file provided by the file_fut future is passed to func.
|
||||
/// * If func throws an exception E, the file is closed and we return
|
||||
/// a failed future with E.
|
||||
/// * If func returns a value V, the file is not closed and we return
|
||||
/// a future with V.
|
||||
/// Note that when an exception is not thrown, it is the
|
||||
/// responsibility of func to make sure the file will be closed. It
|
||||
/// can close the file itself, return it, or store it somewhere.
|
||||
///
|
||||
/// \tparam Func The type of function this wraps
|
||||
/// \param file_fut A future that produces a file
|
||||
/// \param func A function that uses a file
|
||||
/// \return A future that passes the file produced by file_fut to func
|
||||
/// and closes it if func fails
|
||||
template <typename Func>
|
||||
static auto close_on_failure(future<file> file_fut, Func func) {
|
||||
// func is moved into the continuation; it is invoked once the file is available.
return file_fut.then([func = std::move(func)](file f) {
|
||||
// futurize_apply() is used so that the failure handler below is attached
// to whatever func produces.
// NOTE(review): presumably this also turns an exception thrown
// synchronously by func into a failed future - confirm.
return futurize_apply(func, f).handle_exception([f] (std::exception_ptr e) mutable {
|
||||
// Failure path only: close the file, then re-raise the ORIGINAL error.
// The outcome of close() (x) is never inspected, so a failure to close
// does not replace e.
return f.close().then_wrapped([f, e = std::move(e)] (future<> x) {
|
||||
// The failed future must have the same type func's result would have had.
using futurator = futurize<std::result_of_t<Func(file)>>;
|
||||
return futurator::make_exception_future(e);
|
||||
});
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager::allocate_segment(bool active) {
|
||||
static const auto flags = open_flags::wo | open_flags::create;
|
||||
|
||||
@@ -1247,7 +1217,7 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
|
||||
return fut;
|
||||
});
|
||||
|
||||
return close_on_failure(std::move(fut), [this, d, active, filename] (file f) {
|
||||
return fut.then([this, d, active, filename](file f) {
|
||||
f = make_checked_file(commit_error_handler, f);
|
||||
// xfs doesn't like files extended beyond eof, so enlarge the file
|
||||
return f.truncate(max_size).then([this, d, active, f, filename] () mutable {
|
||||
@@ -1544,20 +1514,41 @@ uint64_t db::commitlog::segment_manager::get_num_active_segments() const {
|
||||
|
||||
|
||||
db::commitlog::segment_manager::buffer_type db::commitlog::segment_manager::acquire_buffer(size_t s) {
|
||||
s = align_up(s, segment::default_size);
|
||||
auto fragment_count = s / segment::default_size;
|
||||
auto i = _temp_buffers.begin();
|
||||
auto e = _temp_buffers.end();
|
||||
|
||||
std::vector<temporary_buffer<char>> buffers;
|
||||
buffers.reserve(fragment_count);
|
||||
while (buffers.size() < fragment_count) {
|
||||
auto a = ::memalign(segment::alignment, segment::default_size);
|
||||
if (a == nullptr) {
|
||||
throw std::bad_alloc();
|
||||
while (i != e) {
|
||||
if (i->size() >= s) {
|
||||
auto r = std::move(*i);
|
||||
_temp_buffers.erase(i);
|
||||
totals.buffer_list_bytes -= r.size();
|
||||
return r;
|
||||
}
|
||||
buffers.emplace_back(static_cast<char*>(a), segment::default_size, make_free_deleter(a));
|
||||
++i;
|
||||
}
|
||||
auto a = ::memalign(segment::alignment, s);
|
||||
if (a == nullptr) {
|
||||
throw std::bad_alloc();
|
||||
}
|
||||
clogger.trace("Allocated {} k buffer", s / 1024);
|
||||
return fragmented_temporary_buffer(std::move(buffers), s);
|
||||
return buffer_type(reinterpret_cast<char *>(a), s, make_free_deleter(a));
|
||||
}
|
||||
|
||||
// Hands a buffer back to the segment manager's list of reusable buffers
// (_temp_buffers) and refreshes the totals.buffer_list_bytes statistic.
//
// \param b Buffer to return; it is moved into _temp_buffers.
//
// The list is kept sorted by ascending size and holds at most
// max_temp_buffers entries.
void db::commitlog::segment_manager::release_buffer(buffer_type&& b) {
|
||||
_temp_buffers.emplace_back(std::move(b));
|
||||
// Re-sort the whole list, smallest buffer first.
std::sort(_temp_buffers.begin(), _temp_buffers.end(), [](const buffer_type& b1, const buffer_type& b2) {
|
||||
return b1.size() < b2.size();
|
||||
});
|
||||
|
||||
constexpr const size_t max_temp_buffers = 4;
|
||||
|
||||
// Over the cap: drop everything past the first max_temp_buffers entries.
// Because of the ascending sort above, the entries dropped are the largest.
if (_temp_buffers.size() > max_temp_buffers) {
|
||||
clogger.trace("Deleting {} buffers", _temp_buffers.size() - max_temp_buffers);
|
||||
_temp_buffers.erase(_temp_buffers.begin() + max_temp_buffers, _temp_buffers.end());
|
||||
}
|
||||
// Recompute the statistic from scratch as the sum of the retained buffers' sizes.
totals.buffer_list_bytes = boost::accumulate(
|
||||
_temp_buffers | boost::adaptors::transformed(std::mem_fn(&buffer_type::size)),
|
||||
size_t(0), std::plus<size_t>());
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -1703,14 +1694,14 @@ const db::commitlog::config& db::commitlog::active_config() const {
|
||||
// No commit_io_check needed in the log reader since the database will fail
|
||||
// on error at startup if required
|
||||
future<std::unique_ptr<subscription<temporary_buffer<char>, db::replay_position>>>
|
||||
db::commitlog::read_log_file(const sstring& filename, seastar::io_priority_class read_io_prio_class, commit_load_reader_func next, position_type off, const db::extensions* exts) {
|
||||
db::commitlog::read_log_file(const sstring& filename, commit_load_reader_func next, position_type off, const db::extensions* exts) {
|
||||
struct work {
|
||||
private:
|
||||
file_input_stream_options make_file_input_stream_options(seastar::io_priority_class read_io_prio_class) {
|
||||
file_input_stream_options make_file_input_stream_options() {
|
||||
file_input_stream_options fo;
|
||||
fo.buffer_size = db::commitlog::segment::default_size;
|
||||
fo.read_ahead = 10;
|
||||
fo.io_priority_class = read_io_prio_class;
|
||||
fo.io_priority_class = service::get_local_commitlog_priority();
|
||||
return fo;
|
||||
}
|
||||
public:
|
||||
@@ -1729,8 +1720,8 @@ db::commitlog::read_log_file(const sstring& filename, seastar::io_priority_class
|
||||
bool header = true;
|
||||
bool failed = false;
|
||||
|
||||
work(file f, seastar::io_priority_class read_io_prio_class, position_type o = 0)
|
||||
: f(f), fin(make_file_input_stream(f, 0, make_file_input_stream_options(read_io_prio_class))), start_off(o) {
|
||||
work(file f, position_type o = 0)
|
||||
: f(f), fin(make_file_input_stream(f, 0, make_file_input_stream_options())), start_off(o) {
|
||||
}
|
||||
work(work&&) = default;
|
||||
|
||||
@@ -1785,7 +1776,7 @@ db::commitlog::read_log_file(const sstring& filename, seastar::io_priority_class
|
||||
}
|
||||
|
||||
if (magic != segment::segment_magic) {
|
||||
throw invalid_segment_format();
|
||||
throw std::invalid_argument("Not a scylla format commitlog file");
|
||||
}
|
||||
crc32_nbo crc;
|
||||
crc.process(ver);
|
||||
@@ -1794,7 +1785,7 @@ db::commitlog::read_log_file(const sstring& filename, seastar::io_priority_class
|
||||
|
||||
auto cs = crc.checksum();
|
||||
if (cs != checksum) {
|
||||
throw header_checksum_error();
|
||||
throw std::runtime_error("Checksum error in file header");
|
||||
}
|
||||
|
||||
this->id = id;
|
||||
@@ -1948,9 +1939,9 @@ db::commitlog::read_log_file(const sstring& filename, seastar::io_priority_class
|
||||
return fut;
|
||||
});
|
||||
|
||||
return fut.then([off, next, read_io_prio_class] (file f) {
|
||||
return fut.then([off, next](file f) {
|
||||
f = make_checked_file(commit_error_handler, std::move(f));
|
||||
auto w = make_lw_shared<work>(std::move(f), read_io_prio_class, off);
|
||||
auto w = make_lw_shared<work>(std::move(f), off);
|
||||
auto ret = w->s.listen(next);
|
||||
|
||||
w->s.started().then(std::bind(&work::read_file, w.get())).then([w] {
|
||||
|
||||
@@ -42,6 +42,7 @@
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "utils/data_output.hh"
|
||||
#include "core/future.hh"
|
||||
#include "core/shared_ptr.hh"
|
||||
#include "core/stream.hh"
|
||||
@@ -175,7 +176,7 @@ public:
|
||||
* of data to be written. (See add).
|
||||
* Don't write less, absolutely don't write more...
|
||||
*/
|
||||
using output = fragmented_temporary_buffer::ostream;
|
||||
using output = data_output;
|
||||
using serializer_func = std::function<void(output&)>;
|
||||
|
||||
/**
|
||||
@@ -342,42 +343,20 @@ public:
|
||||
|
||||
typedef std::function<future<>(temporary_buffer<char>, replay_position)> commit_load_reader_func;
|
||||
|
||||
class segment_error : public std::exception {};
|
||||
|
||||
class segment_data_corruption_error: public segment_error {
|
||||
std::string _msg;
|
||||
class segment_data_corruption_error: public std::runtime_error {
|
||||
public:
|
||||
segment_data_corruption_error(std::string msg, uint64_t s)
|
||||
: _msg(std::move(msg)), _bytes(s) {
|
||||
: std::runtime_error(msg), _bytes(s) {
|
||||
}
|
||||
uint64_t bytes() const {
|
||||
return _bytes;
|
||||
}
|
||||
virtual const char* what() const noexcept {
|
||||
return _msg.c_str();
|
||||
}
|
||||
private:
|
||||
uint64_t _bytes;
|
||||
};
|
||||
|
||||
// Exception derived from segment_error whose what() always returns the fixed
// message below. read_log_file() throws it when the magic value read from a
// file does not equal segment::segment_magic.
class invalid_segment_format : public segment_error {
|
||||
static constexpr const char* _msg = "Not a scylla format commitlog file";
|
||||
public:
|
||||
// Returns the fixed message.
virtual const char* what() const noexcept {
|
||||
return _msg;
|
||||
}
|
||||
};
|
||||
|
||||
// Exception derived from segment_error whose what() always returns the fixed
// message below. read_log_file() throws it when the CRC it computes over the
// file header fields differs from the checksum stored in the file.
class header_checksum_error : public segment_error {
|
||||
static constexpr const char* _msg = "Checksum error in file header";
|
||||
public:
|
||||
// Returns the fixed message.
virtual const char* what() const noexcept {
|
||||
return _msg;
|
||||
}
|
||||
};
|
||||
|
||||
static future<std::unique_ptr<subscription<temporary_buffer<char>, replay_position>>> read_log_file(
|
||||
const sstring&, seastar::io_priority_class read_io_prio_class, commit_load_reader_func, position_type = 0, const db::extensions* = nullptr);
|
||||
const sstring&, commit_load_reader_func, position_type = 0, const db::extensions* = nullptr);
|
||||
private:
|
||||
commitlog(config);
|
||||
|
||||
|
||||
@@ -51,8 +51,9 @@ void commitlog_entry_writer::compute_size() {
|
||||
_size = ms.size();
|
||||
}
|
||||
|
||||
void commitlog_entry_writer::write(typename seastar::memory_output_stream<std::vector<temporary_buffer<char>>::iterator>& out) const {
|
||||
serialize(out);
|
||||
void commitlog_entry_writer::write(data_output& out) const {
|
||||
seastar::simple_output_stream str(out.reserve(size()), size());
|
||||
serialize(str);
|
||||
}
|
||||
|
||||
commitlog_entry_reader::commitlog_entry_reader(const temporary_buffer<char>& buffer)
|
||||
|
||||
@@ -25,6 +25,7 @@
|
||||
|
||||
#include "frozen_mutation.hh"
|
||||
#include "schema.hh"
|
||||
#include "utils/data_output.hh"
|
||||
#include "stdx.hh"
|
||||
|
||||
class commitlog_entry {
|
||||
@@ -34,8 +35,7 @@ public:
|
||||
commitlog_entry(stdx::optional<column_mapping> mapping, frozen_mutation&& mutation)
|
||||
: _mapping(std::move(mapping)), _mutation(std::move(mutation)) { }
|
||||
const stdx::optional<column_mapping>& mapping() const { return _mapping; }
|
||||
const frozen_mutation& mutation() const & { return _mutation; }
|
||||
frozen_mutation&& mutation() && { return std::move(_mutation); }
|
||||
const frozen_mutation& mutation() const { return _mutation; }
|
||||
};
|
||||
|
||||
class commitlog_entry_writer {
|
||||
@@ -72,7 +72,7 @@ public:
|
||||
return _mutation.representation().size();
|
||||
}
|
||||
|
||||
void write(typename seastar::memory_output_stream<std::vector<temporary_buffer<char>>::iterator>& out) const;
|
||||
void write(data_output& out) const;
|
||||
};
|
||||
|
||||
class commitlog_entry_reader {
|
||||
@@ -81,6 +81,5 @@ public:
|
||||
commitlog_entry_reader(const temporary_buffer<char>& buffer);
|
||||
|
||||
const stdx::optional<column_mapping>& get_column_mapping() const { return _ce.mapping(); }
|
||||
const frozen_mutation& mutation() const & { return _ce.mutation(); }
|
||||
frozen_mutation&& mutation() && { return std::move(_ce).mutation(); }
|
||||
const frozen_mutation& mutation() const { return _ce.mutation(); }
|
||||
};
|
||||
|
||||
@@ -58,7 +58,6 @@
|
||||
#include "converting_mutation_partition_applier.hh"
|
||||
#include "schema_registry.hh"
|
||||
#include "commitlog_entry.hh"
|
||||
#include "service/priority_manager.hh"
|
||||
|
||||
static logging::logger rlogger("commitlog_replayer");
|
||||
|
||||
@@ -224,7 +223,7 @@ db::commitlog_replayer::impl::recover(sstring file, const sstring& fname_prefix)
|
||||
auto s = make_lw_shared<stats>();
|
||||
auto& exts = _qp.local().db().local().get_config().extensions();
|
||||
|
||||
return db::commitlog::read_log_file(file, service::get_local_commitlog_priority(),
|
||||
return db::commitlog::read_log_file(file,
|
||||
std::bind(&impl::process, this, s.get(), std::placeholders::_1,
|
||||
std::placeholders::_2), p, &exts).then([](auto s) {
|
||||
auto f = s->done();
|
||||
|
||||
@@ -102,8 +102,6 @@ db::config::config()
|
||||
db::config::~config()
|
||||
{}
|
||||
|
||||
const sstring db::config::default_tls_priority("SECURE128:-VERS-TLS1.0");
|
||||
|
||||
namespace utils {
|
||||
|
||||
template<>
|
||||
|
||||
19
db/config.hh
19
db/config.hh
@@ -155,9 +155,6 @@ public:
|
||||
val(hints_directory, sstring, "/var/lib/scylla/hints", Used, \
|
||||
"The directory where hints files are stored if hinted handoff is enabled." \
|
||||
) \
|
||||
val(view_hints_directory, sstring, "/var/lib/scylla/view_hints", Used, \
|
||||
"The directory where materialized-view updates are stored while a view replica is unreachable." \
|
||||
) \
|
||||
val(saved_caches_directory, sstring, "/var/lib/scylla/saved_caches", Unused, \
|
||||
"The directory location where table key and row caches are stored." \
|
||||
) \
|
||||
@@ -456,7 +453,7 @@ public:
|
||||
"The maximum number of tombstones a query can scan before aborting." \
|
||||
) \
|
||||
/* Network timeout settings */ \
|
||||
val(range_request_timeout_in_ms, uint32_t, 10000, Used, \
|
||||
val(range_request_timeout_in_ms, uint32_t, 10000, Unused, \
|
||||
"The time in milliseconds that the coordinator waits for sequential or index scans to complete." \
|
||||
) \
|
||||
val(read_request_timeout_in_ms, uint32_t, 5000, Used, \
|
||||
@@ -475,7 +472,7 @@ public:
|
||||
"The time in milliseconds that the coordinator waits for write operations to complete.\n" \
|
||||
"Related information: About hinted handoff writes" \
|
||||
) \
|
||||
val(request_timeout_in_ms, uint32_t, 10000, Used, \
|
||||
val(request_timeout_in_ms, uint32_t, 10000, Unused, \
|
||||
"The default timeout for other, miscellaneous operations.\n" \
|
||||
"Related information: About hinted handoff writes" \
|
||||
) \
|
||||
@@ -581,8 +578,8 @@ public:
|
||||
val(dynamic_snitch_update_interval_in_ms, uint32_t, 100, Unused, \
|
||||
"The time interval for how often the snitch calculates node scores. Because score calculation is CPU intensive, be careful when reducing this interval." \
|
||||
) \
|
||||
val(hinted_handoff_enabled, sstring, "true", Used, \
|
||||
"Enable or disable hinted handoff. To enable per data center, add data center list. For example: hinted_handoff_enabled: DC1,DC2. A hint indicates that the write needs to be replayed to an unavailable node. " \
|
||||
val(hinted_handoff_enabled, sstring, "false", Used, \
|
||||
"Experimental: enable or disable hinted handoff. To enable per data center, add data center list. For example: hinted_handoff_enabled: DC1,DC2. A hint indicates that the write needs to be replayed to an unavailable node. " \
|
||||
"Related information: About hinted handoff writes" \
|
||||
) \
|
||||
val(hinted_handoff_throttle_in_kb, uint32_t, 1024, Unused, \
|
||||
@@ -624,7 +621,7 @@ public:
|
||||
val(thrift_framed_transport_size_in_mb, uint32_t, 15, Unused, \
|
||||
"Frame size (maximum field length) for Thrift. The frame is the row or part of the row the application is inserting." \
|
||||
) \
|
||||
val(thrift_max_message_length_in_mb, uint32_t, 16, Used, \
|
||||
val(thrift_max_message_length_in_mb, uint32_t, 16, Unused, \
|
||||
"The maximum length of a Thrift message in megabytes, including all fields and internal Thrift overhead (1 byte of overhead for each frame). Message length is usually used in conjunction with batches. A frame length greater than or equal to 24 accommodates a batch with four inserts, each of which is 24 bytes. The required message length is greater than or equal to 24+24+24+24+4 (number of frames)." \
|
||||
) \
|
||||
/* Security properties */ \
|
||||
@@ -731,7 +728,7 @@ public:
|
||||
val(prometheus_address, sstring, "0.0.0.0", Used, "Prometheus listening address") \
|
||||
val(prometheus_prefix, sstring, "scylla", Used, "Set the prefix of the exported Prometheus metrics. Changing this will break Scylla's dashboard compatibility, do not change unless you know what you are doing.") \
|
||||
val(abort_on_lsa_bad_alloc, bool, false, Used, "Abort when allocation in LSA region fails") \
|
||||
val(murmur3_partitioner_ignore_msb_bits, unsigned, 12, Used, "Number of most siginificant token bits to ignore in murmur3 partitioner; increase for very large clusters") \
|
||||
val(murmur3_partitioner_ignore_msb_bits, unsigned, 0, Used, "Number of most siginificant token bits to ignore in murmur3 partitioner; increase for very large clusters") \
|
||||
val(virtual_dirty_soft_limit, double, 0.6, Used, "Soft limit of virtual dirty memory expressed as a portion of the hard limit") \
|
||||
val(sstable_summary_ratio, double, 0.0005, Used, "Enforces that 1 byte of summary is written for every N (2000 by default) " \
|
||||
"bytes written to data file. Value must be between 0 and 1.") \
|
||||
@@ -742,8 +739,6 @@ public:
|
||||
" Performance is affected to some extent as a result. Useful to help debugging problems that may arise at another layers.") \
|
||||
val(cpu_scheduler, bool, true, Used, "Enable cpu scheduling") \
|
||||
val(view_building, bool, true, Used, "Enable view building; should only be set to false when the node is experience issues due to view building") \
|
||||
val(enable_sstables_mc_format, bool, false, Used, "Enable SSTables 'mc' format to be used as the default file format") \
|
||||
val(abort_on_internal_error, bool, false, Used, "Abort the server instead of throwing exception when internal invariants are violated.") \
|
||||
/* done! */
|
||||
|
||||
#define _make_value_member(name, type, deflt, status, desc, ...) \
|
||||
@@ -757,8 +752,6 @@ public:
|
||||
add_options(boost::program_options::options_description_easy_init&);
|
||||
|
||||
const db::extensions& extensions() const;
|
||||
|
||||
static const sstring default_tls_priority;
|
||||
private:
|
||||
template<typename T>
|
||||
struct log_legacy_value : public named_value<T, value_status::Used> {
|
||||
|
||||
@@ -253,12 +253,8 @@ filter_for_query(consistency_level cl,
|
||||
return selected_endpoints;
|
||||
}
|
||||
|
||||
std::vector<gms::inet_address> filter_for_query(consistency_level cl,
|
||||
keyspace& ks,
|
||||
std::vector<gms::inet_address>& live_endpoints,
|
||||
const std::vector<gms::inet_address>& preferred_endpoints,
|
||||
column_family* cf) {
|
||||
return filter_for_query(cl, ks, live_endpoints, preferred_endpoints, read_repair_decision::NONE, nullptr, cf);
|
||||
std::vector<gms::inet_address> filter_for_query(consistency_level cl, keyspace& ks, std::vector<gms::inet_address>& live_endpoints, column_family* cf) {
|
||||
return filter_for_query(cl, ks, live_endpoints, {}, read_repair_decision::NONE, nullptr, cf);
|
||||
}
|
||||
|
||||
bool
|
||||
|
||||
@@ -84,11 +84,7 @@ filter_for_query(consistency_level cl,
|
||||
gms::inet_address* extra,
|
||||
column_family* cf);
|
||||
|
||||
std::vector<gms::inet_address> filter_for_query(consistency_level cl,
|
||||
keyspace& ks,
|
||||
std::vector<gms::inet_address>& live_endpoints,
|
||||
const std::vector<gms::inet_address>& preferred_endpoints,
|
||||
column_family* cf);
|
||||
std::vector<gms::inet_address> filter_for_query(consistency_level cl, keyspace& ks, std::vector<gms::inet_address>& live_endpoints, column_family* cf);
|
||||
|
||||
struct dc_node_count {
|
||||
size_t live = 0;
|
||||
|
||||
@@ -49,10 +49,7 @@
|
||||
#include "types.hh"
|
||||
|
||||
static ::shared_ptr<cql3::cql3_type::raw> parse_raw(const sstring& str) {
|
||||
return cql3::util::do_with_parser(str,
|
||||
[] (cql3_parser::CqlParser& parser) {
|
||||
return parser.comparator_type(true);
|
||||
});
|
||||
return cql3::util::do_with_parser(str, std::mem_fn(&cql3_parser::CqlParser::comparatorType));
|
||||
}
|
||||
|
||||
data_type db::cql_type_parser::parse(const sstring& keyspace, const sstring& str, lw_shared_ptr<user_types_metadata> user_types) {
|
||||
|
||||
@@ -28,7 +28,8 @@ logging::logger hr_logger("heat_load_balance");
|
||||
// Return a uniformly-distributed random number in [0,1)
|
||||
// We use per-thread state for thread safety. We seed the random number generator
|
||||
// once with a real random value, if available.
|
||||
static thread_local std::default_random_engine random_engine{std::random_device{}()};
|
||||
static thread_local std::random_device r;
|
||||
static thread_local std::default_random_engine random_engine(r());
|
||||
float
|
||||
rand_float() {
|
||||
static thread_local std::uniform_real_distribution<float> u(0, 1);
|
||||
|
||||
@@ -20,11 +20,9 @@
|
||||
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <algorithm>
|
||||
#include <seastar/core/future.hh>
|
||||
#include <seastar/core/seastar.hh>
|
||||
#include <seastar/core/gate.hh>
|
||||
#include <boost/range/adaptors.hpp>
|
||||
#include "service/storage_service.hh"
|
||||
#include "utils/div_ceil.hh"
|
||||
#include "db/config.hh"
|
||||
@@ -35,9 +33,6 @@
|
||||
#include "disk-error-handler.hh"
|
||||
#include "lister.hh"
|
||||
#include "db/timeout_clock.hh"
|
||||
#include "service/priority_manager.hh"
|
||||
|
||||
using namespace std::literals::chrono_literals;
|
||||
|
||||
namespace db {
|
||||
namespace hints {
|
||||
@@ -79,12 +74,6 @@ void manager::register_metrics(const sstring& group_name) {
|
||||
|
||||
sm::make_derive("sent", _stats.sent,
|
||||
sm::description("Number of sent hints.")),
|
||||
|
||||
sm::make_derive("discarded", _stats.discarded,
|
||||
sm::description("Number of hints that were discarded during sending (too old, schema changed, etc.).")),
|
||||
|
||||
sm::make_derive("corrupted_files", _stats.corrupted_files,
|
||||
sm::description("Number of hints files that were discarded during sending because the file was corrupted.")),
|
||||
});
|
||||
}
|
||||
|
||||
@@ -102,7 +91,6 @@ future<> manager::start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr
|
||||
return compute_hints_dir_device_id();
|
||||
}).then([this] {
|
||||
_strorage_service_anchor->register_subscriber(this);
|
||||
set_started();
|
||||
});
|
||||
}
|
||||
|
||||
@@ -113,12 +101,12 @@ future<> manager::stop() {
|
||||
_strorage_service_anchor->unregister_subscriber(this);
|
||||
}
|
||||
|
||||
set_stopping();
|
||||
_stopping = true;
|
||||
|
||||
return _draining_eps_gate.close().finally([this] {
|
||||
return parallel_for_each(_ep_managers, [] (auto& pair) {
|
||||
return pair.second.stop();
|
||||
}).finally([this] {
|
||||
return pair.second.stop();
|
||||
}).finally([this] {
|
||||
_ep_managers.clear();
|
||||
manager_logger.info("Stopped");
|
||||
}).discard_result();
|
||||
@@ -239,8 +227,6 @@ future<> manager::end_point_hints_manager::stop(drain should_drain) noexcept {
|
||||
manager::end_point_hints_manager::end_point_hints_manager(const key_type& key, manager& shard_manager)
|
||||
: _key(key)
|
||||
, _shard_manager(shard_manager)
|
||||
, _file_update_mutex_ptr(make_lw_shared<seastar::shared_mutex>())
|
||||
, _file_update_mutex(*_file_update_mutex_ptr)
|
||||
, _state(state_set::of<state::stopped>())
|
||||
, _hints_dir(_shard_manager.hints_dir() / format("{}", _key).c_str())
|
||||
, _sender(*this, _shard_manager.local_storage_proxy(), _shard_manager.local_db(), _shard_manager.local_gossiper())
|
||||
@@ -249,8 +235,6 @@ manager::end_point_hints_manager::end_point_hints_manager(const key_type& key, m
|
||||
manager::end_point_hints_manager::end_point_hints_manager(end_point_hints_manager&& other)
|
||||
: _key(other._key)
|
||||
, _shard_manager(other._shard_manager)
|
||||
, _file_update_mutex_ptr(std::move(other._file_update_mutex_ptr))
|
||||
, _file_update_mutex(*_file_update_mutex_ptr)
|
||||
, _state(other._state)
|
||||
, _hints_dir(std::move(other._hints_dir))
|
||||
, _sender(other._sender, *this)
|
||||
@@ -289,7 +273,7 @@ inline bool manager::have_ep_manager(ep_key_type ep) const noexcept {
|
||||
}
|
||||
|
||||
bool manager::store_hint(ep_key_type ep, schema_ptr s, lw_shared_ptr<const frozen_mutation> fm, tracing::trace_state_ptr tr_state) noexcept {
|
||||
if (stopping() || !started() || !can_hint_for(ep)) {
|
||||
if (_stopping || !can_hint_for(ep)) {
|
||||
manager_logger.trace("Can't store a hint to {}", ep);
|
||||
++_stats.dropped;
|
||||
return false;
|
||||
@@ -392,7 +376,7 @@ future<timespec> manager::end_point_hints_manager::sender::get_last_file_modific
|
||||
});
|
||||
}
|
||||
|
||||
future<> manager::end_point_hints_manager::sender::do_send_one_mutation(frozen_mutation_and_schema m, const std::vector<gms::inet_address>& natural_endpoints) noexcept {
|
||||
future<> manager::end_point_hints_manager::sender::do_send_one_mutation(mutation m, const std::vector<gms::inet_address>& natural_endpoints) noexcept {
|
||||
return futurize_apply([this, m = std::move(m), &natural_endpoints] () mutable -> future<> {
|
||||
// The fact that we send with CL::ALL in both cases below ensures that new hints are not going
|
||||
// to be generated as a result of hints sending.
|
||||
@@ -401,11 +385,7 @@ future<> manager::end_point_hints_manager::sender::do_send_one_mutation(frozen_m
|
||||
return _proxy.send_to_endpoint(std::move(m), end_point_key(), { }, write_type::SIMPLE);
|
||||
} else {
|
||||
manager_logger.trace("Endpoints set has changed and {} is no longer a replica. Mutating from scratch...", end_point_key());
|
||||
// FIXME: using 1h as infinite timeout. If a node is down, we should get an
|
||||
// unavailable exception.
|
||||
auto timeout = db::timeout_clock::now() + 1h;
|
||||
//FIXME: Add required frozen_mutation overloads
|
||||
return _proxy.mutate({m.fm.unfreeze(m.s)}, consistency_level::ALL, timeout, nullptr);
|
||||
return _proxy.mutate({std::move(m)}, consistency_level::ALL, nullptr);
|
||||
}
|
||||
});
|
||||
}
|
||||
@@ -431,19 +411,21 @@ bool manager::end_point_hints_manager::sender::can_send() noexcept {
|
||||
}
|
||||
}
|
||||
|
||||
frozen_mutation_and_schema manager::end_point_hints_manager::sender::get_mutation(lw_shared_ptr<send_one_file_ctx> ctx_ptr, temporary_buffer<char>& buf) {
|
||||
mutation manager::end_point_hints_manager::sender::get_mutation(lw_shared_ptr<send_one_file_ctx> ctx_ptr, temporary_buffer<char>& buf) {
|
||||
hint_entry_reader hr(buf);
|
||||
auto& fm = hr.mutation();
|
||||
auto& cm = get_column_mapping(std::move(ctx_ptr), fm, hr);
|
||||
auto schema = _db.find_schema(fm.column_family_id());
|
||||
auto& cf = _db.find_column_family(fm.column_family_id());
|
||||
|
||||
if (schema->version() != fm.schema_version()) {
|
||||
mutation m(schema, fm.decorated_key(*schema));
|
||||
converting_mutation_partition_applier v(cm, *schema, m.partition());
|
||||
if (cf.schema()->version() != fm.schema_version()) {
|
||||
mutation m(cf.schema(), fm.decorated_key(*cf.schema()));
|
||||
converting_mutation_partition_applier v(cm, *cf.schema(), m.partition());
|
||||
fm.partition().accept(cm, v);
|
||||
return {freeze(m), std::move(schema)};
|
||||
|
||||
return std::move(m);
|
||||
} else {
|
||||
return fm.unfreeze(cf.schema());
|
||||
}
|
||||
return {std::move(hr).mutation(), std::move(schema)};
|
||||
}
|
||||
|
||||
const column_mapping& manager::end_point_hints_manager::sender::get_column_mapping(lw_shared_ptr<send_one_file_ctx> ctx_ptr, const frozen_mutation& fm, const hint_entry_reader& hr) {
|
||||
@@ -513,42 +495,35 @@ bool manager::check_dc_for(ep_key_type ep) const noexcept {
|
||||
}
|
||||
|
||||
void manager::drain_for(gms::inet_address endpoint) {
|
||||
if (stopping()) {
|
||||
if (_stopping) {
|
||||
return;
|
||||
}
|
||||
|
||||
manager_logger.trace("on_leave_cluster: {} is removed/decommissioned", endpoint);
|
||||
|
||||
with_gate(_draining_eps_gate, [this, endpoint] {
|
||||
return with_semaphore(drain_lock(), 1, [this, endpoint] {
|
||||
return futurize_apply([this, endpoint] () {
|
||||
if (utils::fb_utilities::is_me(endpoint)) {
|
||||
return parallel_for_each(_ep_managers, [] (auto& pair) {
|
||||
return pair.second.stop(drain::yes).finally([&pair] {
|
||||
return with_file_update_mutex(pair.second, [&pair] {
|
||||
return remove_file(pair.second.hints_dir().c_str());
|
||||
});
|
||||
});
|
||||
}).finally([this] {
|
||||
_ep_managers.clear();
|
||||
return futurize_apply([this, endpoint] () {
|
||||
if (utils::fb_utilities::is_me(endpoint)) {
|
||||
return parallel_for_each(_ep_managers, [] (auto& pair) {
|
||||
return pair.second.stop(drain::yes).finally([&pair] {
|
||||
return remove_file(pair.second.hints_dir().c_str());
|
||||
});
|
||||
}).finally([this] {
|
||||
_ep_managers.clear();
|
||||
});
|
||||
} else {
|
||||
ep_managers_map_type::iterator ep_manager_it = find_ep_manager(endpoint);
|
||||
if (ep_manager_it != ep_managers_end()) {
|
||||
return ep_manager_it->second.stop(drain::yes).finally([this, endpoint, hints_dir = ep_manager_it->second.hints_dir()] {
|
||||
_ep_managers.erase(endpoint);
|
||||
return remove_file(hints_dir.c_str());
|
||||
});
|
||||
} else {
|
||||
ep_managers_map_type::iterator ep_manager_it = find_ep_manager(endpoint);
|
||||
if (ep_manager_it != ep_managers_end()) {
|
||||
return ep_manager_it->second.stop(drain::yes).finally([this, endpoint, &ep_man = ep_manager_it->second] {
|
||||
return with_file_update_mutex(ep_man, [&ep_man] {
|
||||
return remove_file(ep_man.hints_dir().c_str());
|
||||
}).finally([this, endpoint] {
|
||||
_ep_managers.erase(endpoint);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
return make_ready_future<>();
|
||||
}
|
||||
}).handle_exception([endpoint] (auto eptr) {
|
||||
manager_logger.error("Exception when draining {}: {}", endpoint, eptr);
|
||||
});
|
||||
|
||||
return make_ready_future<>();
|
||||
}
|
||||
}).handle_exception([endpoint] (auto eptr) {
|
||||
manager_logger.error("Exception when draining {}: {}", endpoint, eptr);
|
||||
});
|
||||
});
|
||||
}
|
||||
@@ -561,7 +536,6 @@ manager::end_point_hints_manager::sender::sender(end_point_hints_manager& parent
|
||||
, _resource_manager(_shard_manager._resource_manager)
|
||||
, _proxy(local_storage_proxy)
|
||||
, _db(local_db)
|
||||
, _hints_cpu_sched_group(_db.get_streaming_scheduling_group())
|
||||
, _gossiper(local_gossiper)
|
||||
, _file_update_mutex(_ep_manager.file_update_mutex())
|
||||
{}
|
||||
@@ -574,7 +548,6 @@ manager::end_point_hints_manager::sender::sender(const sender& other, end_point_
|
||||
, _resource_manager(_shard_manager._resource_manager)
|
||||
, _proxy(other._proxy)
|
||||
, _db(other._db)
|
||||
, _hints_cpu_sched_group(other._hints_cpu_sched_group)
|
||||
, _gossiper(other._gossiper)
|
||||
, _file_update_mutex(_ep_manager.file_update_mutex())
|
||||
{}
|
||||
@@ -630,10 +603,7 @@ manager::end_point_hints_manager::sender::clock::duration manager::end_point_hin
|
||||
}
|
||||
|
||||
void manager::end_point_hints_manager::sender::start() {
|
||||
seastar::thread_attributes attr;
|
||||
|
||||
attr.sched_group = _hints_cpu_sched_group;
|
||||
_stopped = seastar::async(std::move(attr), [this] {
|
||||
_stopped = seastar::async([this] {
|
||||
manager_logger.trace("ep_manager({})::sender: started", end_point_key());
|
||||
while (!stopping()) {
|
||||
try {
|
||||
@@ -653,11 +623,10 @@ void manager::end_point_hints_manager::sender::start() {
|
||||
});
|
||||
}
|
||||
|
||||
future<> manager::end_point_hints_manager::sender::send_one_mutation(frozen_mutation_and_schema m) {
|
||||
keyspace& ks = _db.find_keyspace(m.s->ks_name());
|
||||
future<> manager::end_point_hints_manager::sender::send_one_mutation(mutation m) {
|
||||
keyspace& ks = _db.find_keyspace(m.schema()->ks_name());
|
||||
auto& rs = ks.get_replication_strategy();
|
||||
auto token = dht::global_partitioner().get_token(*m.s, m.fm.key(*m.s));
|
||||
std::vector<gms::inet_address> natural_endpoints = rs.get_natural_endpoints(std::move(token));
|
||||
std::vector<gms::inet_address> natural_endpoints = rs.get_natural_endpoints(m.token());
|
||||
|
||||
return do_send_one_mutation(std::move(m), natural_endpoints);
|
||||
}
|
||||
@@ -675,8 +644,8 @@ future<> manager::end_point_hints_manager::sender::send_one_hint(lw_shared_ptr<s
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
auto m = this->get_mutation(ctx_ptr, buf);
|
||||
gc_clock::duration gc_grace_sec = m.s->gc_grace_seconds();
|
||||
mutation m = this->get_mutation(ctx_ptr, buf);
|
||||
gc_clock::duration gc_grace_sec = m.schema()->gc_grace_seconds();
|
||||
|
||||
// The hint is too old - drop it.
|
||||
//
|
||||
@@ -697,13 +666,10 @@ future<> manager::end_point_hints_manager::sender::send_one_hint(lw_shared_ptr<s
|
||||
// ignore these errors and move on - probably this hint is too old and the KS/CF has been deleted...
|
||||
} catch (no_such_column_family& e) {
|
||||
manager_logger.debug("send_hints(): no_such_column_family: {}", e.what());
|
||||
++this->shard_stats().discarded;
|
||||
} catch (no_such_keyspace& e) {
|
||||
manager_logger.debug("send_hints(): no_such_keyspace: {}", e.what());
|
||||
++this->shard_stats().discarded;
|
||||
} catch (no_column_mapping& e) {
|
||||
manager_logger.debug("send_hints(): {} at {}: {}", fname, rp, e.what());
|
||||
++this->shard_stats().discarded;
|
||||
manager_logger.debug("send_hints(): {}: {}", fname, e.what());
|
||||
}
|
||||
return make_ready_future<>();
|
||||
}).finally([units = std::move(units), ctx_ptr] {});
|
||||
@@ -717,10 +683,10 @@ future<> manager::end_point_hints_manager::sender::send_one_hint(lw_shared_ptr<s
|
||||
bool manager::end_point_hints_manager::sender::send_one_file(const sstring& fname) {
|
||||
timespec last_mod = get_last_file_modification(fname).get0();
|
||||
gc_clock::duration secs_since_file_mod = std::chrono::seconds(last_mod.tv_sec);
|
||||
lw_shared_ptr<send_one_file_ctx> ctx_ptr = make_lw_shared<send_one_file_ctx>(_last_schema_ver_to_column_mapping);
|
||||
lw_shared_ptr<send_one_file_ctx> ctx_ptr = make_lw_shared<send_one_file_ctx>();
|
||||
|
||||
try {
|
||||
auto s = commitlog::read_log_file(fname, service::get_local_streaming_read_priority(), [this, secs_since_file_mod, &fname, ctx_ptr] (temporary_buffer<char> buf, db::replay_position rp) mutable {
|
||||
auto s = commitlog::read_log_file(fname, [this, secs_since_file_mod, &fname, ctx_ptr] (temporary_buffer<char> buf, db::replay_position rp) mutable {
|
||||
// Check that we can still send the next hint. Don't try to send it if the destination host
|
||||
// is DOWN or if we have already failed to send some of the previous hints.
|
||||
if (!draining() && ctx_ptr->state.contains(send_state::segment_replay_failed)) {
|
||||
@@ -739,10 +705,6 @@ bool manager::end_point_hints_manager::sender::send_one_file(const sstring& fnam
|
||||
}, _last_not_complete_rp.pos, &_db.get_config().extensions()).get0();
|
||||
|
||||
s->done().get();
|
||||
} catch (db::commitlog::segment_error& ex) {
|
||||
manager_logger.error("{}: {}. Dropping...", fname, ex.what());
|
||||
ctx_ptr->state.remove(send_state::segment_replay_failed);
|
||||
++this->shard_stats().corrupted_files;
|
||||
} catch (...) {
|
||||
manager_logger.trace("sending of {} failed: {}", fname, std::current_exception());
|
||||
ctx_ptr->state.set(send_state::segment_replay_failed);
|
||||
@@ -778,7 +740,6 @@ bool manager::end_point_hints_manager::sender::send_one_file(const sstring& fnam
|
||||
|
||||
// clear the replay position - we are going to send the next segment...
|
||||
_last_not_complete_rp = replay_position();
|
||||
_last_schema_ver_to_column_mapping.clear();
|
||||
manager_logger.trace("send_one_file(): segment {} was sent in full and deleted", fname);
|
||||
return true;
|
||||
}
|
||||
@@ -791,7 +752,7 @@ void manager::end_point_hints_manager::sender::send_hints_maybe() noexcept {
|
||||
int replayed_segments_count = 0;
|
||||
|
||||
try {
|
||||
while (replay_allowed() && have_segments()) {
|
||||
while (have_segments()) {
|
||||
if (!send_one_file(*_segments_to_replay.begin())) {
|
||||
break;
|
||||
}
|
||||
@@ -816,173 +777,5 @@ void manager::end_point_hints_manager::sender::send_hints_maybe() noexcept {
|
||||
manager_logger.trace("send_hints(): we handled {} segments", replayed_segments_count);
|
||||
}
|
||||
|
||||
template<typename Func>
|
||||
static future<> scan_for_hints_dirs(const sstring& hints_directory, Func&& f) {
|
||||
return lister::scan_dir(hints_directory, { directory_entry_type::directory }, [f = std::forward<Func>(f)] (lister::path dir, directory_entry de) {
|
||||
try {
|
||||
return f(std::move(dir), std::move(de), std::stoi(de.name.c_str()));
|
||||
} catch (std::invalid_argument& ex) {
|
||||
manager_logger.debug("Ignore invalid directory {}", de.name);
|
||||
return make_ready_future<>();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// runs in seastar::async context
|
||||
manager::hints_segments_map manager::get_current_hints_segments(const sstring& hints_directory) {
|
||||
hints_segments_map current_hints_segments;
|
||||
|
||||
// shards level
|
||||
scan_for_hints_dirs(hints_directory, [¤t_hints_segments] (lister::path dir, directory_entry de, unsigned shard_id) {
|
||||
manager_logger.trace("shard_id = {}", shard_id);
|
||||
// IPs level
|
||||
return lister::scan_dir(dir / de.name.c_str(), { directory_entry_type::directory }, [¤t_hints_segments, shard_id] (lister::path dir, directory_entry de) {
|
||||
manager_logger.trace("\tIP: {}", de.name);
|
||||
// hints files
|
||||
return lister::scan_dir(dir / de.name.c_str(), { directory_entry_type::regular }, [¤t_hints_segments, shard_id, ep_addr = de.name] (lister::path dir, directory_entry de) {
|
||||
manager_logger.trace("\t\tfile: {}", de.name);
|
||||
current_hints_segments[ep_addr][shard_id].emplace_back(dir / de.name.c_str());
|
||||
return make_ready_future<>();
|
||||
});
|
||||
});
|
||||
}).get();
|
||||
|
||||
return current_hints_segments;
|
||||
}
|
||||
|
||||
// runs in seastar::async context
|
||||
void manager::rebalance_segments(const sstring& hints_directory, hints_segments_map& segments_map) {
|
||||
// Count how many hints segments to each destination we have.
|
||||
std::unordered_map<sstring, size_t> per_ep_hints;
|
||||
for (auto& ep_info : segments_map) {
|
||||
per_ep_hints[ep_info.first] = boost::accumulate(ep_info.second | boost::adaptors::map_values | boost::adaptors::transformed(std::mem_fn(&std::list<lister::path>::size)), 0);
|
||||
manager_logger.trace("{}: total files: {}", ep_info.first, per_ep_hints[ep_info.first]);
|
||||
}
|
||||
|
||||
// Create a map of lists of segments that we will move (for each destination end point): if a shard has segments
|
||||
// then we will NOT move q = int(N/S) segments out of them, where N is a total number of segments to the current
|
||||
// destination and S is a current number of shards.
|
||||
std::unordered_map<sstring, std::list<lister::path>> segments_to_move;
|
||||
for (auto& [ep, ep_segments] : segments_map) {
|
||||
size_t q = per_ep_hints[ep] / smp::count;
|
||||
auto& current_segments_to_move = segments_to_move[ep];
|
||||
|
||||
for (auto& [shard_id, shard_segments] : ep_segments) {
|
||||
// Move all segments from the shards that are no longer relevant (re-sharding to the lower number of shards)
|
||||
if (shard_id >= smp::count) {
|
||||
current_segments_to_move.splice(current_segments_to_move.end(), shard_segments);
|
||||
} else if (shard_segments.size() > q) {
|
||||
current_segments_to_move.splice(current_segments_to_move.end(), shard_segments, std::next(shard_segments.begin(), q), shard_segments.end());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Since N (a total number of segments to a specific destination) may be not a multiple of S (a current number of
|
||||
// shards) we will distribute files in two passes:
|
||||
// * if N = S * q + r, then
|
||||
// * one pass for segments_per_shard = q
|
||||
// * another one for segments_per_shard = q + 1.
|
||||
//
|
||||
// This way we will ensure as close to the perfect distribution as possible.
|
||||
//
|
||||
// Right till this point we haven't moved any segments. However we have created a logical separation of segments
|
||||
// into two groups:
|
||||
// * Segments that are not going to be moved: segments in the segments_map.
|
||||
// * Segments that are going to be moved: segments in the segments_to_move.
|
||||
//
|
||||
// rebalance_segments_for() is going to consume segments from segments_to_move and move them to corresponding
|
||||
// lists in the segments_map AND actually move segments to the corresponding shard's sub-directory till the requested
|
||||
// segments_per_shard level is reached (see more details in the description of rebalance_segments_for()).
|
||||
for (auto& [ep, N] : per_ep_hints) {
|
||||
size_t q = N / smp::count;
|
||||
size_t r = N - q * smp::count;
|
||||
auto& current_segments_to_move = segments_to_move[ep];
|
||||
auto& current_segments_map = segments_map[ep];
|
||||
|
||||
if (q) {
|
||||
rebalance_segments_for(ep, q, hints_directory, current_segments_map, current_segments_to_move);
|
||||
}
|
||||
|
||||
if (r) {
|
||||
rebalance_segments_for(ep, q + 1, hints_directory, current_segments_map, current_segments_to_move);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// runs in seastar::async context
|
||||
void manager::rebalance_segments_for(
|
||||
const sstring& ep,
|
||||
size_t segments_per_shard,
|
||||
const sstring& hints_directory,
|
||||
hints_ep_segments_map& ep_segments,
|
||||
std::list<lister::path>& segments_to_move)
|
||||
{
|
||||
manager_logger.trace("{}: segments_per_shard: {}, total number of segments to move: {}", ep, segments_per_shard, segments_to_move.size());
|
||||
|
||||
// sanity check
|
||||
if (segments_to_move.empty() || !segments_per_shard) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < smp::count && !segments_to_move.empty(); ++i) {
|
||||
lister::path shard_path_dir(lister::path(hints_directory.c_str()) / seastar::format("{:d}", i).c_str() / ep.c_str());
|
||||
std::list<lister::path>& current_shard_segments = ep_segments[i];
|
||||
|
||||
// Make sure that the shard_path_dir exists and if not - create it
|
||||
io_check(recursive_touch_directory, shard_path_dir.c_str()).get();
|
||||
|
||||
while (current_shard_segments.size() < segments_per_shard && !segments_to_move.empty()) {
|
||||
auto seg_path_it = segments_to_move.begin();
|
||||
lister::path new_path(shard_path_dir / seg_path_it->filename());
|
||||
|
||||
// Don't move the file to the same location - it's pointless.
|
||||
if (*seg_path_it != new_path) {
|
||||
manager_logger.trace("going to move: {} -> {}", *seg_path_it, new_path);
|
||||
io_check(rename_file, seg_path_it->native(), new_path.native()).get();
|
||||
} else {
|
||||
manager_logger.trace("skipping: {}", *seg_path_it);
|
||||
}
|
||||
current_shard_segments.splice(current_shard_segments.end(), segments_to_move, seg_path_it, std::next(seg_path_it));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// runs in seastar::async context
|
||||
void manager::remove_irrelevant_shards_directories(const sstring& hints_directory) {
|
||||
// shards level
|
||||
scan_for_hints_dirs(hints_directory, [] (lister::path dir, directory_entry de, unsigned shard_id) {
|
||||
if (shard_id >= smp::count) {
|
||||
// IPs level
|
||||
return lister::scan_dir(dir / de.name.c_str(), { directory_entry_type::directory, directory_entry_type::regular }, lister::show_hidden::yes, [] (lister::path dir, directory_entry de) {
|
||||
return io_check(remove_file, (dir / de.name.c_str()).native());
|
||||
}).then([shard_base_dir = dir, shard_entry = de] {
|
||||
return io_check(remove_file, (shard_base_dir / shard_entry.name.c_str()).native());
|
||||
});
|
||||
}
|
||||
return make_ready_future<>();
|
||||
}).get();
|
||||
}
|
||||
|
||||
/// \brief Rebalance the hints segments found under \p hints_directory among the current shards.
///
/// The whole procedure runs inside a seastar::async() block and performs three steps in order:
/// scan, move, clean up.
///
/// \param hints_directory root hints directory
/// \return future that resolves when the seastar::async() block completes
future<> manager::rebalance(sstring hints_directory) {
    return seastar::async([dir = std::move(hints_directory)] {
        // 1. Find out which hints segments exist at the moment.
        hints_segments_map segments = get_current_hints_segments(dir);

        // 2. Shuffle the segments so that the files are spread evenly among all present shards.
        rebalance_segments(dir, segments);

        // 3. Drop the directories of the shards that are gone - they should hold no segments by now.
        remove_irrelevant_shards_directories(dir);
    });
}
|
||||
|
||||
void manager::update_backlog(size_t backlog, size_t max_backlog) {
|
||||
if (backlog < max_backlog) {
|
||||
allow_hints();
|
||||
} else {
|
||||
forbid_hints_for_eps_with_pending_hints();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -31,7 +31,6 @@
|
||||
#include <seastar/core/timer.hh>
|
||||
#include <seastar/core/lowres_clock.hh>
|
||||
#include <seastar/core/shared_mutex.hh>
|
||||
#include "lister.hh"
|
||||
#include "gms/gossiper.hh"
|
||||
#include "locator/snitch_base.hh"
|
||||
#include "service/endpoint_lifecycle_subscriber.hh"
|
||||
@@ -59,20 +58,11 @@ private:
|
||||
uint64_t errors = 0;
|
||||
uint64_t dropped = 0;
|
||||
uint64_t sent = 0;
|
||||
uint64_t discarded = 0;
|
||||
uint64_t corrupted_files = 0;
|
||||
};
|
||||
|
||||
// map: shard -> segments
|
||||
using hints_ep_segments_map = std::unordered_map<unsigned, std::list<lister::path>>;
|
||||
// map: IP -> map: shard -> segments
|
||||
using hints_segments_map = std::unordered_map<sstring, hints_ep_segments_map>;
|
||||
|
||||
class drain_tag {};
|
||||
using drain = seastar::bool_class<drain_tag>;
|
||||
|
||||
friend class space_watchdog;
|
||||
|
||||
public:
|
||||
class end_point_hints_manager {
|
||||
public:
|
||||
@@ -104,10 +94,7 @@ public:
|
||||
send_state::restart_segment>>;
|
||||
|
||||
struct send_one_file_ctx {
|
||||
send_one_file_ctx(std::unordered_map<table_schema_version, column_mapping>& last_schema_ver_to_column_mapping)
|
||||
: schema_ver_to_column_mapping(last_schema_ver_to_column_mapping)
|
||||
{}
|
||||
std::unordered_map<table_schema_version, column_mapping>& schema_ver_to_column_mapping;
|
||||
std::unordered_map<table_schema_version, column_mapping> schema_ver_to_column_mapping;
|
||||
seastar::gate file_send_gate;
|
||||
std::unordered_set<db::replay_position> rps_set; // number of elements in this set is never going to be greater than the maximum send queue length
|
||||
send_state_set state;
|
||||
@@ -116,7 +103,6 @@ public:
|
||||
private:
|
||||
std::list<sstring> _segments_to_replay;
|
||||
replay_position _last_not_complete_rp;
|
||||
std::unordered_map<table_schema_version, column_mapping> _last_schema_ver_to_column_mapping;
|
||||
state_set _state;
|
||||
future<> _stopped;
|
||||
clock::time_point _next_flush_tp;
|
||||
@@ -127,7 +113,6 @@ public:
|
||||
resource_manager& _resource_manager;
|
||||
service::storage_proxy& _proxy;
|
||||
database& _db;
|
||||
seastar::scheduling_group _hints_cpu_sched_group;
|
||||
gms::gossiper& _gossiper;
|
||||
seastar::shared_mutex& _file_update_mutex;
|
||||
|
||||
@@ -188,10 +173,6 @@ public:
|
||||
return _state.contains(state::stopping);
|
||||
}
|
||||
|
||||
bool replay_allowed() const noexcept {
|
||||
return _ep_manager.replay_allowed();
|
||||
}
|
||||
|
||||
/// \brief Try to send one hint read from the file.
|
||||
/// - Limit the maximum memory size of hints "in the air" and the maximum total number of hints "in the air".
|
||||
/// - Discard the hints that are older than the grace seconds value of the corresponding table.
|
||||
@@ -223,7 +204,7 @@ public:
|
||||
/// \param ctx_ptr pointer to the send context
|
||||
/// \param buf hints file entry
|
||||
/// \return The mutation object representing the original mutation stored in the hints file.
|
||||
frozen_mutation_and_schema get_mutation(lw_shared_ptr<send_one_file_ctx> ctx_ptr, temporary_buffer<char>& buf);
|
||||
mutation get_mutation(lw_shared_ptr<send_one_file_ctx> ctx_ptr, temporary_buffer<char>& buf);
|
||||
|
||||
/// \brief Get a reference to the column_mapping object for a given frozen mutation.
|
||||
/// \param ctx_ptr pointer to the send context
|
||||
@@ -240,13 +221,13 @@ public:
|
||||
/// \param m mutation to send
|
||||
/// \param natural_endpoints current replicas for the given mutation
|
||||
/// \return future that resolves when the operation is complete
|
||||
future<> do_send_one_mutation(frozen_mutation_and_schema m, const std::vector<gms::inet_address>& natural_endpoints) noexcept;
|
||||
future<> do_send_one_mutation(mutation m, const std::vector<gms::inet_address>& natural_endpoints) noexcept;
|
||||
|
||||
/// \brief Send one mutation out.
|
||||
///
|
||||
/// \param m mutation to send
|
||||
/// \return future that resolves when the mutation sending processing is complete.
|
||||
future<> send_one_mutation(frozen_mutation_and_schema m);
|
||||
future<> send_one_mutation(mutation m);
|
||||
|
||||
/// \brief Get the last modification time stamp for a given file.
|
||||
/// \param fname File name
|
||||
@@ -275,8 +256,7 @@ public:
|
||||
manager& _shard_manager;
|
||||
hints_store_ptr _hints_store_anchor;
|
||||
seastar::gate _store_gate;
|
||||
lw_shared_ptr<seastar::shared_mutex> _file_update_mutex_ptr;
|
||||
seastar::shared_mutex& _file_update_mutex;
|
||||
seastar::shared_mutex _file_update_mutex;
|
||||
|
||||
enum class state {
|
||||
can_hint, // hinting is currently allowed (used by the space_watchdog)
|
||||
@@ -342,10 +322,6 @@ public:
|
||||
return _hints_in_progress;
|
||||
}
|
||||
|
||||
bool replay_allowed() const noexcept {
|
||||
return _shard_manager.replay_allowed();
|
||||
}
|
||||
|
||||
bool can_hint() const noexcept {
|
||||
return _state.contains(state::can_hint);
|
||||
}
|
||||
@@ -378,20 +354,8 @@ public:
|
||||
return _state.contains(state::stopped);
|
||||
}
|
||||
|
||||
/// \brief Safely runs a given functor under the file_update_mutex of \ref ep_man
|
||||
///
|
||||
/// Runs a given functor under the file_update_mutex of the given end_point_hints_manager instance.
|
||||
/// This function is safe even if \ref ep_man gets destroyed before the future this function returns resolves
|
||||
/// (as long as the \ref func call itself is safe).
|
||||
///
|
||||
/// \tparam Func Functor type.
|
||||
/// \param ep_man end_point_hints_manager instance which file_update_mutex we want to lock.
|
||||
/// \param func Functor to run under the lock.
|
||||
/// \return Whatever \ref func returns.
|
||||
template <typename Func>
|
||||
friend inline auto with_file_update_mutex(end_point_hints_manager& ep_man, Func&& func) {
|
||||
lw_shared_ptr<seastar::shared_mutex> lock_ptr = ep_man._file_update_mutex_ptr;
|
||||
return with_lock(*lock_ptr, std::forward<Func>(func)).finally([lock_ptr] {});
|
||||
seastar::shared_mutex& file_update_mutex() {
|
||||
return _file_update_mutex;
|
||||
}
|
||||
|
||||
const boost::filesystem::path& hints_dir() const noexcept {
|
||||
@@ -399,10 +363,6 @@ public:
|
||||
}
|
||||
|
||||
private:
|
||||
seastar::shared_mutex& file_update_mutex() noexcept {
|
||||
return _file_update_mutex;
|
||||
}
|
||||
|
||||
/// \brief Creates a new hints store object.
|
||||
///
|
||||
/// - Creates a hints store directory if doesn't exist: <shard_hints_dir>/<ep_key>
|
||||
@@ -427,17 +387,6 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
enum class state {
|
||||
started, // hinting is currently allowed (start() call is complete)
|
||||
replay_allowed, // replaying (hints sending) is allowed
|
||||
stopping // hinting is not allowed - stopping is in progress (stop() method has been called)
|
||||
};
|
||||
|
||||
using state_set = enum_set<super_enum<state,
|
||||
state::started,
|
||||
state::replay_allowed,
|
||||
state::stopping>>;
|
||||
|
||||
private:
|
||||
using ep_key_type = typename end_point_hints_manager::key_type;
|
||||
using ep_managers_map_type = std::unordered_map<ep_key_type, end_point_hints_manager>;
|
||||
@@ -448,7 +397,6 @@ public:
|
||||
static const std::chrono::seconds hint_file_write_timeout;
|
||||
|
||||
private:
|
||||
state_set _state;
|
||||
const boost::filesystem::path _hints_dir;
|
||||
dev_t _hints_dir_device_id = 0;
|
||||
|
||||
@@ -460,7 +408,7 @@ private:
|
||||
locator::snitch_ptr& _local_snitch_ptr;
|
||||
int64_t _max_hint_window_us = 0;
|
||||
database& _local_db;
|
||||
|
||||
bool _stopping = false;
|
||||
seastar::gate _draining_eps_gate; // gate used to control the progress of ep_managers stopping not in the context of manager::stop() call
|
||||
|
||||
resource_manager& _resource_manager;
|
||||
@@ -469,13 +417,10 @@ private:
|
||||
stats _stats;
|
||||
seastar::metrics::metric_groups _metrics;
|
||||
std::unordered_set<ep_key_type> _eps_with_pending_hints;
|
||||
seastar::semaphore _drain_lock = {1};
|
||||
|
||||
public:
|
||||
manager(sstring hints_directory, std::vector<sstring> hinted_dcs, int64_t max_hint_window_ms, resource_manager&res_manager, distributed<database>& db);
|
||||
virtual ~manager();
|
||||
manager(manager&&) = delete;
|
||||
manager& operator=(manager&&) = delete;
|
||||
void register_metrics(const sstring& group_name);
|
||||
future<> start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr<gms::gossiper> gossiper_ptr, shared_ptr<service::storage_service> ss_ptr);
|
||||
future<> stop();
|
||||
@@ -548,32 +493,15 @@ public:
|
||||
return _hints_dir_device_id;
|
||||
}
|
||||
|
||||
seastar::semaphore& drain_lock() noexcept {
|
||||
return _drain_lock;
|
||||
}
|
||||
|
||||
void allow_hints();
|
||||
void forbid_hints();
|
||||
void forbid_hints_for_eps_with_pending_hints();
|
||||
|
||||
void allow_replaying() noexcept {
|
||||
_state.set(state::replay_allowed);
|
||||
}
|
||||
|
||||
/// \brief Rebalance hints segments among all present shards.
|
||||
///
|
||||
/// The difference between the number of segments on every two shard will be not greater than 1 after the
|
||||
/// rebalancing.
|
||||
///
|
||||
/// Removes the sub-directories of \ref hints_directory that correspond to shards that are not relevant any more
|
||||
/// (re-sharding to a lower shards number case).
|
||||
///
|
||||
/// Complexity: O(N+K), where N is a total number of present hints' segments and
|
||||
/// K = <number of shards during the previous boot> * <number of end points for which hints where ever created>
|
||||
///
|
||||
/// \param hints_directory A hints directory to rebalance
|
||||
/// \return A future that resolves when the operation is complete.
|
||||
static future<> rebalance(sstring hints_directory);
|
||||
static future<> rebalance() {
|
||||
// TODO
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
virtual void on_join_cluster(const gms::inet_address& endpoint) override {}
|
||||
virtual void on_leave_cluster(const gms::inet_address& endpoint) override {
|
||||
@@ -581,68 +509,11 @@ public:
|
||||
};
|
||||
virtual void on_up(const gms::inet_address& endpoint) override {}
|
||||
virtual void on_down(const gms::inet_address& endpoint) override {}
|
||||
virtual void on_move(const gms::inet_address& endpoint) override {}
|
||||
|
||||
private:
|
||||
future<> compute_hints_dir_device_id();
|
||||
|
||||
/// \brief Scan the given hints directory and build the map of all present hints segments.
|
||||
///
|
||||
/// Complexity: O(N+K), where N is a total number of present hints' segments and
|
||||
/// K = <number of shards during the previous boot> * <number of end points for which hints where ever created>
|
||||
///
|
||||
/// \note Should be called from a seastar::thread context.
|
||||
///
|
||||
/// \param hints_directory directory to scan
|
||||
/// \return a map: ep -> map: shard -> segments (full paths)
|
||||
static hints_segments_map get_current_hints_segments(const sstring& hints_directory);
|
||||
|
||||
/// \brief Rebalance hints segments for a given (destination) end point
|
||||
///
|
||||
/// This method is going to consume files from the \ref segments_to_move and distribute them between the present
|
||||
/// shards (taking into an account the \ref ep_segments state - there may be zero or more segments that belong to a
|
||||
/// particular shard in it) until we either achieve the requested \ref segments_per_shard level on each shard
|
||||
/// or until we are out of files to move.
|
||||
///
|
||||
/// As a result (in addition to the actual state on the disk) both \ref ep_segments and \ref segments_to_move are going
|
||||
/// to be modified.
|
||||
///
|
||||
/// Complexity: O(N), where N is a total number of present hints' segments for the \ref ep end point (as a destination).
|
||||
///
|
||||
/// \note Should be called from a seastar::thread context.
|
||||
///
|
||||
/// \param ep destination end point ID (a string with its IP address)
|
||||
/// \param segments_per_shard number of hints segments per-shard we want to achieve
|
||||
/// \param hints_directory a root hints directory
|
||||
/// \param ep_segments a map that was originally built by get_current_hints_segments() for this end point
|
||||
/// \param segments_to_move a list of segments we are allowed to move
|
||||
static void rebalance_segments_for(
|
||||
const sstring& ep,
|
||||
size_t segments_per_shard,
|
||||
const sstring& hints_directory,
|
||||
hints_ep_segments_map& ep_segments,
|
||||
std::list<lister::path>& segments_to_move);
|
||||
|
||||
/// \brief Rebalance all present hints segments.
|
||||
///
|
||||
/// The difference between the number of segments on every two shard will be not greater than 1 after the
|
||||
/// rebalancing.
|
||||
///
|
||||
/// Complexity: O(N), where N is a total number of present hints' segments.
|
||||
///
|
||||
/// \note Should be called from a seastar::thread context.
|
||||
///
|
||||
/// \param hints_directory a root hints directory
|
||||
/// \param segments_map a map that was built by get_current_hints_segments()
|
||||
static void rebalance_segments(const sstring& hints_directory, hints_segments_map& segments_map);
|
||||
|
||||
/// \brief Remove sub-directories of shards that are not relevant any more (re-sharding to a lower number of shards case).
|
||||
///
|
||||
/// Complexity: O(S*E), where S is a number of shards during the previous boot and
|
||||
/// E is a number of end points for which hints where ever created.
|
||||
///
|
||||
/// \param hints_directory a root hints directory
|
||||
static void remove_irrelevant_shards_directories(const sstring& hints_directory);
|
||||
|
||||
node_to_hint_store_factory_type& store_factory() noexcept {
|
||||
return _store_factory;
|
||||
}
|
||||
@@ -673,28 +544,6 @@ private:
|
||||
/// \param endpoint node that left the cluster
|
||||
void drain_for(gms::inet_address endpoint);
|
||||
|
||||
void update_backlog(size_t backlog, size_t max_backlog);
|
||||
|
||||
bool stopping() const noexcept {
|
||||
return _state.contains(state::stopping);
|
||||
}
|
||||
|
||||
void set_stopping() noexcept {
|
||||
_state.set(state::stopping);
|
||||
}
|
||||
|
||||
bool started() const noexcept {
|
||||
return _state.contains(state::started);
|
||||
}
|
||||
|
||||
void set_started() noexcept {
|
||||
_state.set(state::started);
|
||||
}
|
||||
|
||||
bool replay_allowed() const noexcept {
|
||||
return _state.contains(state::replay_allowed);
|
||||
}
|
||||
|
||||
public:
|
||||
ep_managers_map_type::iterator find_ep_manager(ep_key_type ep_key) noexcept {
|
||||
return _ep_managers.find(ep_key);
|
||||
|
||||
@@ -27,7 +27,6 @@
|
||||
#include "lister.hh"
|
||||
#include "disk-error-handler.hh"
|
||||
#include "seastarx.hh"
|
||||
#include <seastar/core/sleep.hh>
|
||||
|
||||
namespace db {
|
||||
namespace hints {
|
||||
@@ -66,111 +65,112 @@ const std::chrono::seconds space_watchdog::_watchdog_period = std::chrono::secon
|
||||
space_watchdog::space_watchdog(shard_managers_set& managers, per_device_limits_map& per_device_limits_map)
|
||||
: _shard_managers(managers)
|
||||
, _per_device_limits_map(per_device_limits_map)
|
||||
, _timer([this] { on_timer(); })
|
||||
{}
|
||||
|
||||
void space_watchdog::start() {
|
||||
_started = seastar::async([this] {
|
||||
while (!_as.abort_requested()) {
|
||||
try {
|
||||
on_timer();
|
||||
} catch (...) {
|
||||
resource_manager_logger.trace("space_watchdog: unexpected exception - stop all hints generators");
|
||||
// Stop all hint generators if space_watchdog callback failed
|
||||
for (manager& shard_manager : _shard_managers) {
|
||||
shard_manager.forbid_hints();
|
||||
}
|
||||
}
|
||||
seastar::sleep_abortable(_watchdog_period, _as).get();
|
||||
}
|
||||
}).handle_exception_type([] (const seastar::sleep_aborted& ignored) { });
|
||||
_timer.arm(timer_clock_type::now());
|
||||
}
|
||||
|
||||
future<> space_watchdog::stop() noexcept {
|
||||
_as.request_abort();
|
||||
return std::move(_started);
|
||||
try {
|
||||
return _gate.close().finally([this] { _timer.cancel(); });
|
||||
} catch (...) {
|
||||
return make_exception_future<>(std::current_exception());
|
||||
}
|
||||
}
|
||||
|
||||
// Called under the end_point_hints_manager::file_update_mutex() of the corresponding end_point_hints_manager instance.
|
||||
future<> space_watchdog::scan_one_ep_dir(boost::filesystem::path path, manager& shard_manager, ep_key_type ep_key) {
|
||||
return do_with(std::move(path), [this, ep_key, &shard_manager] (boost::filesystem::path& path) {
|
||||
// It may happen that we get here and the directory has already been deleted in the context of manager::drain_for().
|
||||
// In this case simply bail out.
|
||||
return engine().file_exists(path.native()).then([this, ep_key, &shard_manager, &path] (bool exists) {
|
||||
if (!exists) {
|
||||
return make_ready_future<>();
|
||||
} else {
|
||||
return lister::scan_dir(path, { directory_entry_type::regular }, [this, ep_key, &shard_manager] (lister::path dir, directory_entry de) {
|
||||
// Put the current end point ID to state.eps_with_pending_hints when we see the second hints file in its directory
|
||||
if (_files_count == 1) {
|
||||
shard_manager.add_ep_with_pending_hints(ep_key);
|
||||
}
|
||||
++_files_count;
|
||||
return lister::scan_dir(path, { directory_entry_type::regular }, [this, ep_key, &shard_manager] (lister::path dir, directory_entry de) {
|
||||
// Put the current end point ID to state.eps_with_pending_hints when we see the second hints file in its directory
|
||||
if (_files_count == 1) {
|
||||
shard_manager.add_ep_with_pending_hints(ep_key);
|
||||
}
|
||||
++_files_count;
|
||||
|
||||
return io_check(file_size, (dir / de.name.c_str()).c_str()).then([this] (uint64_t fsize) {
|
||||
_total_size += fsize;
|
||||
});
|
||||
});
|
||||
}
|
||||
return io_check(file_size, (dir / de.name.c_str()).c_str()).then([this] (uint64_t fsize) {
|
||||
_total_size += fsize;
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
// Called from the context of a seastar::thread.
|
||||
void space_watchdog::on_timer() {
|
||||
// The hints directories are organized as follows:
|
||||
// <hints root>
|
||||
// |- <shard1 ID>
|
||||
// | |- <EP1 address>
|
||||
// | |- <hints file1>
|
||||
// | |- <hints file2>
|
||||
// | |- ...
|
||||
// | |- <EP2 address>
|
||||
// | |- ...
|
||||
// | |-...
|
||||
// |- <shard2 ID>
|
||||
// | |- ...
|
||||
// ...
|
||||
// |- <shardN ID>
|
||||
// | |- ...
|
||||
//
|
||||
with_gate(_gate, [this] {
|
||||
return futurize_apply([this] {
|
||||
_total_size = 0;
|
||||
|
||||
for (auto& per_device_limits : _per_device_limits_map | boost::adaptors::map_values) {
|
||||
_total_size = 0;
|
||||
for (manager& shard_manager : per_device_limits.managers) {
|
||||
shard_manager.clear_eps_with_pending_hints();
|
||||
lister::scan_dir(shard_manager.hints_dir(), {directory_entry_type::directory}, [this, &shard_manager] (lister::path dir, directory_entry de) {
|
||||
_files_count = 0;
|
||||
// Let's scan per-end-point directories and enumerate hints files...
|
||||
return do_for_each(_shard_managers, [this] (manager& shard_manager) {
|
||||
shard_manager.clear_eps_with_pending_hints();
|
||||
|
||||
// The hints directories are organized as follows:
|
||||
// <hints root>
|
||||
// |- <shard1 ID>
|
||||
// | |- <EP1 address>
|
||||
// | |- <hints file1>
|
||||
// | |- <hints file2>
|
||||
// | |- ...
|
||||
// | |- <EP2 address>
|
||||
// | |- ...
|
||||
// | |-...
|
||||
// |- <shard2 ID>
|
||||
// | |- ...
|
||||
// ...
|
||||
// |- <shardN ID>
|
||||
// | |- ...
|
||||
//
|
||||
// Let's check if there is a corresponding end point manager (may not exist if the corresponding DC is
|
||||
// not hintable).
|
||||
// If exists - let's take a file update lock so that files are not changed under our feet. Otherwise, simply
|
||||
// continue to enumeration - there is no one to change them.
|
||||
auto it = shard_manager.find_ep_manager(de.name);
|
||||
if (it != shard_manager.ep_managers_end()) {
|
||||
return with_file_update_mutex(it->second, [this, &shard_manager, dir = std::move(dir), ep_name = std::move(de.name)] () mutable {
|
||||
return scan_one_ep_dir(dir / ep_name.c_str(), shard_manager, ep_key_type(ep_name));
|
||||
return lister::scan_dir(shard_manager.hints_dir(), {directory_entry_type::directory}, [this, &shard_manager] (lister::path dir, directory_entry de) {
|
||||
_files_count = 0;
|
||||
// Let's scan per-end-point directories and enumerate hints files...
|
||||
//
|
||||
// Let's check if there is a corresponding end point manager (may not exist if the corresponding DC is
|
||||
// not hintable).
|
||||
// If exists - let's take a file update lock so that files are not changed under our feet. Otherwise, simply
|
||||
// continue to enumeration - there is no one to change them.
|
||||
auto it = shard_manager.find_ep_manager(de.name);
|
||||
if (it != shard_manager.ep_managers_end()) {
|
||||
return with_lock(it->second.file_update_mutex(), [this, &shard_manager, dir = std::move(dir), ep_name = std::move(de.name)]() mutable {
|
||||
return scan_one_ep_dir(dir / ep_name.c_str(), shard_manager, ep_key_type(ep_name));
|
||||
});
|
||||
} else {
|
||||
return scan_one_ep_dir(dir / de.name.c_str(), shard_manager, ep_key_type(de.name));
|
||||
}
|
||||
});
|
||||
}).then([this] {
|
||||
return do_for_each(_per_device_limits_map, [this](per_device_limits_map::value_type& per_device_limits_entry) {
|
||||
space_watchdog::per_device_limits& per_device_limits = per_device_limits_entry.second;
|
||||
|
||||
size_t adjusted_quota = 0;
|
||||
size_t delta = boost::accumulate(per_device_limits.managers, 0, [] (size_t sum, manager& shard_manager) {
|
||||
return sum + shard_manager.ep_managers_size() * resource_manager::hint_segment_size_in_mb * 1024 * 1024;
|
||||
});
|
||||
} else {
|
||||
return scan_one_ep_dir(dir / de.name.c_str(), shard_manager, ep_key_type(de.name));
|
||||
}
|
||||
}).get();
|
||||
}
|
||||
if (per_device_limits.max_shard_disk_space_size > delta) {
|
||||
adjusted_quota = per_device_limits.max_shard_disk_space_size - delta;
|
||||
}
|
||||
|
||||
// Adjust the quota to take into account the space we guarantee to every end point manager
|
||||
size_t adjusted_quota = 0;
|
||||
size_t delta = boost::accumulate(per_device_limits.managers, 0, [] (size_t sum, manager& shard_manager) {
|
||||
return sum + shard_manager.ep_managers_size() * resource_manager::hint_segment_size_in_mb * 1024 * 1024;
|
||||
});
|
||||
if (per_device_limits.max_shard_disk_space_size > delta) {
|
||||
adjusted_quota = per_device_limits.max_shard_disk_space_size - delta;
|
||||
}
|
||||
bool can_hint = _total_size < adjusted_quota;
|
||||
resource_manager_logger.trace("space_watchdog: total_size ({}) {} max_shard_disk_space_size ({})", _total_size, can_hint ? "<" : ">=", adjusted_quota);
|
||||
|
||||
resource_manager_logger.trace("space_watchdog: consuming {}/{} bytes", _total_size, adjusted_quota);
|
||||
for (manager& shard_manager : per_device_limits.managers) {
|
||||
shard_manager.update_backlog(_total_size, adjusted_quota);
|
||||
}
|
||||
if (!can_hint) {
|
||||
for (manager& shard_manager : per_device_limits.managers) {
|
||||
shard_manager.forbid_hints_for_eps_with_pending_hints();
|
||||
}
|
||||
} else {
|
||||
for (manager& shard_manager : per_device_limits.managers) {
|
||||
shard_manager.allow_hints();
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
||||
}).handle_exception([this] (auto eptr) {
|
||||
resource_manager_logger.trace("space_watchdog: unexpected exception - stop all hints generators");
|
||||
// Stop all hint generators if space_watchdog callback failed
|
||||
for (manager& shard_manager : _shard_managers) {
|
||||
shard_manager.forbid_hints();
|
||||
}
|
||||
}).finally([this] {
|
||||
_timer.arm(_watchdog_period);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
future<> resource_manager::start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr<gms::gossiper> gossiper_ptr, shared_ptr<service::storage_service> ss_ptr) {
|
||||
@@ -183,10 +183,6 @@ future<> resource_manager::start(shared_ptr<service::storage_proxy> proxy_ptr, s
|
||||
});
|
||||
}
|
||||
|
||||
void resource_manager::allow_replaying() noexcept {
|
||||
boost::for_each(_shard_managers, [] (manager& m) { m.allow_replaying(); });
|
||||
}
|
||||
|
||||
future<> resource_manager::stop() noexcept {
|
||||
return parallel_for_each(_shard_managers, [](manager& m) {
|
||||
return m.stop();
|
||||
@@ -205,18 +201,14 @@ future<> resource_manager::prepare_per_device_limits() {
|
||||
auto it = _per_device_limits_map.find(device_id);
|
||||
if (it == _per_device_limits_map.end()) {
|
||||
return is_mountpoint(shard_manager.hints_dir().parent_path()).then([this, device_id, &shard_manager](bool is_mountpoint) {
|
||||
auto [it, inserted] = _per_device_limits_map.emplace(device_id, space_watchdog::per_device_limits{});
|
||||
// Since we possibly deferred, we need to recheck the _per_device_limits_map.
|
||||
if (inserted) {
|
||||
// By default, give each group of managers 10% of the available disk space. Give each shard an equal share of the available space.
|
||||
it->second.max_shard_disk_space_size = boost::filesystem::space(shard_manager.hints_dir().c_str()).capacity / (10 * smp::count);
|
||||
// If hints directory is a mountpoint, we assume it's on dedicated (i.e. not shared with data/commitlog/etc) storage.
|
||||
// Then, reserve 90% of all space instead of 10% above.
|
||||
if (is_mountpoint) {
|
||||
it->second.max_shard_disk_space_size *= 9;
|
||||
}
|
||||
// By default, give each group of managers 10% of the available disk space. Give each shard an equal share of the available space.
|
||||
size_t max_size = boost::filesystem::space(shard_manager.hints_dir().c_str()).capacity / (10 * smp::count);
|
||||
// If hints directory is a mountpoint, we assume it's on dedicated (i.e. not shared with data/commitlog/etc) storage.
|
||||
// Then, reserve 90% of all space instead of 10% above.
|
||||
if (is_mountpoint) {
|
||||
max_size *= 9;
|
||||
}
|
||||
it->second.managers.emplace_back(std::ref(shard_manager));
|
||||
_per_device_limits_map.emplace(device_id, space_watchdog::per_device_limits{{std::ref(shard_manager)}, max_size});
|
||||
});
|
||||
} else {
|
||||
it->second.managers.emplace_back(std::ref(shard_manager));
|
||||
|
||||
@@ -22,7 +22,6 @@
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <seastar/core/abort_source.hh>
|
||||
#include <seastar/core/semaphore.hh>
|
||||
#include <seastar/core/gate.hh>
|
||||
#include <seastar/core/memory.hh>
|
||||
@@ -79,8 +78,8 @@ private:
|
||||
shard_managers_set& _shard_managers;
|
||||
per_device_limits_map& _per_device_limits_map;
|
||||
|
||||
future<> _started = make_ready_future<>();
|
||||
seastar::abort_source _as;
|
||||
seastar::gate _gate;
|
||||
seastar::timer<timer_clock_type> _timer;
|
||||
int _files_count = 0;
|
||||
|
||||
public:
|
||||
@@ -138,9 +137,6 @@ public:
|
||||
, _space_watchdog(_shard_managers, _per_device_limits_map)
|
||||
{}
|
||||
|
||||
resource_manager(resource_manager&&) = delete;
|
||||
resource_manager& operator=(resource_manager&&) = delete;
|
||||
|
||||
future<semaphore_units<semaphore_default_exception_factory>> get_send_units_for(size_t buf_size);
|
||||
|
||||
bool too_many_hints_in_progress() const {
|
||||
@@ -160,7 +156,6 @@ public:
|
||||
}
|
||||
|
||||
future<> start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr<gms::gossiper> gossiper_ptr, shared_ptr<service::storage_service> ss_ptr);
|
||||
void allow_replaying() noexcept;
|
||||
future<> stop() noexcept;
|
||||
void register_manager(manager& m);
|
||||
future<> prepare_per_device_limits();
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user