mirror of
https://github.com/scylladb/scylladb.git
synced 2026-04-21 09:00:35 +00:00
Compare commits
350 Commits
copilot/do
...
next-3.0
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
afa2c1b0bf | ||
|
|
ad70fe8503 | ||
|
|
3cd9c78056 | ||
|
|
c5e5ed2775 | ||
|
|
666266c3cf | ||
|
|
19b5d70338 | ||
|
|
b3cdee7e27 | ||
|
|
4c42f18d82 | ||
|
|
ea8f8ab7a3 | ||
|
|
db6821ce8f | ||
|
|
3c91bad0dc | ||
|
|
bbe41a82be | ||
|
|
6fb42269e9 | ||
|
|
ee2255a189 | ||
|
|
3218e6cd4c | ||
|
|
1d94aac551 | ||
|
|
2e5110d063 | ||
|
|
e4bb7ce73c | ||
|
|
ecc54c1a68 | ||
|
|
71cfd108c6 | ||
|
|
d40a7a5e9e | ||
|
|
a163d245ec | ||
|
|
045831b706 | ||
|
|
148245ab6a | ||
|
|
bbe5de1403 | ||
|
|
ca0df416c0 | ||
|
|
37ed60374e | ||
|
|
7c991a276b | ||
|
|
72e039be85 | ||
|
|
a28ecc4714 | ||
|
|
584c555698 | ||
|
|
e772f11ee0 | ||
|
|
d79b6a7481 | ||
|
|
85168c500c | ||
|
|
5b9e2cd6e6 | ||
|
|
77f33ca106 | ||
|
|
93760f13ee | ||
|
|
e597ae1176 | ||
|
|
79c7015cce | ||
|
|
00a14000cd | ||
|
|
1c40a0fcd2 | ||
|
|
e10735852b | ||
|
|
42433a25a8 | ||
|
|
d04d3fa653 | ||
|
|
1bcc5a1b5c | ||
|
|
450b9ac9bf | ||
|
|
b3bfd8c08d | ||
|
|
53c10b72dc | ||
|
|
a690e20966 | ||
|
|
7172009a0d | ||
|
|
cb688ef62e | ||
|
|
ff8265dd66 | ||
|
|
a198db31dc | ||
|
|
094a2a4263 | ||
|
|
cc0b4d249b | ||
|
|
e10afc7f50 | ||
|
|
407dfe0d68 | ||
|
|
9370996a18 | ||
|
|
ac105dd2a7 | ||
|
|
1e62fc8aac | ||
|
|
c724eee649 | ||
|
|
ebb14d93c9 | ||
|
|
d77aaada86 | ||
|
|
acd05e089f | ||
|
|
f591c9c710 | ||
|
|
dea4489078 | ||
|
|
3172cc6bac | ||
|
|
840d466c4d | ||
|
|
e30c289835 | ||
|
|
f769828a68 | ||
|
|
7d743563bf | ||
|
|
23da53c4f3 | ||
|
|
d4df119735 | ||
|
|
bdcbf4aa4e | ||
|
|
e80cd9dfed | ||
|
|
87fd298a6e | ||
|
|
7dce5484c2 | ||
|
|
23df964b96 | ||
|
|
fcab0d1392 | ||
|
|
a0c4a8501e | ||
|
|
b6fa715f7b | ||
|
|
9b3ca26d7f | ||
|
|
7b8e570e6c | ||
|
|
a947f2cd84 | ||
|
|
5ce5f61b08 | ||
|
|
7b65ec866b | ||
|
|
4c16c1fe1b | ||
|
|
f2d2a9f5b8 | ||
|
|
cb3b687492 | ||
|
|
1bb84cdbcf | ||
|
|
b6307d54be | ||
|
|
a20000c1a2 | ||
|
|
b3cbc2e58a | ||
|
|
e4c1c4f052 | ||
|
|
bfe3b4cc59 | ||
|
|
6a4bc5bd71 | ||
|
|
6c818bcec0 | ||
|
|
1598d358f0 | ||
|
|
7252715c69 | ||
|
|
37e143cba5 | ||
|
|
bf68fae01b | ||
|
|
d566466fca | ||
|
|
e32e682911 | ||
|
|
3c46bbf244 | ||
|
|
5567cf4b1b | ||
|
|
733c04ad50 | ||
|
|
05913b6f58 | ||
|
|
79cf277ea2 | ||
|
|
03ada48b40 | ||
|
|
394afae3a8 | ||
|
|
69d0b1e15c | ||
|
|
403f66ecad | ||
|
|
841ceac4f9 | ||
|
|
0fce4b228e | ||
|
|
2336c092a0 | ||
|
|
2b326fc7fa | ||
|
|
a62edaf7a9 | ||
|
|
d527ef19f7 | ||
|
|
8568dc94f4 | ||
|
|
6e51a95668 | ||
|
|
071191b967 | ||
|
|
c6c841c34f | ||
|
|
83a8f779bb | ||
|
|
6066968e33 | ||
|
|
d3d877b9db | ||
|
|
5ec646cb4e | ||
|
|
68b54b2e52 | ||
|
|
66a48746b8 | ||
|
|
b2227c7a5e | ||
|
|
97357a7321 | ||
|
|
089e41999a | ||
|
|
c537b3dd8e | ||
|
|
75a737c958 | ||
|
|
ea0f1c039d | ||
|
|
27cf758f12 | ||
|
|
76fd69244a | ||
|
|
0eb2ea8f00 | ||
|
|
20eaf0b85f | ||
|
|
751fdc9f6c | ||
|
|
5e3a52024e | ||
|
|
3869b5ab51 | ||
|
|
3cca6f5384 | ||
|
|
15188b5ea5 | ||
|
|
96d9ebb67e | ||
|
|
f18b370198 | ||
|
|
8e657e5685 | ||
|
|
4fde670abf | ||
|
|
923318e636 | ||
|
|
35cc09b150 | ||
|
|
22f41f04ba | ||
|
|
fae11c0d6b | ||
|
|
82016c07f2 | ||
|
|
282ccbb072 | ||
|
|
873e0f0e14 | ||
|
|
c36c58c64e | ||
|
|
0685c8f5bc | ||
|
|
92cf2934c6 | ||
|
|
ed2fb65732 | ||
|
|
ce2957d106 | ||
|
|
b31d94e317 | ||
|
|
da80f27f44 | ||
|
|
5174b1cd13 | ||
|
|
9ba608cae4 | ||
|
|
f7c5cbc645 | ||
|
|
cf4b4d4878 | ||
|
|
45bb1ba1b7 | ||
|
|
28294ed42e | ||
|
|
3c4f8cf6ed | ||
|
|
7b94264ae5 | ||
|
|
22a085fbd3 | ||
|
|
2d181da656 | ||
|
|
d427a23d42 | ||
|
|
37ab553f02 | ||
|
|
6a3f4fb3f9 | ||
|
|
8168d13887 | ||
|
|
13bdec6eb4 | ||
|
|
57e7081d86 | ||
|
|
2fcae36d96 | ||
|
|
ba62dcd5c7 | ||
|
|
515399ce17 | ||
|
|
772c4b5fdc | ||
|
|
874d88c98d | ||
|
|
5a178ff635 | ||
|
|
d67439b910 | ||
|
|
21a5a4c76a | ||
|
|
f818d6ee3f | ||
|
|
20c2745592 | ||
|
|
cf5c72561c | ||
|
|
53b85e5d32 | ||
|
|
2456cf63f2 | ||
|
|
c1f6ce4251 | ||
|
|
fc82eb5586 | ||
|
|
f58e592345 | ||
|
|
6375b1e5b7 | ||
|
|
7ca24efb39 | ||
|
|
32ebaaa585 | ||
|
|
a88c722a4c | ||
|
|
07582d6c10 | ||
|
|
18c89edbf7 | ||
|
|
5558fa8c44 | ||
|
|
f678eb52cd | ||
|
|
dfb23f4b38 | ||
|
|
502ddf158a | ||
|
|
0ccb0a127a | ||
|
|
b94997be0d | ||
|
|
d3a5b10cb8 | ||
|
|
48f3f899ac | ||
|
|
c4f745276c | ||
|
|
392c7dee3c | ||
|
|
04e982f909 | ||
|
|
97a8cc149e | ||
|
|
dbe347811c | ||
|
|
8f2d24bb8f | ||
|
|
689e11c892 | ||
|
|
1766c793a8 | ||
|
|
0b09008cde | ||
|
|
713e60f690 | ||
|
|
7b6841f947 | ||
|
|
f124b7026f | ||
|
|
28cca751d1 | ||
|
|
21d08aa41e | ||
|
|
f0b5170fa6 | ||
|
|
3b617e873c | ||
|
|
4eb9836e64 | ||
|
|
46af353209 | ||
|
|
76f70c676e | ||
|
|
afc9f0e177 | ||
|
|
c899191ad5 | ||
|
|
a3563e5f7d | ||
|
|
78c5b09694 | ||
|
|
a51878205a | ||
|
|
46efc08882 | ||
|
|
c95433c967 | ||
|
|
df3b6fb4a8 | ||
|
|
44ee43bb17 | ||
|
|
aac363ca86 | ||
|
|
9e6cc5b024 | ||
|
|
13b72c7b92 | ||
|
|
6b011fbe0a | ||
|
|
9dd4e1b01f | ||
|
|
e91c741ef5 | ||
|
|
b18e9e115d | ||
|
|
0b86ab0d2a | ||
|
|
97cd9108d6 | ||
|
|
f81fe96b0b | ||
|
|
91ce3a7957 | ||
|
|
af7e58f4c5 | ||
|
|
bd3373b511 | ||
|
|
4820130abe | ||
|
|
9b299241e5 | ||
|
|
745a98e151 | ||
|
|
b9c99af18b | ||
|
|
cded9c7ac7 | ||
|
|
4acfc5ed8f | ||
|
|
cb9199bc7f | ||
|
|
695ff5383f | ||
|
|
730e48bf60 | ||
|
|
af6d4f40e1 | ||
|
|
9d8507de09 | ||
|
|
07c980845d | ||
|
|
c52b8239d0 | ||
|
|
5a07a4fac8 | ||
|
|
b9c046b17b | ||
|
|
979cb636b8 | ||
|
|
59cf9d9070 | ||
|
|
c9ec9d4087 | ||
|
|
2e8fefbc5a | ||
|
|
6be0635029 | ||
|
|
04a544c0a2 | ||
|
|
028f9b95d1 | ||
|
|
54258ca8eb | ||
|
|
c9a030f1f0 | ||
|
|
1c7daef554 | ||
|
|
f8195a77b0 | ||
|
|
5b724c80ab | ||
|
|
4a7ae81b3f | ||
|
|
3cf26a60a2 | ||
|
|
2103d0d52b | ||
|
|
16ee3b3ebe | ||
|
|
b0a9c40ab1 | ||
|
|
53924e5c7f | ||
|
|
befe0012f5 | ||
|
|
1953c5fa61 | ||
|
|
b72a94b53e | ||
|
|
3f82b697f2 | ||
|
|
ee1ef853e5 | ||
|
|
6e7e7f3822 | ||
|
|
82a36edc9d | ||
|
|
d4efa3c9b2 | ||
|
|
324dae3e12 | ||
|
|
c0ffc9a2b7 | ||
|
|
f81fa5f75c | ||
|
|
6fd1cfcfce | ||
|
|
9d458ffea9 | ||
|
|
9776a048e7 | ||
|
|
10cf97375e | ||
|
|
e6355a9a01 | ||
|
|
e57907a1d5 | ||
|
|
f94b46e7e0 | ||
|
|
6847c12668 | ||
|
|
80b86def1f | ||
|
|
c6de9ea39b | ||
|
|
94bed81c1d | ||
|
|
0f3a21f0bb | ||
|
|
976db7e9e0 | ||
|
|
996b86b804 | ||
|
|
b7b217cc43 | ||
|
|
c274430933 | ||
|
|
893a18a7c4 | ||
|
|
39b39058fc | ||
|
|
6bf4a73d88 | ||
|
|
ca4846dd63 | ||
|
|
2663ff7bc1 | ||
|
|
043a575fcd | ||
|
|
00dc400993 | ||
|
|
522a48a244 | ||
|
|
5faa28ce45 | ||
|
|
52be02558e | ||
|
|
a7cbfbe63f | ||
|
|
28fd2044d2 | ||
|
|
76ff2e5c3d | ||
|
|
7b34d54a96 | ||
|
|
26c31f6798 | ||
|
|
28fa66591a | ||
|
|
0fee1d9e43 | ||
|
|
76e72e28f4 | ||
|
|
f969e80965 | ||
|
|
2029134063 | ||
|
|
f30fe7bd17 | ||
|
|
aeb418af9e | ||
|
|
714e6d741f | ||
|
|
95c5872450 | ||
|
|
87f8968553 | ||
|
|
2895428d44 | ||
|
|
e18f182cfc | ||
|
|
cf8cdbf87d | ||
|
|
eb2814067d | ||
|
|
0c722d4547 | ||
|
|
54cf463430 | ||
|
|
d2a0622edd | ||
|
|
60edaec757 | ||
|
|
5802532cb3 | ||
|
|
83ea91055e | ||
|
|
e7863d3d54 | ||
|
|
57f124b905 | ||
|
|
40d8de5784 | ||
|
|
1468ec62de | ||
|
|
c6ef56ae1e | ||
|
|
ad62313b86 | ||
|
|
de87f798e1 |
5
.gitmodules
vendored
5
.gitmodules
vendored
@@ -1,6 +1,6 @@
|
|||||||
[submodule "seastar"]
|
[submodule "seastar"]
|
||||||
path = seastar
|
path = seastar
|
||||||
url = ../seastar
|
url = ../scylla-seastar
|
||||||
ignore = dirty
|
ignore = dirty
|
||||||
[submodule "swagger-ui"]
|
[submodule "swagger-ui"]
|
||||||
path = swagger-ui
|
path = swagger-ui
|
||||||
@@ -9,3 +9,6 @@
|
|||||||
[submodule "xxHash"]
|
[submodule "xxHash"]
|
||||||
path = xxHash
|
path = xxHash
|
||||||
url = ../xxHash
|
url = ../xxHash
|
||||||
|
[submodule "libdeflate"]
|
||||||
|
path = libdeflate
|
||||||
|
url = ../libdeflate
|
||||||
|
|||||||
@@ -138,4 +138,5 @@ target_include_directories(scylla PUBLIC
|
|||||||
${SEASTAR_INCLUDE_DIRS}
|
${SEASTAR_INCLUDE_DIRS}
|
||||||
${Boost_INCLUDE_DIRS}
|
${Boost_INCLUDE_DIRS}
|
||||||
xxhash
|
xxhash
|
||||||
|
libdeflate
|
||||||
build/release/gen)
|
build/release/gen)
|
||||||
|
|||||||
@@ -50,12 +50,12 @@ Then, to build an RPM, run:
|
|||||||
./dist/redhat/build_rpm.sh
|
./dist/redhat/build_rpm.sh
|
||||||
```
|
```
|
||||||
|
|
||||||
The built RPM is stored in ``/var/lib/mock/<configuration>/result`` directory.
|
The built RPM is stored in the ``build/mock/<configuration>/result`` directory.
|
||||||
For example, on Fedora 21 mock reports the following:
|
For example, on Fedora 21 mock reports the following:
|
||||||
|
|
||||||
```
|
```
|
||||||
INFO: Done(scylla-server-0.00-1.fc21.src.rpm) Config(default) 20 minutes 7 seconds
|
INFO: Done(scylla-server-0.00-1.fc21.src.rpm) Config(default) 20 minutes 7 seconds
|
||||||
INFO: Results and/or logs in: /var/lib/mock/fedora-21-x86_64/result
|
INFO: Results and/or logs in: build/mock/fedora-21-x86_64/result
|
||||||
```
|
```
|
||||||
|
|
||||||
## Building Fedora-based Docker image
|
## Building Fedora-based Docker image
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
#!/bin/sh
|
#!/bin/sh
|
||||||
|
|
||||||
VERSION=666.development
|
VERSION=3.0.11
|
||||||
|
|
||||||
if test -f version
|
if test -f version
|
||||||
then
|
then
|
||||||
|
|||||||
@@ -2228,11 +2228,11 @@
|
|||||||
"description":"The column family"
|
"description":"The column family"
|
||||||
},
|
},
|
||||||
"total":{
|
"total":{
|
||||||
"type":"int",
|
"type":"long",
|
||||||
"description":"The total snapshot size"
|
"description":"The total snapshot size"
|
||||||
},
|
},
|
||||||
"live":{
|
"live":{
|
||||||
"type":"int",
|
"type":"long",
|
||||||
"description":"The live snapshot size"
|
"description":"The live snapshot size"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -87,11 +87,17 @@ future<> create_metadata_table_if_missing(
|
|||||||
return mm.announce_new_column_family(b.build(), false);
|
return mm.announce_new_column_family(b.build(), false);
|
||||||
}
|
}
|
||||||
|
|
||||||
future<> wait_for_schema_agreement(::service::migration_manager& mm, const database& db) {
|
future<> wait_for_schema_agreement(::service::migration_manager& mm, const database& db, seastar::abort_source& as) {
|
||||||
static const auto pause = [] { return sleep(std::chrono::milliseconds(500)); };
|
static const auto pause = [] { return sleep(std::chrono::milliseconds(500)); };
|
||||||
|
|
||||||
return do_until([&db] { return db.get_version() != database::empty_version; }, pause).then([&mm] {
|
return do_until([&db, &as] {
|
||||||
return do_until([&mm] { return mm.have_schema_agreement(); }, pause);
|
as.check();
|
||||||
|
return db.get_version() != database::empty_version;
|
||||||
|
}, pause).then([&mm, &as] {
|
||||||
|
return do_until([&mm, &as] {
|
||||||
|
as.check();
|
||||||
|
return mm.have_schema_agreement();
|
||||||
|
}, pause);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -81,7 +81,7 @@ future<> create_metadata_table_if_missing(
|
|||||||
stdx::string_view cql,
|
stdx::string_view cql,
|
||||||
::service::migration_manager&);
|
::service::migration_manager&);
|
||||||
|
|
||||||
future<> wait_for_schema_agreement(::service::migration_manager&, const database&);
|
future<> wait_for_schema_agreement(::service::migration_manager&, const database&, seastar::abort_source&);
|
||||||
|
|
||||||
///
|
///
|
||||||
/// Time-outs for internal, non-local CQL queries.
|
/// Time-outs for internal, non-local CQL queries.
|
||||||
|
|||||||
@@ -160,7 +160,7 @@ future<> default_authorizer::start() {
|
|||||||
_migration_manager).then([this] {
|
_migration_manager).then([this] {
|
||||||
_finished = do_after_system_ready(_as, [this] {
|
_finished = do_after_system_ready(_as, [this] {
|
||||||
return async([this] {
|
return async([this] {
|
||||||
wait_for_schema_agreement(_migration_manager, _qp.db().local()).get0();
|
wait_for_schema_agreement(_migration_manager, _qp.db().local(), _as).get0();
|
||||||
|
|
||||||
if (legacy_metadata_exists()) {
|
if (legacy_metadata_exists()) {
|
||||||
if (!any_granted().get0()) {
|
if (!any_granted().get0()) {
|
||||||
@@ -178,7 +178,7 @@ future<> default_authorizer::start() {
|
|||||||
|
|
||||||
future<> default_authorizer::stop() {
|
future<> default_authorizer::stop() {
|
||||||
_as.request_abort();
|
_as.request_abort();
|
||||||
return _finished.handle_exception_type([](const sleep_aborted&) {});
|
return _finished.handle_exception_type([](const sleep_aborted&) {}).handle_exception_type([](const abort_requested_exception&) {});
|
||||||
}
|
}
|
||||||
|
|
||||||
future<permission_set>
|
future<permission_set>
|
||||||
|
|||||||
@@ -157,7 +157,7 @@ future<> password_authenticator::start() {
|
|||||||
|
|
||||||
_stopped = do_after_system_ready(_as, [this] {
|
_stopped = do_after_system_ready(_as, [this] {
|
||||||
return async([this] {
|
return async([this] {
|
||||||
wait_for_schema_agreement(_migration_manager, _qp.db().local()).get0();
|
wait_for_schema_agreement(_migration_manager, _qp.db().local(), _as).get0();
|
||||||
|
|
||||||
if (any_nondefault_role_row_satisfies(_qp, &has_salted_hash).get0()) {
|
if (any_nondefault_role_row_satisfies(_qp, &has_salted_hash).get0()) {
|
||||||
if (legacy_metadata_exists()) {
|
if (legacy_metadata_exists()) {
|
||||||
@@ -182,7 +182,7 @@ future<> password_authenticator::start() {
|
|||||||
|
|
||||||
future<> password_authenticator::stop() {
|
future<> password_authenticator::stop() {
|
||||||
_as.request_abort();
|
_as.request_abort();
|
||||||
return _stopped.handle_exception_type([] (const sleep_aborted&) { });
|
return _stopped.handle_exception_type([] (const sleep_aborted&) { }).handle_exception_type([](const abort_requested_exception&) {});
|
||||||
}
|
}
|
||||||
|
|
||||||
db::consistency_level password_authenticator::consistency_for_user(stdx::string_view role_name) {
|
db::consistency_level password_authenticator::consistency_for_user(stdx::string_view role_name) {
|
||||||
@@ -241,7 +241,11 @@ future<authenticated_user> password_authenticator::authenticate(
|
|||||||
}).then_wrapped([=](future<::shared_ptr<cql3::untyped_result_set>> f) {
|
}).then_wrapped([=](future<::shared_ptr<cql3::untyped_result_set>> f) {
|
||||||
try {
|
try {
|
||||||
auto res = f.get0();
|
auto res = f.get0();
|
||||||
if (res->empty() || !passwords::check(password, res->one().get_as<sstring>(SALTED_HASH))) {
|
auto salted_hash = std::experimental::optional<sstring>();
|
||||||
|
if (!res->empty()) {
|
||||||
|
salted_hash = res->one().get_opt<sstring>(SALTED_HASH);
|
||||||
|
}
|
||||||
|
if (!salted_hash || !passwords::check(password, *salted_hash)) {
|
||||||
throw exceptions::authentication_exception("Username and/or password are incorrect");
|
throw exceptions::authentication_exception("Username and/or password are incorrect");
|
||||||
}
|
}
|
||||||
return make_ready_future<authenticated_user>(username);
|
return make_ready_future<authenticated_user>(username);
|
||||||
|
|||||||
@@ -184,7 +184,9 @@ future<> service::start() {
|
|||||||
return once_among_shards([this] {
|
return once_among_shards([this] {
|
||||||
return create_keyspace_if_missing();
|
return create_keyspace_if_missing();
|
||||||
}).then([this] {
|
}).then([this] {
|
||||||
return when_all_succeed(_role_manager->start(), _authorizer->start(), _authenticator->start());
|
return _role_manager->start().then([this] {
|
||||||
|
return when_all_succeed(_authorizer->start(), _authenticator->start());
|
||||||
|
});
|
||||||
}).then([this] {
|
}).then([this] {
|
||||||
_permissions_cache = std::make_unique<permissions_cache>(_permissions_cache_config, *this, log);
|
_permissions_cache = std::make_unique<permissions_cache>(_permissions_cache_config, *this, log);
|
||||||
}).then([this] {
|
}).then([this] {
|
||||||
@@ -196,6 +198,10 @@ future<> service::start() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
future<> service::stop() {
|
future<> service::stop() {
|
||||||
|
// Only one of the shards has the listener registered, but let's try to
|
||||||
|
// unregister on each one just to make sure.
|
||||||
|
_migration_manager.unregister_listener(_migration_listener.get());
|
||||||
|
|
||||||
return _permissions_cache->stop().then([this] {
|
return _permissions_cache->stop().then([this] {
|
||||||
return when_all_succeed(_role_manager->stop(), _authorizer->stop(), _authenticator->stop());
|
return when_all_succeed(_role_manager->stop(), _authorizer->stop(), _authenticator->stop());
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -227,7 +227,7 @@ future<> standard_role_manager::start() {
|
|||||||
return this->create_metadata_tables_if_missing().then([this] {
|
return this->create_metadata_tables_if_missing().then([this] {
|
||||||
_stopped = auth::do_after_system_ready(_as, [this] {
|
_stopped = auth::do_after_system_ready(_as, [this] {
|
||||||
return seastar::async([this] {
|
return seastar::async([this] {
|
||||||
wait_for_schema_agreement(_migration_manager, _qp.db().local()).get0();
|
wait_for_schema_agreement(_migration_manager, _qp.db().local(), _as).get0();
|
||||||
|
|
||||||
if (any_nondefault_role_row_satisfies(_qp, &has_can_login).get0()) {
|
if (any_nondefault_role_row_satisfies(_qp, &has_can_login).get0()) {
|
||||||
if (this->legacy_metadata_exists()) {
|
if (this->legacy_metadata_exists()) {
|
||||||
@@ -251,7 +251,7 @@ future<> standard_role_manager::start() {
|
|||||||
|
|
||||||
future<> standard_role_manager::stop() {
|
future<> standard_role_manager::stop() {
|
||||||
_as.request_abort();
|
_as.request_abort();
|
||||||
return _stopped.handle_exception_type([] (const sleep_aborted&) { });
|
return _stopped.handle_exception_type([] (const sleep_aborted&) { }).handle_exception_type([](const abort_requested_exception&) {});;
|
||||||
}
|
}
|
||||||
|
|
||||||
future<> standard_role_manager::create_or_replace(stdx::string_view role_name, const role_config& c) const {
|
future<> standard_role_manager::create_or_replace(stdx::string_view role_name, const role_config& c) const {
|
||||||
|
|||||||
@@ -77,7 +77,7 @@ protected:
|
|||||||
, _io_priority(iop)
|
, _io_priority(iop)
|
||||||
, _interval(interval)
|
, _interval(interval)
|
||||||
, _update_timer([this] { adjust(); })
|
, _update_timer([this] { adjust(); })
|
||||||
, _control_points({{0,0}})
|
, _control_points()
|
||||||
, _current_backlog(std::move(backlog))
|
, _current_backlog(std::move(backlog))
|
||||||
, _inflight_update(make_ready_future<>())
|
, _inflight_update(make_ready_future<>())
|
||||||
{
|
{
|
||||||
@@ -125,7 +125,7 @@ public:
|
|||||||
flush_controller(seastar::scheduling_group sg, const ::io_priority_class& iop, float static_shares) : backlog_controller(sg, iop, static_shares) {}
|
flush_controller(seastar::scheduling_group sg, const ::io_priority_class& iop, float static_shares) : backlog_controller(sg, iop, static_shares) {}
|
||||||
flush_controller(seastar::scheduling_group sg, const ::io_priority_class& iop, std::chrono::milliseconds interval, float soft_limit, std::function<float()> current_dirty)
|
flush_controller(seastar::scheduling_group sg, const ::io_priority_class& iop, std::chrono::milliseconds interval, float soft_limit, std::function<float()> current_dirty)
|
||||||
: backlog_controller(sg, iop, std::move(interval),
|
: backlog_controller(sg, iop, std::move(interval),
|
||||||
std::vector<backlog_controller::control_point>({{soft_limit, 10}, {soft_limit + (hard_dirty_limit - soft_limit) / 2, 200} , {hard_dirty_limit, 1000}}),
|
std::vector<backlog_controller::control_point>({{0.0, 0.0}, {soft_limit, 10}, {soft_limit + (hard_dirty_limit - soft_limit) / 2, 200} , {hard_dirty_limit, 1000}}),
|
||||||
std::move(current_dirty)
|
std::move(current_dirty)
|
||||||
)
|
)
|
||||||
{}
|
{}
|
||||||
@@ -139,7 +139,7 @@ public:
|
|||||||
compaction_controller(seastar::scheduling_group sg, const ::io_priority_class& iop, float static_shares) : backlog_controller(sg, iop, static_shares) {}
|
compaction_controller(seastar::scheduling_group sg, const ::io_priority_class& iop, float static_shares) : backlog_controller(sg, iop, static_shares) {}
|
||||||
compaction_controller(seastar::scheduling_group sg, const ::io_priority_class& iop, std::chrono::milliseconds interval, std::function<float()> current_backlog)
|
compaction_controller(seastar::scheduling_group sg, const ::io_priority_class& iop, std::chrono::milliseconds interval, std::function<float()> current_backlog)
|
||||||
: backlog_controller(sg, iop, std::move(interval),
|
: backlog_controller(sg, iop, std::move(interval),
|
||||||
std::vector<backlog_controller::control_point>({{0.5, 10}, {1.5, 100} , {normalization_factor, 1000}}),
|
std::vector<backlog_controller::control_point>({{0.0, 50}, {1.5, 100} , {normalization_factor, 1000}}),
|
||||||
std::move(current_backlog)
|
std::move(current_backlog)
|
||||||
)
|
)
|
||||||
{}
|
{}
|
||||||
|
|||||||
@@ -57,12 +57,12 @@ private:
|
|||||||
value_type data[0];
|
value_type data[0];
|
||||||
void operator delete(void* ptr) { free(ptr); }
|
void operator delete(void* ptr) { free(ptr); }
|
||||||
};
|
};
|
||||||
// FIXME: consider increasing chunk size as the buffer grows
|
static constexpr size_type default_chunk_size{512};
|
||||||
static constexpr size_type chunk_size{512};
|
|
||||||
private:
|
private:
|
||||||
std::unique_ptr<chunk> _begin;
|
std::unique_ptr<chunk> _begin;
|
||||||
chunk* _current;
|
chunk* _current;
|
||||||
size_type _size;
|
size_type _size;
|
||||||
|
size_type _initial_chunk_size = default_chunk_size;
|
||||||
public:
|
public:
|
||||||
class fragment_iterator : public std::iterator<std::input_iterator_tag, bytes_view> {
|
class fragment_iterator : public std::iterator<std::input_iterator_tag, bytes_view> {
|
||||||
chunk* _current = nullptr;
|
chunk* _current = nullptr;
|
||||||
@@ -102,13 +102,13 @@ private:
|
|||||||
}
|
}
|
||||||
// Figure out next chunk size.
|
// Figure out next chunk size.
|
||||||
// - must be enough for data_size
|
// - must be enough for data_size
|
||||||
// - must be at least chunk_size
|
// - must be at least _initial_chunk_size
|
||||||
// - try to double each time to prevent too many allocations
|
// - try to double each time to prevent too many allocations
|
||||||
// - do not exceed max_chunk_size
|
// - do not exceed max_chunk_size
|
||||||
size_type next_alloc_size(size_t data_size) const {
|
size_type next_alloc_size(size_t data_size) const {
|
||||||
auto next_size = _current
|
auto next_size = _current
|
||||||
? _current->size * 2
|
? _current->size * 2
|
||||||
: chunk_size;
|
: _initial_chunk_size;
|
||||||
next_size = std::min(next_size, max_chunk_size());
|
next_size = std::min(next_size, max_chunk_size());
|
||||||
// FIXME: check for overflow?
|
// FIXME: check for overflow?
|
||||||
return std::max<size_type>(next_size, data_size + sizeof(chunk));
|
return std::max<size_type>(next_size, data_size + sizeof(chunk));
|
||||||
@@ -116,13 +116,19 @@ private:
|
|||||||
// Makes room for a contiguous region of given size.
|
// Makes room for a contiguous region of given size.
|
||||||
// The region is accounted for as already written.
|
// The region is accounted for as already written.
|
||||||
// size must not be zero.
|
// size must not be zero.
|
||||||
|
[[gnu::always_inline]]
|
||||||
value_type* alloc(size_type size) {
|
value_type* alloc(size_type size) {
|
||||||
if (size <= current_space_left()) {
|
if (__builtin_expect(size <= current_space_left(), true)) {
|
||||||
auto ret = _current->data + _current->offset;
|
auto ret = _current->data + _current->offset;
|
||||||
_current->offset += size;
|
_current->offset += size;
|
||||||
_size += size;
|
_size += size;
|
||||||
return ret;
|
return ret;
|
||||||
} else {
|
} else {
|
||||||
|
return alloc_new(size);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
[[gnu::noinline]]
|
||||||
|
value_type* alloc_new(size_type size) {
|
||||||
auto alloc_size = next_alloc_size(size);
|
auto alloc_size = next_alloc_size(size);
|
||||||
auto space = malloc(alloc_size);
|
auto space = malloc(alloc_size);
|
||||||
if (!space) {
|
if (!space) {
|
||||||
@@ -140,19 +146,22 @@ private:
|
|||||||
}
|
}
|
||||||
_size += size;
|
_size += size;
|
||||||
return _current->data;
|
return _current->data;
|
||||||
};
|
|
||||||
}
|
}
|
||||||
public:
|
public:
|
||||||
bytes_ostream() noexcept
|
explicit bytes_ostream(size_t initial_chunk_size) noexcept
|
||||||
: _begin()
|
: _begin()
|
||||||
, _current(nullptr)
|
, _current(nullptr)
|
||||||
, _size(0)
|
, _size(0)
|
||||||
|
, _initial_chunk_size(initial_chunk_size)
|
||||||
{ }
|
{ }
|
||||||
|
|
||||||
|
bytes_ostream() noexcept : bytes_ostream(default_chunk_size) {}
|
||||||
|
|
||||||
bytes_ostream(bytes_ostream&& o) noexcept
|
bytes_ostream(bytes_ostream&& o) noexcept
|
||||||
: _begin(std::move(o._begin))
|
: _begin(std::move(o._begin))
|
||||||
, _current(o._current)
|
, _current(o._current)
|
||||||
, _size(o._size)
|
, _size(o._size)
|
||||||
|
, _initial_chunk_size(o._initial_chunk_size)
|
||||||
{
|
{
|
||||||
o._current = nullptr;
|
o._current = nullptr;
|
||||||
o._size = 0;
|
o._size = 0;
|
||||||
@@ -162,6 +171,7 @@ public:
|
|||||||
: _begin()
|
: _begin()
|
||||||
, _current(nullptr)
|
, _current(nullptr)
|
||||||
, _size(0)
|
, _size(0)
|
||||||
|
, _initial_chunk_size(o._initial_chunk_size)
|
||||||
{
|
{
|
||||||
append(o);
|
append(o);
|
||||||
}
|
}
|
||||||
@@ -199,18 +209,20 @@ public:
|
|||||||
return place_holder<T>{alloc(sizeof(T))};
|
return place_holder<T>{alloc(sizeof(T))};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[[gnu::always_inline]]
|
||||||
value_type* write_place_holder(size_type size) {
|
value_type* write_place_holder(size_type size) {
|
||||||
return alloc(size);
|
return alloc(size);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Writes given sequence of bytes
|
// Writes given sequence of bytes
|
||||||
|
[[gnu::always_inline]]
|
||||||
inline void write(bytes_view v) {
|
inline void write(bytes_view v) {
|
||||||
if (v.empty()) {
|
if (v.empty()) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
auto this_size = std::min(v.size(), size_t(current_space_left()));
|
auto this_size = std::min(v.size(), size_t(current_space_left()));
|
||||||
if (this_size) {
|
if (__builtin_expect(this_size, true)) {
|
||||||
memcpy(_current->data + _current->offset, v.begin(), this_size);
|
memcpy(_current->data + _current->offset, v.begin(), this_size);
|
||||||
_current->offset += this_size;
|
_current->offset += this_size;
|
||||||
_size += this_size;
|
_size += this_size;
|
||||||
@@ -219,11 +231,12 @@ public:
|
|||||||
|
|
||||||
while (!v.empty()) {
|
while (!v.empty()) {
|
||||||
auto this_size = std::min(v.size(), size_t(max_chunk_size()));
|
auto this_size = std::min(v.size(), size_t(max_chunk_size()));
|
||||||
std::copy_n(v.begin(), this_size, alloc(this_size));
|
std::copy_n(v.begin(), this_size, alloc_new(this_size));
|
||||||
v.remove_prefix(this_size);
|
v.remove_prefix(this_size);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[[gnu::always_inline]]
|
||||||
void write(const char* ptr, size_t size) {
|
void write(const char* ptr, size_t size) {
|
||||||
write(bytes_view(reinterpret_cast<const signed char*>(ptr), size));
|
write(bytes_view(reinterpret_cast<const signed char*>(ptr), size));
|
||||||
}
|
}
|
||||||
@@ -393,6 +406,21 @@ public:
|
|||||||
bool operator!=(const bytes_ostream& other) const {
|
bool operator!=(const bytes_ostream& other) const {
|
||||||
return !(*this == other);
|
return !(*this == other);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Makes this instance empty.
|
||||||
|
//
|
||||||
|
// The first buffer is not deallocated, so callers may rely on the
|
||||||
|
// fact that if they write less than the initial chunk size between
|
||||||
|
// the clear() calls then writes will not involve any memory allocations,
|
||||||
|
// except for the first write made on this instance.
|
||||||
|
void clear() {
|
||||||
|
if (_begin) {
|
||||||
|
_begin->offset = 0;
|
||||||
|
_size = 0;
|
||||||
|
_current = _begin.get();
|
||||||
|
_begin->next.reset();
|
||||||
|
}
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
|
|||||||
@@ -61,6 +61,7 @@ class cache_flat_mutation_reader final : public flat_mutation_reader::impl {
|
|||||||
// - _last_row points at a direct predecessor of the next row which is going to be read.
|
// - _last_row points at a direct predecessor of the next row which is going to be read.
|
||||||
// Used for populating continuity.
|
// Used for populating continuity.
|
||||||
// - _population_range_starts_before_all_rows is set accordingly
|
// - _population_range_starts_before_all_rows is set accordingly
|
||||||
|
// - _underlying is engaged and fast-forwarded
|
||||||
reading_from_underlying,
|
reading_from_underlying,
|
||||||
|
|
||||||
end_of_stream
|
end_of_stream
|
||||||
@@ -99,7 +100,13 @@ class cache_flat_mutation_reader final : public flat_mutation_reader::impl {
|
|||||||
// forward progress is not guaranteed in case iterators are getting constantly invalidated.
|
// forward progress is not guaranteed in case iterators are getting constantly invalidated.
|
||||||
bool _lower_bound_changed = false;
|
bool _lower_bound_changed = false;
|
||||||
|
|
||||||
|
// Points to the underlying reader conforming to _schema,
|
||||||
|
// either to *_underlying_holder or _read_context->underlying().underlying().
|
||||||
|
flat_mutation_reader* _underlying = nullptr;
|
||||||
|
std::optional<flat_mutation_reader> _underlying_holder;
|
||||||
|
|
||||||
future<> do_fill_buffer(db::timeout_clock::time_point);
|
future<> do_fill_buffer(db::timeout_clock::time_point);
|
||||||
|
future<> ensure_underlying(db::timeout_clock::time_point);
|
||||||
void copy_from_cache_to_buffer();
|
void copy_from_cache_to_buffer();
|
||||||
future<> process_static_row(db::timeout_clock::time_point);
|
future<> process_static_row(db::timeout_clock::time_point);
|
||||||
void move_to_end();
|
void move_to_end();
|
||||||
@@ -186,23 +193,22 @@ future<> cache_flat_mutation_reader::process_static_row(db::timeout_clock::time_
|
|||||||
return make_ready_future<>();
|
return make_ready_future<>();
|
||||||
} else {
|
} else {
|
||||||
_read_context->cache().on_row_miss();
|
_read_context->cache().on_row_miss();
|
||||||
return _read_context->get_next_fragment(timeout).then([this] (mutation_fragment_opt&& sr) {
|
return ensure_underlying(timeout).then([this, timeout] {
|
||||||
if (sr) {
|
return (*_underlying)(timeout).then([this] (mutation_fragment_opt&& sr) {
|
||||||
assert(sr->is_static_row());
|
if (sr) {
|
||||||
maybe_add_to_cache(sr->as_static_row());
|
assert(sr->is_static_row());
|
||||||
push_mutation_fragment(std::move(*sr));
|
maybe_add_to_cache(sr->as_static_row());
|
||||||
}
|
push_mutation_fragment(std::move(*sr));
|
||||||
maybe_set_static_row_continuous();
|
}
|
||||||
|
maybe_set_static_row_continuous();
|
||||||
|
});
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
inline
|
inline
|
||||||
void cache_flat_mutation_reader::touch_partition() {
|
void cache_flat_mutation_reader::touch_partition() {
|
||||||
if (_snp->at_latest_version()) {
|
_snp->touch();
|
||||||
rows_entry& last_dummy = *_snp->version()->partition().clustered_rows().rbegin();
|
|
||||||
_snp->tracker()->touch(last_dummy);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
inline
|
inline
|
||||||
@@ -232,14 +238,36 @@ future<> cache_flat_mutation_reader::fill_buffer(db::timeout_clock::time_point t
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline
|
||||||
|
future<> cache_flat_mutation_reader::ensure_underlying(db::timeout_clock::time_point timeout) {
|
||||||
|
if (_underlying) {
|
||||||
|
return make_ready_future<>();
|
||||||
|
}
|
||||||
|
return _read_context->ensure_underlying(timeout).then([this, timeout] {
|
||||||
|
flat_mutation_reader& ctx_underlying = _read_context->underlying().underlying();
|
||||||
|
if (ctx_underlying.schema() != _schema) {
|
||||||
|
_underlying_holder = make_delegating_reader(ctx_underlying);
|
||||||
|
_underlying_holder->upgrade_schema(_schema);
|
||||||
|
_underlying = &*_underlying_holder;
|
||||||
|
} else {
|
||||||
|
_underlying = &ctx_underlying;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
inline
|
inline
|
||||||
future<> cache_flat_mutation_reader::do_fill_buffer(db::timeout_clock::time_point timeout) {
|
future<> cache_flat_mutation_reader::do_fill_buffer(db::timeout_clock::time_point timeout) {
|
||||||
if (_state == state::move_to_underlying) {
|
if (_state == state::move_to_underlying) {
|
||||||
|
if (!_underlying) {
|
||||||
|
return ensure_underlying(timeout).then([this, timeout] {
|
||||||
|
return do_fill_buffer(timeout);
|
||||||
|
});
|
||||||
|
}
|
||||||
_state = state::reading_from_underlying;
|
_state = state::reading_from_underlying;
|
||||||
_population_range_starts_before_all_rows = _lower_bound.is_before_all_clustered_rows(*_schema);
|
_population_range_starts_before_all_rows = _lower_bound.is_before_all_clustered_rows(*_schema);
|
||||||
auto end = _next_row_in_range ? position_in_partition(_next_row.position())
|
auto end = _next_row_in_range ? position_in_partition(_next_row.position())
|
||||||
: position_in_partition(_upper_bound);
|
: position_in_partition(_upper_bound);
|
||||||
return _read_context->fast_forward_to(position_range{_lower_bound, std::move(end)}, timeout).then([this, timeout] {
|
return _underlying->fast_forward_to(position_range{_lower_bound, std::move(end)}, timeout).then([this, timeout] {
|
||||||
return read_from_underlying(timeout);
|
return read_from_underlying(timeout);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@@ -280,7 +308,7 @@ future<> cache_flat_mutation_reader::do_fill_buffer(db::timeout_clock::time_poin
|
|||||||
|
|
||||||
inline
|
inline
|
||||||
future<> cache_flat_mutation_reader::read_from_underlying(db::timeout_clock::time_point timeout) {
|
future<> cache_flat_mutation_reader::read_from_underlying(db::timeout_clock::time_point timeout) {
|
||||||
return consume_mutation_fragments_until(_read_context->underlying().underlying(),
|
return consume_mutation_fragments_until(*_underlying,
|
||||||
[this] { return _state != state::reading_from_underlying || is_buffer_full(); },
|
[this] { return _state != state::reading_from_underlying || is_buffer_full(); },
|
||||||
[this] (mutation_fragment mf) {
|
[this] (mutation_fragment mf) {
|
||||||
_read_context->cache().on_row_miss();
|
_read_context->cache().on_row_miss();
|
||||||
|
|||||||
@@ -200,8 +200,9 @@ public:
|
|||||||
return _current_start;
|
return _current_start;
|
||||||
}
|
}
|
||||||
|
|
||||||
position_in_partition_view upper_bound() const {
|
// Returns the upper bound of the last range in provided ranges set
|
||||||
return _current_end;
|
position_in_partition_view uppermost_bound() const {
|
||||||
|
return position_in_partition_view::for_range_end(_ranges.back());
|
||||||
}
|
}
|
||||||
|
|
||||||
// When lower_bound() changes, this also does
|
// When lower_bound() changes, this also does
|
||||||
|
|||||||
@@ -112,7 +112,7 @@ const sstring compression_parameters::CHUNK_LENGTH_KB = "chunk_length_kb";
|
|||||||
const sstring compression_parameters::CRC_CHECK_CHANCE = "crc_check_chance";
|
const sstring compression_parameters::CRC_CHECK_CHANCE = "crc_check_chance";
|
||||||
|
|
||||||
compression_parameters::compression_parameters()
|
compression_parameters::compression_parameters()
|
||||||
: compression_parameters(nullptr)
|
: compression_parameters(compressor::lz4)
|
||||||
{}
|
{}
|
||||||
|
|
||||||
compression_parameters::~compression_parameters()
|
compression_parameters::~compression_parameters()
|
||||||
|
|||||||
@@ -118,6 +118,10 @@ public:
|
|||||||
std::map<sstring, sstring> get_options() const;
|
std::map<sstring, sstring> get_options() const;
|
||||||
bool operator==(const compression_parameters& other) const;
|
bool operator==(const compression_parameters& other) const;
|
||||||
bool operator!=(const compression_parameters& other) const;
|
bool operator!=(const compression_parameters& other) const;
|
||||||
|
|
||||||
|
static compression_parameters no_compression() {
|
||||||
|
return compression_parameters(nullptr);
|
||||||
|
}
|
||||||
private:
|
private:
|
||||||
void validate_options(const std::map<sstring, sstring>&);
|
void validate_options(const std::map<sstring, sstring>&);
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -242,6 +242,9 @@ batch_size_fail_threshold_in_kb: 50
|
|||||||
|
|
||||||
# The directory where hints files are stored if hinted handoff is enabled.
|
# The directory where hints files are stored if hinted handoff is enabled.
|
||||||
# hints_directory: /var/lib/scylla/hints
|
# hints_directory: /var/lib/scylla/hints
|
||||||
|
|
||||||
|
# The directory where hints files are stored for materialized-view updates
|
||||||
|
# view_hints_directory: /var/lib/scylla/view_hints
|
||||||
|
|
||||||
# See http://wiki.apache.org/cassandra/HintedHandoff
|
# See http://wiki.apache.org/cassandra/HintedHandoff
|
||||||
# May either be "true" or "false" to enable globally, or contain a list
|
# May either be "true" or "false" to enable globally, or contain a list
|
||||||
|
|||||||
36
configure.py
36
configure.py
@@ -197,7 +197,9 @@ class Thrift(object):
|
|||||||
|
|
||||||
def default_target_arch():
|
def default_target_arch():
|
||||||
if platform.machine() in ['i386', 'i686', 'x86_64']:
|
if platform.machine() in ['i386', 'i686', 'x86_64']:
|
||||||
return 'nehalem'
|
return 'westmere' # support PCLMUL
|
||||||
|
elif platform.machine() == 'aarch64':
|
||||||
|
return 'armv8-a+crc+crypto'
|
||||||
else:
|
else:
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
@@ -271,6 +273,8 @@ scylla_tests = [
|
|||||||
'tests/perf/perf_sstable',
|
'tests/perf/perf_sstable',
|
||||||
'tests/cql_query_test',
|
'tests/cql_query_test',
|
||||||
'tests/secondary_index_test',
|
'tests/secondary_index_test',
|
||||||
|
'tests/json_cql_query_test',
|
||||||
|
'tests/filtering_test',
|
||||||
'tests/storage_proxy_test',
|
'tests/storage_proxy_test',
|
||||||
'tests/schema_change_test',
|
'tests/schema_change_test',
|
||||||
'tests/mutation_reader_test',
|
'tests/mutation_reader_test',
|
||||||
@@ -306,6 +310,7 @@ scylla_tests = [
|
|||||||
'tests/log_heap_test',
|
'tests/log_heap_test',
|
||||||
'tests/managed_vector_test',
|
'tests/managed_vector_test',
|
||||||
'tests/crc_test',
|
'tests/crc_test',
|
||||||
|
'tests/checksum_utils_test',
|
||||||
'tests/flush_queue_test',
|
'tests/flush_queue_test',
|
||||||
'tests/dynamic_bitset_test',
|
'tests/dynamic_bitset_test',
|
||||||
'tests/auth_test',
|
'tests/auth_test',
|
||||||
@@ -356,6 +361,7 @@ scylla_tests = [
|
|||||||
|
|
||||||
perf_tests = [
|
perf_tests = [
|
||||||
'tests/perf/perf_mutation_readers',
|
'tests/perf/perf_mutation_readers',
|
||||||
|
'tests/perf/perf_checksum',
|
||||||
'tests/perf/perf_mutation_fragment',
|
'tests/perf/perf_mutation_fragment',
|
||||||
'tests/perf/perf_idl',
|
'tests/perf/perf_idl',
|
||||||
]
|
]
|
||||||
@@ -431,6 +437,7 @@ extra_cxxflags = {}
|
|||||||
cassandra_interface = Thrift(source='interface/cassandra.thrift', service='Cassandra')
|
cassandra_interface = Thrift(source='interface/cassandra.thrift', service='Cassandra')
|
||||||
|
|
||||||
scylla_core = (['database.cc',
|
scylla_core = (['database.cc',
|
||||||
|
'table.cc',
|
||||||
'atomic_cell.cc',
|
'atomic_cell.cc',
|
||||||
'schema.cc',
|
'schema.cc',
|
||||||
'frozen_schema.cc',
|
'frozen_schema.cc',
|
||||||
@@ -461,6 +468,7 @@ scylla_core = (['database.cc',
|
|||||||
'compress.cc',
|
'compress.cc',
|
||||||
'sstables/mp_row_consumer.cc',
|
'sstables/mp_row_consumer.cc',
|
||||||
'sstables/sstables.cc',
|
'sstables/sstables.cc',
|
||||||
|
'sstables/mc/writer.cc',
|
||||||
'sstables/sstable_version.cc',
|
'sstables/sstable_version.cc',
|
||||||
'sstables/compress.cc',
|
'sstables/compress.cc',
|
||||||
'sstables/row.cc',
|
'sstables/row.cc',
|
||||||
@@ -470,7 +478,6 @@ scylla_core = (['database.cc',
|
|||||||
'sstables/compaction_manager.cc',
|
'sstables/compaction_manager.cc',
|
||||||
'sstables/integrity_checked_file_impl.cc',
|
'sstables/integrity_checked_file_impl.cc',
|
||||||
'sstables/prepended_input_stream.cc',
|
'sstables/prepended_input_stream.cc',
|
||||||
'sstables/m_format_write_helpers.cc',
|
|
||||||
'sstables/m_format_read_helpers.cc',
|
'sstables/m_format_read_helpers.cc',
|
||||||
'transport/event.cc',
|
'transport/event.cc',
|
||||||
'transport/event_notifier.cc',
|
'transport/event_notifier.cc',
|
||||||
@@ -564,6 +571,7 @@ scylla_core = (['database.cc',
|
|||||||
'db/consistency_level.cc',
|
'db/consistency_level.cc',
|
||||||
'db/system_keyspace.cc',
|
'db/system_keyspace.cc',
|
||||||
'db/system_distributed_keyspace.cc',
|
'db/system_distributed_keyspace.cc',
|
||||||
|
'db/size_estimates_virtual_reader.cc',
|
||||||
'db/schema_tables.cc',
|
'db/schema_tables.cc',
|
||||||
'db/cql_type_parser.cc',
|
'db/cql_type_parser.cc',
|
||||||
'db/legacy_schema_migrator.cc',
|
'db/legacy_schema_migrator.cc',
|
||||||
@@ -579,6 +587,7 @@ scylla_core = (['database.cc',
|
|||||||
'db/marshal/type_parser.cc',
|
'db/marshal/type_parser.cc',
|
||||||
'db/batchlog_manager.cc',
|
'db/batchlog_manager.cc',
|
||||||
'db/view/view.cc',
|
'db/view/view.cc',
|
||||||
|
'db/view/view_update_from_staging_generator.cc',
|
||||||
'db/view/row_locking.cc',
|
'db/view/row_locking.cc',
|
||||||
'index/secondary_index_manager.cc',
|
'index/secondary_index_manager.cc',
|
||||||
'index/secondary_index.cc',
|
'index/secondary_index.cc',
|
||||||
@@ -592,6 +601,7 @@ scylla_core = (['database.cc',
|
|||||||
'utils/managed_bytes.cc',
|
'utils/managed_bytes.cc',
|
||||||
'utils/exceptions.cc',
|
'utils/exceptions.cc',
|
||||||
'utils/config_file.cc',
|
'utils/config_file.cc',
|
||||||
|
'utils/gz/crc_combine.cc',
|
||||||
'gms/version_generator.cc',
|
'gms/version_generator.cc',
|
||||||
'gms/versioned_value.cc',
|
'gms/versioned_value.cc',
|
||||||
'gms/gossiper.cc',
|
'gms/gossiper.cc',
|
||||||
@@ -682,6 +692,7 @@ scylla_core = (['database.cc',
|
|||||||
'data/cell.cc',
|
'data/cell.cc',
|
||||||
'multishard_writer.cc',
|
'multishard_writer.cc',
|
||||||
'multishard_mutation_query.cc',
|
'multishard_mutation_query.cc',
|
||||||
|
'reader_concurrency_semaphore.cc',
|
||||||
] + [Antlr3Grammar('cql3/Cql.g')] + [Thrift('interface/cassandra.thrift', 'Cassandra')]
|
] + [Antlr3Grammar('cql3/Cql.g')] + [Thrift('interface/cassandra.thrift', 'Cassandra')]
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -744,6 +755,7 @@ idls = ['idl/gossip_digest.idl.hh',
|
|||||||
'idl/tracing.idl.hh',
|
'idl/tracing.idl.hh',
|
||||||
'idl/consistency_level.idl.hh',
|
'idl/consistency_level.idl.hh',
|
||||||
'idl/cache_temperature.idl.hh',
|
'idl/cache_temperature.idl.hh',
|
||||||
|
'idl/view.idl.hh',
|
||||||
]
|
]
|
||||||
|
|
||||||
scylla_tests_dependencies = scylla_core + idls + [
|
scylla_tests_dependencies = scylla_core + idls + [
|
||||||
@@ -773,6 +785,7 @@ pure_boost_tests = set([
|
|||||||
'tests/test-serialization',
|
'tests/test-serialization',
|
||||||
'tests/range_test',
|
'tests/range_test',
|
||||||
'tests/crc_test',
|
'tests/crc_test',
|
||||||
|
'tests/checksum_utils_test',
|
||||||
'tests/managed_vector_test',
|
'tests/managed_vector_test',
|
||||||
'tests/dynamic_bitset_test',
|
'tests/dynamic_bitset_test',
|
||||||
'tests/idl_test',
|
'tests/idl_test',
|
||||||
@@ -1001,6 +1014,8 @@ seastar_ldflags = args.user_ldflags
|
|||||||
seastar_flags += ['--compiler', args.cxx, '--c-compiler', args.cc, '--cflags=%s' % (seastar_cflags), '--ldflags=%s' % (seastar_ldflags),
|
seastar_flags += ['--compiler', args.cxx, '--c-compiler', args.cc, '--cflags=%s' % (seastar_cflags), '--ldflags=%s' % (seastar_ldflags),
|
||||||
'--c++-dialect=gnu++1z', '--optflags=%s' % (modes['release']['opt']), ]
|
'--c++-dialect=gnu++1z', '--optflags=%s' % (modes['release']['opt']), ]
|
||||||
|
|
||||||
|
libdeflate_cflags = seastar_cflags
|
||||||
|
|
||||||
status = subprocess.call([args.python, './configure.py'] + seastar_flags, cwd='seastar')
|
status = subprocess.call([args.python, './configure.py'] + seastar_flags, cwd='seastar')
|
||||||
|
|
||||||
if status != 0:
|
if status != 0:
|
||||||
@@ -1100,6 +1115,9 @@ with open(buildfile, 'w') as f:
|
|||||||
command = {ninja} -C $subdir $target
|
command = {ninja} -C $subdir $target
|
||||||
restat = 1
|
restat = 1
|
||||||
description = NINJA $out
|
description = NINJA $out
|
||||||
|
rule run
|
||||||
|
command = $in > $out
|
||||||
|
description = GEN $out
|
||||||
rule copy
|
rule copy
|
||||||
command = cp $in $out
|
command = cp $in $out
|
||||||
description = COPY $out
|
description = COPY $out
|
||||||
@@ -1172,6 +1190,10 @@ with open(buildfile, 'w') as f:
|
|||||||
if binary.endswith('.a'):
|
if binary.endswith('.a'):
|
||||||
f.write('build $builddir/{}/{}: ar.{} {}\n'.format(mode, binary, mode, str.join(' ', objs)))
|
f.write('build $builddir/{}/{}: ar.{} {}\n'.format(mode, binary, mode, str.join(' ', objs)))
|
||||||
else:
|
else:
|
||||||
|
objs.extend(['$builddir/' + mode + '/' + artifact for artifact in [
|
||||||
|
'libdeflate/libdeflate.a'
|
||||||
|
]])
|
||||||
|
objs.append('$builddir/' + mode + '/gen/utils/gz/crc_combine_table.o')
|
||||||
if binary.startswith('tests/'):
|
if binary.startswith('tests/'):
|
||||||
local_libs = '$libs'
|
local_libs = '$libs'
|
||||||
if binary not in tests_not_using_seastar_test_framework or binary in pure_boost_tests:
|
if binary not in tests_not_using_seastar_test_framework or binary in pure_boost_tests:
|
||||||
@@ -1213,6 +1235,12 @@ with open(buildfile, 'w') as f:
|
|||||||
antlr3_grammars.add(src)
|
antlr3_grammars.add(src)
|
||||||
else:
|
else:
|
||||||
raise Exception('No rule for ' + src)
|
raise Exception('No rule for ' + src)
|
||||||
|
compiles['$builddir/' + mode + '/gen/utils/gz/crc_combine_table.o'] = '$builddir/' + mode + '/gen/utils/gz/crc_combine_table.cc'
|
||||||
|
compiles['$builddir/' + mode + '/utils/gz/gen_crc_combine_table.o'] = 'utils/gz/gen_crc_combine_table.cc'
|
||||||
|
f.write('build {}: run {}\n'.format('$builddir/' + mode + '/gen/utils/gz/crc_combine_table.cc',
|
||||||
|
'$builddir/' + mode + '/utils/gz/gen_crc_combine_table'))
|
||||||
|
f.write('build {}: link.{} {}\n'.format('$builddir/' + mode + '/utils/gz/gen_crc_combine_table', mode,
|
||||||
|
'$builddir/' + mode + '/utils/gz/gen_crc_combine_table.o'))
|
||||||
for obj in compiles:
|
for obj in compiles:
|
||||||
src = compiles[obj]
|
src = compiles[obj]
|
||||||
gen_headers = list(ragels.keys())
|
gen_headers = list(ragels.keys())
|
||||||
@@ -1262,6 +1290,10 @@ with open(buildfile, 'w') as f:
|
|||||||
''').format(**locals()))
|
''').format(**locals()))
|
||||||
f.write('build build/$mode/scylla-package.tar: package build/{mode}/scylla build/{mode}/iotune\n'.format(**locals()))
|
f.write('build build/$mode/scylla-package.tar: package build/{mode}/scylla build/{mode}/iotune\n'.format(**locals()))
|
||||||
f.write(' mode = {mode}\n'.format(**locals()))
|
f.write(' mode = {mode}\n'.format(**locals()))
|
||||||
|
f.write('rule libdeflate.{mode}\n'.format(**locals()))
|
||||||
|
f.write(' command = make -C libdeflate BUILD_DIR=../build/{mode}/libdeflate/ CFLAGS="{libdeflate_cflags}" CC={args.cc} ../build/{mode}/libdeflate//libdeflate.a\n'.format(**locals()))
|
||||||
|
f.write('build build/{mode}/libdeflate/libdeflate.a: libdeflate.{mode}\n'.format(**locals()))
|
||||||
|
|
||||||
f.write('build {}: phony\n'.format(seastar_deps))
|
f.write('build {}: phony\n'.format(seastar_deps))
|
||||||
f.write(textwrap.dedent('''\
|
f.write(textwrap.dedent('''\
|
||||||
rule configure
|
rule configure
|
||||||
|
|||||||
@@ -38,44 +38,44 @@ private:
|
|||||||
static bool is_compatible(const column_definition& new_def, const data_type& old_type, column_kind kind) {
|
static bool is_compatible(const column_definition& new_def, const data_type& old_type, column_kind kind) {
|
||||||
return ::is_compatible(new_def.kind, kind) && new_def.type->is_value_compatible_with(*old_type);
|
return ::is_compatible(new_def.kind, kind) && new_def.type->is_value_compatible_with(*old_type);
|
||||||
}
|
}
|
||||||
|
static atomic_cell upgrade_cell(const abstract_type& new_type, const abstract_type& old_type, atomic_cell_view cell,
|
||||||
|
atomic_cell::collection_member cm = atomic_cell::collection_member::no) {
|
||||||
|
if (cell.is_live() && !old_type.is_counter()) {
|
||||||
|
if (cell.is_live_and_has_ttl()) {
|
||||||
|
return atomic_cell::make_live(new_type, cell.timestamp(), cell.value().linearize(), cell.expiry(), cell.ttl(), cm);
|
||||||
|
}
|
||||||
|
return atomic_cell::make_live(new_type, cell.timestamp(), cell.value().linearize(), cm);
|
||||||
|
} else {
|
||||||
|
return atomic_cell(new_type, cell);
|
||||||
|
}
|
||||||
|
}
|
||||||
static void accept_cell(row& dst, column_kind kind, const column_definition& new_def, const data_type& old_type, atomic_cell_view cell) {
|
static void accept_cell(row& dst, column_kind kind, const column_definition& new_def, const data_type& old_type, atomic_cell_view cell) {
|
||||||
if (!is_compatible(new_def, old_type, kind) || cell.timestamp() <= new_def.dropped_at()) {
|
if (!is_compatible(new_def, old_type, kind) || cell.timestamp() <= new_def.dropped_at()) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
auto new_cell = [&] {
|
dst.apply(new_def, upgrade_cell(*new_def.type, *old_type, cell));
|
||||||
if (cell.is_live() && !old_type->is_counter()) {
|
|
||||||
if (cell.is_live_and_has_ttl()) {
|
|
||||||
return atomic_cell_or_collection(
|
|
||||||
atomic_cell::make_live(*new_def.type, cell.timestamp(), cell.value().linearize(), cell.expiry(), cell.ttl())
|
|
||||||
);
|
|
||||||
}
|
|
||||||
return atomic_cell_or_collection(
|
|
||||||
atomic_cell::make_live(*new_def.type, cell.timestamp(), cell.value().linearize())
|
|
||||||
);
|
|
||||||
} else {
|
|
||||||
return atomic_cell_or_collection(*new_def.type, cell);
|
|
||||||
}
|
|
||||||
}();
|
|
||||||
dst.apply(new_def, std::move(new_cell));
|
|
||||||
}
|
}
|
||||||
static void accept_cell(row& dst, column_kind kind, const column_definition& new_def, const data_type& old_type, collection_mutation_view cell) {
|
static void accept_cell(row& dst, column_kind kind, const column_definition& new_def, const data_type& old_type, collection_mutation_view cell) {
|
||||||
if (!is_compatible(new_def, old_type, kind)) {
|
if (!is_compatible(new_def, old_type, kind)) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
cell.data.with_linearized([&] (bytes_view cell_bv) {
|
cell.data.with_linearized([&] (bytes_view cell_bv) {
|
||||||
auto&& ctype = static_pointer_cast<const collection_type_impl>(old_type);
|
auto new_ctype = static_pointer_cast<const collection_type_impl>(new_def.type);
|
||||||
auto old_view = ctype->deserialize_mutation_form(cell_bv);
|
auto old_ctype = static_pointer_cast<const collection_type_impl>(old_type);
|
||||||
|
auto old_view = old_ctype->deserialize_mutation_form(cell_bv);
|
||||||
|
|
||||||
collection_type_impl::mutation_view new_view;
|
collection_type_impl::mutation new_view;
|
||||||
if (old_view.tomb.timestamp > new_def.dropped_at()) {
|
if (old_view.tomb.timestamp > new_def.dropped_at()) {
|
||||||
new_view.tomb = old_view.tomb;
|
new_view.tomb = old_view.tomb;
|
||||||
}
|
}
|
||||||
for (auto& c : old_view.cells) {
|
for (auto& c : old_view.cells) {
|
||||||
if (c.second.timestamp() > new_def.dropped_at()) {
|
if (c.second.timestamp() > new_def.dropped_at()) {
|
||||||
new_view.cells.emplace_back(std::move(c));
|
new_view.cells.emplace_back(c.first, upgrade_cell(*new_ctype->value_comparator(), *old_ctype->value_comparator(), c.second, atomic_cell::collection_member::yes));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
dst.apply(new_def, ctype->serialize_mutation_form(std::move(new_view)));
|
if (new_view.tomb || !new_view.cells.empty()) {
|
||||||
|
dst.apply(new_def, new_ctype->serialize_mutation_form(std::move(new_view)));
|
||||||
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
public:
|
public:
|
||||||
|
|||||||
@@ -470,6 +470,7 @@ insertStatement returns [::shared_ptr<raw::modification_statement> expr]
|
|||||||
std::vector<::shared_ptr<cql3::column_identifier::raw>> column_names;
|
std::vector<::shared_ptr<cql3::column_identifier::raw>> column_names;
|
||||||
std::vector<::shared_ptr<cql3::term::raw>> values;
|
std::vector<::shared_ptr<cql3::term::raw>> values;
|
||||||
bool if_not_exists = false;
|
bool if_not_exists = false;
|
||||||
|
bool default_unset = false;
|
||||||
::shared_ptr<cql3::term::raw> json_value;
|
::shared_ptr<cql3::term::raw> json_value;
|
||||||
}
|
}
|
||||||
: K_INSERT K_INTO cf=columnFamilyName
|
: K_INSERT K_INTO cf=columnFamilyName
|
||||||
@@ -487,13 +488,15 @@ insertStatement returns [::shared_ptr<raw::modification_statement> expr]
|
|||||||
}
|
}
|
||||||
| K_JSON
|
| K_JSON
|
||||||
json_token=jsonValue { json_value = $json_token.value; }
|
json_token=jsonValue { json_value = $json_token.value; }
|
||||||
|
( K_DEFAULT K_UNSET { default_unset = true; } | K_DEFAULT K_NULL )?
|
||||||
( K_IF K_NOT K_EXISTS { if_not_exists = true; } )?
|
( K_IF K_NOT K_EXISTS { if_not_exists = true; } )?
|
||||||
( usingClause[attrs] )?
|
( usingClause[attrs] )?
|
||||||
{
|
{
|
||||||
$expr = ::make_shared<raw::insert_json_statement>(std::move(cf),
|
$expr = ::make_shared<raw::insert_json_statement>(std::move(cf),
|
||||||
std::move(attrs),
|
std::move(attrs),
|
||||||
std::move(json_value),
|
std::move(json_value),
|
||||||
if_not_exists);
|
if_not_exists,
|
||||||
|
default_unset);
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
;
|
;
|
||||||
@@ -1835,6 +1838,8 @@ K_OR: O R;
|
|||||||
K_REPLACE: R E P L A C E;
|
K_REPLACE: R E P L A C E;
|
||||||
K_DETERMINISTIC: D E T E R M I N I S T I C;
|
K_DETERMINISTIC: D E T E R M I N I S T I C;
|
||||||
K_JSON: J S O N;
|
K_JSON: J S O N;
|
||||||
|
K_DEFAULT: D E F A U L T;
|
||||||
|
K_UNSET: U N S E T;
|
||||||
|
|
||||||
K_EMPTY: E M P T Y;
|
K_EMPTY: E M P T Y;
|
||||||
|
|
||||||
|
|||||||
@@ -67,6 +67,12 @@ class error_collector : public error_listener<RecognizerType, ExceptionBaseType>
|
|||||||
*/
|
*/
|
||||||
const sstring_view _query;
|
const sstring_view _query;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* An empty bitset to be used as a workaround for AntLR null dereference
|
||||||
|
* bug.
|
||||||
|
*/
|
||||||
|
static typename ExceptionBaseType::BitsetListType _empty_bit_list;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -144,6 +150,14 @@ private:
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
default:
|
default:
|
||||||
|
// AntLR Exception class has a bug of dereferencing a null
|
||||||
|
// pointer in the displayRecognitionError. The following
|
||||||
|
// if statement makes sure it will not be null before the
|
||||||
|
// call to that function (displayRecognitionError).
|
||||||
|
// bug reference: https://github.com/antlr/antlr3/issues/191
|
||||||
|
if (!ex->get_expectingSet()) {
|
||||||
|
ex->set_expectingSet(&_empty_bit_list);
|
||||||
|
}
|
||||||
ex->displayRecognitionError(token_names, msg);
|
ex->displayRecognitionError(token_names, msg);
|
||||||
}
|
}
|
||||||
return msg.str();
|
return msg.str();
|
||||||
@@ -345,4 +359,8 @@ private:
|
|||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template<typename RecognizerType, typename TokenType, typename ExceptionBaseType>
|
||||||
|
typename ExceptionBaseType::BitsetListType
|
||||||
|
error_collector<RecognizerType,TokenType,ExceptionBaseType>::_empty_bit_list = typename ExceptionBaseType::BitsetListType();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -130,6 +130,18 @@ query_options::query_options(std::unique_ptr<query_options> qo, ::shared_ptr<ser
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
query_options::query_options(std::unique_ptr<query_options> qo, ::shared_ptr<service::pager::paging_state> paging_state, int32_t page_size)
|
||||||
|
: query_options(qo->_consistency,
|
||||||
|
qo->get_timeout_config(),
|
||||||
|
std::move(qo->_names),
|
||||||
|
std::move(qo->_values),
|
||||||
|
std::move(qo->_value_views),
|
||||||
|
qo->_skip_metadata,
|
||||||
|
std::move(query_options::specific_options{page_size, paging_state, qo->_options.serial_consistency, qo->_options.timestamp}),
|
||||||
|
qo->_cql_serialization_format) {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
query_options::query_options(std::vector<cql3::raw_value> values)
|
query_options::query_options(std::vector<cql3::raw_value> values)
|
||||||
: query_options(
|
: query_options(
|
||||||
db::consistency_level::ONE, infinite_timeout_config, std::move(values))
|
db::consistency_level::ONE, infinite_timeout_config, std::move(values))
|
||||||
|
|||||||
@@ -102,7 +102,7 @@ private:
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
query_options(query_options&&) = default;
|
query_options(query_options&&) = default;
|
||||||
query_options(const query_options&) = delete;
|
explicit query_options(const query_options&) = default;
|
||||||
|
|
||||||
explicit query_options(db::consistency_level consistency,
|
explicit query_options(db::consistency_level consistency,
|
||||||
const timeout_config& timeouts,
|
const timeout_config& timeouts,
|
||||||
@@ -155,6 +155,7 @@ public:
|
|||||||
explicit query_options(db::consistency_level, const timeout_config& timeouts,
|
explicit query_options(db::consistency_level, const timeout_config& timeouts,
|
||||||
std::vector<cql3::raw_value> values, specific_options options = specific_options::DEFAULT);
|
std::vector<cql3::raw_value> values, specific_options options = specific_options::DEFAULT);
|
||||||
explicit query_options(std::unique_ptr<query_options>, ::shared_ptr<service::pager::paging_state> paging_state);
|
explicit query_options(std::unique_ptr<query_options>, ::shared_ptr<service::pager::paging_state> paging_state);
|
||||||
|
explicit query_options(std::unique_ptr<query_options>, ::shared_ptr<service::pager::paging_state> paging_state, int32_t page_size);
|
||||||
|
|
||||||
const timeout_config& get_timeout_config() const { return _timeout_config; }
|
const timeout_config& get_timeout_config() const { return _timeout_config; }
|
||||||
|
|
||||||
|
|||||||
@@ -100,12 +100,28 @@ public:
|
|||||||
bool has_unrestricted_components(const schema& schema) const;
|
bool has_unrestricted_components(const schema& schema) const;
|
||||||
|
|
||||||
virtual bool needs_filtering(const schema& schema) const;
|
virtual bool needs_filtering(const schema& schema) const;
|
||||||
|
|
||||||
|
// How long a prefix of the restrictions could have resulted in
|
||||||
|
// need_filtering() == false. These restrictions do not need to be
|
||||||
|
// applied during filtering.
|
||||||
|
// For example, if we have the filter "c1 < 3 and c2 > 3", c1 does
|
||||||
|
// not need filtering (just a read stopping at c1=3) but c2 does,
|
||||||
|
// so num_prefix_columns_that_need_not_be_filtered() will be 1.
|
||||||
|
virtual unsigned int num_prefix_columns_that_need_not_be_filtered() const {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
virtual bool is_all_eq() const {
|
virtual bool is_all_eq() const {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
virtual size_t prefix_size() const {
|
virtual size_t prefix_size() const {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
size_t prefix_size(const schema_ptr schema) const {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
@@ -129,5 +145,23 @@ inline bool primary_key_restrictions<clustering_key>::needs_filtering(const sche
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<>
|
||||||
|
inline size_t primary_key_restrictions<clustering_key>::prefix_size(const schema_ptr schema) const {
|
||||||
|
size_t count = 0;
|
||||||
|
if (schema->clustering_key_columns().empty()) {
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
auto column_defs = get_column_defs();
|
||||||
|
column_id expected_column_id = schema->clustering_key_columns().begin()->id;
|
||||||
|
for (auto&& cdef : column_defs) {
|
||||||
|
if (schema->position(*cdef) != expected_column_id) {
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
expected_column_id++;
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -166,19 +166,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
virtual size_t prefix_size() const override {
|
virtual size_t prefix_size() const override {
|
||||||
size_t count = 0;
|
return primary_key_restrictions<ValueType>::prefix_size(_schema);
|
||||||
if (_schema->clustering_key_columns().empty()) {
|
|
||||||
return count;
|
|
||||||
}
|
|
||||||
column_id expected_column_id = _schema->clustering_key_columns().begin()->id;
|
|
||||||
for (const auto& restriction_entry : _restrictions->restrictions()) {
|
|
||||||
if (_schema->position(*restriction_entry.first) != expected_column_id) {
|
|
||||||
return count;
|
|
||||||
}
|
|
||||||
expected_column_id++;
|
|
||||||
count++;
|
|
||||||
}
|
|
||||||
return count;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
::shared_ptr<single_column_primary_key_restrictions<clustering_key>> get_longest_prefix_restrictions() {
|
::shared_ptr<single_column_primary_key_restrictions<clustering_key>> get_longest_prefix_restrictions() {
|
||||||
@@ -419,6 +407,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
virtual bool needs_filtering(const schema& schema) const override;
|
virtual bool needs_filtering(const schema& schema) const override;
|
||||||
|
virtual unsigned int num_prefix_columns_that_need_not_be_filtered() const override;
|
||||||
};
|
};
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
@@ -499,6 +488,39 @@ inline bool single_column_primary_key_restrictions<clustering_key>::needs_filter
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// How many of the restrictions (in column order) do not need filtering
|
||||||
|
// because they are implemented as a slice (potentially, a contiguous disk
|
||||||
|
// read). For example, if we have the filter "c1 < 3 and c2 > 3", c1 does not
|
||||||
|
// need filtering but c2 does so num_prefix_columns_that_need_not_be_filtered
|
||||||
|
// will be 1.
|
||||||
|
// The implementation of num_prefix_columns_that_need_not_be_filtered() is
|
||||||
|
// closely tied to that of needs_filtering() above - basically, if only the
|
||||||
|
// first num_prefix_columns_that_need_not_be_filtered() restrictions existed,
|
||||||
|
// then needs_filtering() would have returned false.
|
||||||
|
template<>
|
||||||
|
inline unsigned single_column_primary_key_restrictions<clustering_key>::num_prefix_columns_that_need_not_be_filtered() const {
|
||||||
|
column_id position = 0;
|
||||||
|
unsigned int count = 0;
|
||||||
|
for (const auto& restriction : _restrictions->restrictions() | boost::adaptors::map_values) {
|
||||||
|
if (restriction->is_contains() || position != restriction->get_column_def().id) {
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
if (!restriction->is_slice()) {
|
||||||
|
position = restriction->get_column_def().id + 1;
|
||||||
|
}
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<>
|
||||||
|
inline unsigned single_column_primary_key_restrictions<partition_key>::num_prefix_columns_that_need_not_be_filtered() const {
|
||||||
|
// skip_filtering() is currently called only for clustering key
|
||||||
|
// restrictions, so it doesn't matter what we return here.
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -214,11 +214,9 @@ statement_restrictions::statement_restrictions(database& db,
|
|||||||
}
|
}
|
||||||
auto& cf = db.find_column_family(schema);
|
auto& cf = db.find_column_family(schema);
|
||||||
auto& sim = cf.get_index_manager();
|
auto& sim = cf.get_index_manager();
|
||||||
bool has_queriable_clustering_column_index = _clustering_columns_restrictions->has_supporting_index(sim);
|
const bool has_queriable_clustering_column_index = _clustering_columns_restrictions->has_supporting_index(sim);
|
||||||
bool has_queriable_pk_index = _partition_key_restrictions->has_supporting_index(sim);
|
const bool has_queriable_pk_index = _partition_key_restrictions->has_supporting_index(sim);
|
||||||
bool has_queriable_index = has_queriable_clustering_column_index
|
const bool has_queriable_regular_index = _nonprimary_key_restrictions->has_supporting_index(sim);
|
||||||
|| has_queriable_pk_index
|
|
||||||
|| _nonprimary_key_restrictions->has_supporting_index(sim);
|
|
||||||
|
|
||||||
// At this point, the select statement if fully constructed, but we still have a few things to validate
|
// At this point, the select statement if fully constructed, but we still have a few things to validate
|
||||||
process_partition_key_restrictions(has_queriable_pk_index, for_view, allow_filtering);
|
process_partition_key_restrictions(has_queriable_pk_index, for_view, allow_filtering);
|
||||||
@@ -279,7 +277,7 @@ statement_restrictions::statement_restrictions(database& db,
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (!_nonprimary_key_restrictions->empty()) {
|
if (!_nonprimary_key_restrictions->empty()) {
|
||||||
if (has_queriable_index) {
|
if (has_queriable_regular_index) {
|
||||||
_uses_secondary_indexing = true;
|
_uses_secondary_indexing = true;
|
||||||
} else if (!allow_filtering) {
|
} else if (!allow_filtering) {
|
||||||
throw exceptions::invalid_request_exception("Cannot execute this query as it might involve data filtering and "
|
throw exceptions::invalid_request_exception("Cannot execute this query as it might involve data filtering and "
|
||||||
@@ -337,6 +335,53 @@ const std::vector<::shared_ptr<restrictions>>& statement_restrictions::index_res
|
|||||||
return _index_restrictions;
|
return _index_restrictions;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::optional<secondary_index::index> statement_restrictions::find_idx(secondary_index::secondary_index_manager& sim) const {
|
||||||
|
for (::shared_ptr<cql3::restrictions::restrictions> restriction : index_restrictions()) {
|
||||||
|
for (const auto& cdef : restriction->get_column_defs()) {
|
||||||
|
for (auto index : sim.list_indexes()) {
|
||||||
|
if (index.depends_on(*cdef)) {
|
||||||
|
return std::make_optional<secondary_index::index>(std::move(index));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return std::nullopt;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<const column_definition*> statement_restrictions::get_column_defs_for_filtering(database& db) const {
|
||||||
|
std::vector<const column_definition*> column_defs_for_filtering;
|
||||||
|
if (need_filtering()) {
|
||||||
|
auto& sim = db.find_column_family(_schema).get_index_manager();
|
||||||
|
std::optional<secondary_index::index> opt_idx = find_idx(sim);
|
||||||
|
auto column_uses_indexing = [&opt_idx] (const column_definition* cdef) {
|
||||||
|
return opt_idx && opt_idx->depends_on(*cdef);
|
||||||
|
};
|
||||||
|
if (_partition_key_restrictions->needs_filtering(*_schema)) {
|
||||||
|
for (auto&& cdef : _partition_key_restrictions->get_column_defs()) {
|
||||||
|
if (!column_uses_indexing(cdef)) {
|
||||||
|
column_defs_for_filtering.emplace_back(cdef);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
const bool pk_has_unrestricted_components = _partition_key_restrictions->has_unrestricted_components(*_schema);
|
||||||
|
if (pk_has_unrestricted_components || _clustering_columns_restrictions->needs_filtering(*_schema)) {
|
||||||
|
column_id first_filtering_id = pk_has_unrestricted_components ? 0 : _schema->clustering_key_columns().begin()->id +
|
||||||
|
_clustering_columns_restrictions->num_prefix_columns_that_need_not_be_filtered();
|
||||||
|
for (auto&& cdef : _clustering_columns_restrictions->get_column_defs()) {
|
||||||
|
if (cdef->id >= first_filtering_id && !column_uses_indexing(cdef)) {
|
||||||
|
column_defs_for_filtering.emplace_back(cdef);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (auto&& cdef : _nonprimary_key_restrictions->get_column_defs()) {
|
||||||
|
if (!column_uses_indexing(cdef)) {
|
||||||
|
column_defs_for_filtering.emplace_back(cdef);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return column_defs_for_filtering;
|
||||||
|
}
|
||||||
|
|
||||||
void statement_restrictions::process_partition_key_restrictions(bool has_queriable_index, bool for_view, bool allow_filtering) {
|
void statement_restrictions::process_partition_key_restrictions(bool has_queriable_index, bool for_view, bool allow_filtering) {
|
||||||
// If there is a queriable index, no special condition are required on the other restrictions.
|
// If there is a queriable index, no special condition are required on the other restrictions.
|
||||||
// But we still need to know 2 things:
|
// But we still need to know 2 things:
|
||||||
@@ -435,10 +480,9 @@ bool statement_restrictions::need_filtering() const {
|
|||||||
int number_of_filtering_restrictions = _nonprimary_key_restrictions->size();
|
int number_of_filtering_restrictions = _nonprimary_key_restrictions->size();
|
||||||
// If the whole partition key is restricted, it does not imply filtering
|
// If the whole partition key is restricted, it does not imply filtering
|
||||||
if (_partition_key_restrictions->has_unrestricted_components(*_schema) || !_partition_key_restrictions->is_all_eq()) {
|
if (_partition_key_restrictions->has_unrestricted_components(*_schema) || !_partition_key_restrictions->is_all_eq()) {
|
||||||
number_of_filtering_restrictions += _partition_key_restrictions->size();
|
number_of_filtering_restrictions += _partition_key_restrictions->size() + _clustering_columns_restrictions->size();
|
||||||
if (_clustering_columns_restrictions->has_unrestricted_components(*_schema)) {
|
} else if (_clustering_columns_restrictions->has_unrestricted_components(*_schema)) {
|
||||||
number_of_filtering_restrictions += _clustering_columns_restrictions->size() - _clustering_columns_restrictions->prefix_size();
|
number_of_filtering_restrictions += _clustering_columns_restrictions->size() - _clustering_columns_restrictions->prefix_size();
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (_partition_key_restrictions->is_multi_column() || _clustering_columns_restrictions->is_multi_column()) {
|
if (_partition_key_restrictions->is_multi_column() || _clustering_columns_restrictions->is_multi_column()) {
|
||||||
|
|||||||
@@ -163,6 +163,20 @@ public:
|
|||||||
return _clustering_columns_restrictions;
|
return _clustering_columns_restrictions;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds a possibly empty collection of column definitions that will be used for filtering
|
||||||
|
* @param db - the database context
|
||||||
|
* @return A list with the column definitions needed for filtering.
|
||||||
|
*/
|
||||||
|
std::vector<const column_definition*> get_column_defs_for_filtering(database& db) const;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Determines the index to be used with the restriction.
|
||||||
|
* @param db - the database context (for extracting index manager)
|
||||||
|
* @return If an index can be used, an optional containing this index, otherwise an empty optional.
|
||||||
|
*/
|
||||||
|
std::optional<secondary_index::index> find_idx(secondary_index::secondary_index_manager& sim) const;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Checks if the partition key has some unrestricted components.
|
* Checks if the partition key has some unrestricted components.
|
||||||
* @return <code>true</code> if the partition key has some unrestricted components, <code>false</code> otherwise.
|
* @return <code>true</code> if the partition key has some unrestricted components, <code>false</code> otherwise.
|
||||||
@@ -381,6 +395,14 @@ public:
|
|||||||
return !_nonprimary_key_restrictions->empty();
|
return !_nonprimary_key_restrictions->empty();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool pk_restrictions_need_filtering() const {
|
||||||
|
return _partition_key_restrictions->needs_filtering(*_schema);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool ck_restrictions_need_filtering() const {
|
||||||
|
return _partition_key_restrictions->has_unrestricted_components(*_schema) || _clustering_columns_restrictions->needs_filtering(*_schema);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @return true if column is restricted by some restriction, false otherwise
|
* @return true if column is restricted by some restriction, false otherwise
|
||||||
*/
|
*/
|
||||||
|
|||||||
@@ -83,6 +83,9 @@ void metadata::maybe_set_paging_state(::shared_ptr<const service::pager::paging_
|
|||||||
assert(paging_state);
|
assert(paging_state);
|
||||||
if (paging_state->get_remaining() > 0) {
|
if (paging_state->get_remaining() > 0) {
|
||||||
set_paging_state(std::move(paging_state));
|
set_paging_state(std::move(paging_state));
|
||||||
|
} else {
|
||||||
|
_flags.remove<flag::HAS_MORE_PAGES>();
|
||||||
|
_paging_state = nullptr;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -142,7 +142,7 @@ shared_ptr<selector::factory>
|
|||||||
selectable::with_field_selection::new_selector_factory(database& db, schema_ptr s, std::vector<const column_definition*>& defs) {
|
selectable::with_field_selection::new_selector_factory(database& db, schema_ptr s, std::vector<const column_definition*>& defs) {
|
||||||
auto&& factory = _selected->new_selector_factory(db, s, defs);
|
auto&& factory = _selected->new_selector_factory(db, s, defs);
|
||||||
auto&& type = factory->new_instance()->get_type();
|
auto&& type = factory->new_instance()->get_type();
|
||||||
auto&& ut = dynamic_pointer_cast<const user_type_impl>(std::move(type));
|
auto&& ut = dynamic_pointer_cast<const user_type_impl>(type->underlying_type());
|
||||||
if (!ut) {
|
if (!ut) {
|
||||||
throw exceptions::invalid_request_exception(
|
throw exceptions::invalid_request_exception(
|
||||||
sprint("Invalid field selection: %s of type %s is not a user type",
|
sprint("Invalid field selection: %s of type %s is not a user type",
|
||||||
|
|||||||
@@ -156,9 +156,9 @@ public:
|
|||||||
return _factories->uses_function(ks_name, function_name);
|
return _factories->uses_function(ks_name, function_name);
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual uint32_t add_column_for_ordering(const column_definition& c) override {
|
virtual uint32_t add_column_for_post_processing(const column_definition& c) override {
|
||||||
uint32_t index = selection::add_column_for_ordering(c);
|
uint32_t index = selection::add_column_for_post_processing(c);
|
||||||
_factories->add_selector_for_ordering(c, index);
|
_factories->add_selector_for_post_processing(c, index);
|
||||||
return index;
|
return index;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -227,7 +227,7 @@ protected:
|
|||||||
return simple_selection::make(schema, std::move(columns), false);
|
return simple_selection::make(schema, std::move(columns), false);
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t selection::add_column_for_ordering(const column_definition& c) {
|
uint32_t selection::add_column_for_post_processing(const column_definition& c) {
|
||||||
_columns.push_back(&c);
|
_columns.push_back(&c);
|
||||||
_metadata->add_non_serialized_column(c.column_specification);
|
_metadata->add_non_serialized_column(c.column_specification);
|
||||||
return _columns.size() - 1;
|
return _columns.size() - 1;
|
||||||
@@ -339,14 +339,14 @@ std::unique_ptr<result_set> result_set_builder::build() {
|
|||||||
return std::move(_result_set);
|
return std::move(_result_set);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool result_set_builder::restrictions_filter::operator()(const selection& selection,
|
bool result_set_builder::restrictions_filter::do_filter(const selection& selection,
|
||||||
const std::vector<bytes>& partition_key,
|
const std::vector<bytes>& partition_key,
|
||||||
const std::vector<bytes>& clustering_key,
|
const std::vector<bytes>& clustering_key,
|
||||||
const query::result_row_view& static_row,
|
const query::result_row_view& static_row,
|
||||||
const query::result_row_view& row) const {
|
const query::result_row_view& row) const {
|
||||||
static logging::logger rlogger("restrictions_filter");
|
static logging::logger rlogger("restrictions_filter");
|
||||||
|
|
||||||
if (_current_partition_key_does_not_match || _current_static_row_does_not_match) {
|
if (_current_partition_key_does_not_match || _current_static_row_does_not_match || _remaining == 0) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -427,6 +427,20 @@ bool result_set_builder::restrictions_filter::operator()(const selection& select
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool result_set_builder::restrictions_filter::operator()(const selection& selection,
|
||||||
|
const std::vector<bytes>& partition_key,
|
||||||
|
const std::vector<bytes>& clustering_key,
|
||||||
|
const query::result_row_view& static_row,
|
||||||
|
const query::result_row_view& row) const {
|
||||||
|
const bool accepted = do_filter(selection, partition_key, clustering_key, static_row, row);
|
||||||
|
if (!accepted) {
|
||||||
|
++_rows_dropped;
|
||||||
|
} else if (_remaining > 0) {
|
||||||
|
--_remaining;
|
||||||
|
}
|
||||||
|
return accepted;
|
||||||
|
}
|
||||||
|
|
||||||
api::timestamp_type result_set_builder::timestamp_of(size_t idx) {
|
api::timestamp_type result_set_builder::timestamp_of(size_t idx) {
|
||||||
return _timestamps[idx];
|
return _timestamps[idx];
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -176,7 +176,7 @@ public:
|
|||||||
static ::shared_ptr<selection> wildcard(schema_ptr schema);
|
static ::shared_ptr<selection> wildcard(schema_ptr schema);
|
||||||
static ::shared_ptr<selection> for_columns(schema_ptr schema, std::vector<const column_definition*> columns);
|
static ::shared_ptr<selection> for_columns(schema_ptr schema, std::vector<const column_definition*> columns);
|
||||||
|
|
||||||
virtual uint32_t add_column_for_ordering(const column_definition& c);
|
virtual uint32_t add_column_for_post_processing(const column_definition& c);
|
||||||
|
|
||||||
virtual bool uses_function(const sstring &ks_name, const sstring& function_name) const {
|
virtual bool uses_function(const sstring &ks_name, const sstring& function_name) const {
|
||||||
return false;
|
return false;
|
||||||
@@ -259,20 +259,31 @@ public:
|
|||||||
}
|
}
|
||||||
void reset() {
|
void reset() {
|
||||||
}
|
}
|
||||||
|
uint32_t get_rows_dropped() const {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
class restrictions_filter {
|
class restrictions_filter {
|
||||||
::shared_ptr<restrictions::statement_restrictions> _restrictions;
|
::shared_ptr<restrictions::statement_restrictions> _restrictions;
|
||||||
const query_options& _options;
|
const query_options& _options;
|
||||||
mutable bool _current_partition_key_does_not_match = false;
|
mutable bool _current_partition_key_does_not_match = false;
|
||||||
mutable bool _current_static_row_does_not_match = false;
|
mutable bool _current_static_row_does_not_match = false;
|
||||||
|
mutable uint32_t _rows_dropped = 0;
|
||||||
|
mutable uint32_t _remaining = 0;
|
||||||
public:
|
public:
|
||||||
restrictions_filter() = default;
|
restrictions_filter() = default;
|
||||||
explicit restrictions_filter(::shared_ptr<restrictions::statement_restrictions> restrictions, const query_options& options) : _restrictions(restrictions), _options(options) {}
|
explicit restrictions_filter(::shared_ptr<restrictions::statement_restrictions> restrictions, const query_options& options, uint32_t remaining) : _restrictions(restrictions), _options(options), _remaining(remaining) {}
|
||||||
bool operator()(const selection& selection, const std::vector<bytes>& pk, const std::vector<bytes>& ck, const query::result_row_view& static_row, const query::result_row_view& row) const;
|
bool operator()(const selection& selection, const std::vector<bytes>& pk, const std::vector<bytes>& ck, const query::result_row_view& static_row, const query::result_row_view& row) const;
|
||||||
void reset() {
|
void reset() {
|
||||||
_current_partition_key_does_not_match = false;
|
_current_partition_key_does_not_match = false;
|
||||||
_current_static_row_does_not_match = false;
|
_current_static_row_does_not_match = false;
|
||||||
|
_rows_dropped = 0;
|
||||||
}
|
}
|
||||||
|
uint32_t get_rows_dropped() const {
|
||||||
|
return _rows_dropped;
|
||||||
|
}
|
||||||
|
private:
|
||||||
|
bool do_filter(const selection& selection, const std::vector<bytes>& pk, const std::vector<bytes>& ck, const query::result_row_view& static_row, const query::result_row_view& row) const;
|
||||||
};
|
};
|
||||||
|
|
||||||
result_set_builder(const selection& s, gc_clock::time_point now, cql_serialization_format sf);
|
result_set_builder(const selection& s, gc_clock::time_point now, cql_serialization_format sf);
|
||||||
@@ -372,7 +383,7 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void accept_partition_end(const query::result_row_view& static_row) {
|
uint32_t accept_partition_end(const query::result_row_view& static_row) {
|
||||||
if (_row_count == 0) {
|
if (_row_count == 0) {
|
||||||
_builder.new_row();
|
_builder.new_row();
|
||||||
auto static_row_iterator = static_row.iterator();
|
auto static_row_iterator = static_row.iterator();
|
||||||
@@ -386,6 +397,7 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
return _filter.get_rows_dropped();
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -53,6 +53,7 @@ selector_factories::selector_factories(std::vector<::shared_ptr<selectable>> sel
|
|||||||
: _contains_write_time_factory(false)
|
: _contains_write_time_factory(false)
|
||||||
, _contains_ttl_factory(false)
|
, _contains_ttl_factory(false)
|
||||||
, _number_of_aggregate_factories(0)
|
, _number_of_aggregate_factories(0)
|
||||||
|
, _number_of_factories_for_post_processing(0)
|
||||||
{
|
{
|
||||||
_factories.reserve(selectables.size());
|
_factories.reserve(selectables.size());
|
||||||
|
|
||||||
@@ -76,8 +77,9 @@ bool selector_factories::uses_function(const sstring& ks_name, const sstring& fu
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
void selector_factories::add_selector_for_ordering(const column_definition& def, uint32_t index) {
|
void selector_factories::add_selector_for_post_processing(const column_definition& def, uint32_t index) {
|
||||||
_factories.emplace_back(simple_selector::new_factory(def.name_as_text(), index, def.type));
|
_factories.emplace_back(simple_selector::new_factory(def.name_as_text(), index, def.type));
|
||||||
|
++_number_of_factories_for_post_processing;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<::shared_ptr<selector>> selector_factories::new_instances() const {
|
std::vector<::shared_ptr<selector>> selector_factories::new_instances() const {
|
||||||
|
|||||||
@@ -74,6 +74,11 @@ private:
|
|||||||
*/
|
*/
|
||||||
uint32_t _number_of_aggregate_factories;
|
uint32_t _number_of_aggregate_factories;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The number of factories that are only for post processing.
|
||||||
|
*/
|
||||||
|
uint32_t _number_of_factories_for_post_processing;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
/**
|
/**
|
||||||
* Creates a new <code>SelectorFactories</code> instance and collect the column definitions.
|
* Creates a new <code>SelectorFactories</code> instance and collect the column definitions.
|
||||||
@@ -97,11 +102,12 @@ public:
|
|||||||
bool uses_function(const sstring& ks_name, const sstring& function_name) const;
|
bool uses_function(const sstring& ks_name, const sstring& function_name) const;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Adds a new <code>Selector.Factory</code> for a column that is needed only for ORDER BY purposes.
|
* Adds a new <code>Selector.Factory</code> for a column that is needed only for ORDER BY or post
|
||||||
|
* processing purposes.
|
||||||
* @param def the column that is needed for ordering
|
* @param def the column that is needed for ordering
|
||||||
* @param index the index of the column definition in the Selection's list of columns
|
* @param index the index of the column definition in the Selection's list of columns
|
||||||
*/
|
*/
|
||||||
void add_selector_for_ordering(const column_definition& def, uint32_t index);
|
void add_selector_for_post_processing(const column_definition& def, uint32_t index);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Checks if this <code>SelectorFactories</code> contains only factories for aggregates.
|
* Checks if this <code>SelectorFactories</code> contains only factories for aggregates.
|
||||||
@@ -111,7 +117,7 @@ public:
|
|||||||
*/
|
*/
|
||||||
bool contains_only_aggregate_functions() const {
|
bool contains_only_aggregate_functions() const {
|
||||||
auto size = _factories.size();
|
auto size = _factories.size();
|
||||||
return size != 0 && _number_of_aggregate_factories == size;
|
return size != 0 && _number_of_aggregate_factories == (size - _number_of_factories_for_post_processing);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -276,7 +276,7 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a
|
|||||||
|
|
||||||
auto type = validate_alter(schema, *def, *validator);
|
auto type = validate_alter(schema, *def, *validator);
|
||||||
// In any case, we update the column definition
|
// In any case, we update the column definition
|
||||||
cfm.with_altered_column_type(column_name->name(), type);
|
cfm.alter_column_type(column_name->name(), type);
|
||||||
|
|
||||||
// We also have to validate the view types here. If we have a view which includes a column as part of
|
// We also have to validate the view types here. If we have a view which includes a column as part of
|
||||||
// the clustering key, we need to make sure that it is indeed compatible.
|
// the clustering key, we need to make sure that it is indeed compatible.
|
||||||
@@ -285,7 +285,7 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a
|
|||||||
if (view_def) {
|
if (view_def) {
|
||||||
schema_builder builder(view);
|
schema_builder builder(view);
|
||||||
auto view_type = validate_alter(view, *view_def, *validator);
|
auto view_type = validate_alter(view, *view_def, *validator);
|
||||||
builder.with_altered_column_type(column_name->name(), std::move(view_type));
|
builder.alter_column_type(column_name->name(), std::move(view_type));
|
||||||
view_updates.push_back(view_ptr(builder.build()));
|
view_updates.push_back(view_ptr(builder.build()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -306,7 +306,7 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a
|
|||||||
} else {
|
} else {
|
||||||
for (auto&& column_def : boost::range::join(schema->static_columns(), schema->regular_columns())) { // find
|
for (auto&& column_def : boost::range::join(schema->static_columns(), schema->regular_columns())) { // find
|
||||||
if (column_def.name() == column_name->name()) {
|
if (column_def.name() == column_name->name()) {
|
||||||
cfm.without_column(column_name->name());
|
cfm.remove_column(column_name->name());
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -349,7 +349,7 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a
|
|||||||
auto to = entry.second->prepare_column_identifier(schema);
|
auto to = entry.second->prepare_column_identifier(schema);
|
||||||
|
|
||||||
validate_column_rename(db, *schema, *from, *to);
|
validate_column_rename(db, *schema, *from, *to);
|
||||||
cfm.with_column_rename(from->name(), to->name());
|
cfm.rename_column(from->name(), to->name());
|
||||||
|
|
||||||
// If the view includes a renamed column, it must be renamed in
|
// If the view includes a renamed column, it must be renamed in
|
||||||
// the view table and the definition.
|
// the view table and the definition.
|
||||||
@@ -360,7 +360,7 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a
|
|||||||
auto view_from = entry.first->prepare_column_identifier(view);
|
auto view_from = entry.first->prepare_column_identifier(view);
|
||||||
auto view_to = entry.second->prepare_column_identifier(view);
|
auto view_to = entry.second->prepare_column_identifier(view);
|
||||||
validate_column_rename(db, *view, *view_from, *view_to);
|
validate_column_rename(db, *view, *view_from, *view_to);
|
||||||
builder.with_column_rename(view_from->name(), view_to->name());
|
builder.rename_column(view_from->name(), view_to->name());
|
||||||
|
|
||||||
auto new_where = util::rename_column_in_where_clause(
|
auto new_where = util::rename_column_in_where_clause(
|
||||||
view->view_info()->where_clause(),
|
view->view_info()->where_clause(),
|
||||||
|
|||||||
@@ -110,7 +110,7 @@ void alter_type_statement::do_announce_migration(database& db, ::keyspace& ks, b
|
|||||||
if (t_opt) {
|
if (t_opt) {
|
||||||
modified = true;
|
modified = true;
|
||||||
// We need to update this column
|
// We need to update this column
|
||||||
cfm.with_altered_column_type(column.name(), *t_opt);
|
cfm.alter_column_type(column.name(), *t_opt);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (modified) {
|
if (modified) {
|
||||||
@@ -165,7 +165,7 @@ alter_type_statement::add_or_alter::add_or_alter(const ut_name& name, bool is_ad
|
|||||||
user_type alter_type_statement::add_or_alter::do_add(database& db, user_type to_update) const
|
user_type alter_type_statement::add_or_alter::do_add(database& db, user_type to_update) const
|
||||||
{
|
{
|
||||||
if (get_idx_of_field(to_update, _field_name)) {
|
if (get_idx_of_field(to_update, _field_name)) {
|
||||||
throw exceptions::invalid_request_exception(sprint("Cannot add new field %s to type %s: a field of the same name already exists", _field_name->name(), _name.to_string()));
|
throw exceptions::invalid_request_exception(sprint("Cannot add new field %s to type %s: a field of the same name already exists", _field_name->to_string(), _name.to_string()));
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<bytes> new_names(to_update->field_names());
|
std::vector<bytes> new_names(to_update->field_names());
|
||||||
@@ -173,7 +173,7 @@ user_type alter_type_statement::add_or_alter::do_add(database& db, user_type to_
|
|||||||
std::vector<data_type> new_types(to_update->field_types());
|
std::vector<data_type> new_types(to_update->field_types());
|
||||||
auto&& add_type = _field_type->prepare(db, keyspace())->get_type();
|
auto&& add_type = _field_type->prepare(db, keyspace())->get_type();
|
||||||
if (add_type->references_user_type(to_update->_keyspace, to_update->_name)) {
|
if (add_type->references_user_type(to_update->_keyspace, to_update->_name)) {
|
||||||
throw exceptions::invalid_request_exception(sprint("Cannot add new field %s of type %s to type %s as this would create a circular reference", _field_name->name(), _field_type->to_string(), _name.to_string()));
|
throw exceptions::invalid_request_exception(sprint("Cannot add new field %s of type %s to type %s as this would create a circular reference", _field_name->to_string(), _field_type->to_string(), _name.to_string()));
|
||||||
}
|
}
|
||||||
new_types.push_back(std::move(add_type));
|
new_types.push_back(std::move(add_type));
|
||||||
return user_type_impl::get_instance(to_update->_keyspace, to_update->_name, std::move(new_names), std::move(new_types));
|
return user_type_impl::get_instance(to_update->_keyspace, to_update->_name, std::move(new_names), std::move(new_types));
|
||||||
@@ -183,13 +183,13 @@ user_type alter_type_statement::add_or_alter::do_alter(database& db, user_type t
|
|||||||
{
|
{
|
||||||
stdx::optional<uint32_t> idx = get_idx_of_field(to_update, _field_name);
|
stdx::optional<uint32_t> idx = get_idx_of_field(to_update, _field_name);
|
||||||
if (!idx) {
|
if (!idx) {
|
||||||
throw exceptions::invalid_request_exception(sprint("Unknown field %s in type %s", _field_name->name(), _name.to_string()));
|
throw exceptions::invalid_request_exception(sprint("Unknown field %s in type %s", _field_name->to_string(), _name.to_string()));
|
||||||
}
|
}
|
||||||
|
|
||||||
auto previous = to_update->field_types()[*idx];
|
auto previous = to_update->field_types()[*idx];
|
||||||
auto new_type = _field_type->prepare(db, keyspace())->get_type();
|
auto new_type = _field_type->prepare(db, keyspace())->get_type();
|
||||||
if (!new_type->is_compatible_with(*previous)) {
|
if (!new_type->is_compatible_with(*previous)) {
|
||||||
throw exceptions::invalid_request_exception(sprint("Type %s in incompatible with previous type %s of field %s in user type %s", _field_type->to_string(), previous->as_cql3_type()->to_string(), _field_name->name(), _name.to_string()));
|
throw exceptions::invalid_request_exception(sprint("Type %s in incompatible with previous type %s of field %s in user type %s", _field_type->to_string(), previous->as_cql3_type()->to_string(), _field_name->to_string(), _name.to_string()));
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<data_type> new_types(to_update->field_types());
|
std::vector<data_type> new_types(to_update->field_types());
|
||||||
|
|||||||
@@ -88,6 +88,11 @@ create_index_statement::validate(service::storage_proxy& proxy, const service::c
|
|||||||
throw exceptions::invalid_request_exception("Secondary indexes are not supported on materialized views");
|
throw exceptions::invalid_request_exception("Secondary indexes are not supported on materialized views");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (schema->is_dense()) {
|
||||||
|
throw exceptions::invalid_request_exception(
|
||||||
|
"Secondary indexes are not supported on COMPACT STORAGE tables that have clustering columns");
|
||||||
|
}
|
||||||
|
|
||||||
std::vector<::shared_ptr<index_target>> targets;
|
std::vector<::shared_ptr<index_target>> targets;
|
||||||
for (auto& raw_target : _raw_targets) {
|
for (auto& raw_target : _raw_targets) {
|
||||||
targets.emplace_back(raw_target->prepare(schema));
|
targets.emplace_back(raw_target->prepare(schema));
|
||||||
@@ -109,6 +114,11 @@ create_index_statement::validate(service::storage_proxy& proxy, const service::c
|
|||||||
sprint("No column definition found for column %s", *target->column));
|
sprint("No column definition found for column %s", *target->column));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//NOTICE(sarna): Should be lifted after resolving issue #2963
|
||||||
|
if (cd->is_static()) {
|
||||||
|
throw exceptions::invalid_request_exception("Indexing static columns is not implemented yet.");
|
||||||
|
}
|
||||||
|
|
||||||
if (cd->type->references_duration()) {
|
if (cd->type->references_duration()) {
|
||||||
using request_validations::check_false;
|
using request_validations::check_false;
|
||||||
const auto& ty = *cd->type;
|
const auto& ty = *cd->type;
|
||||||
@@ -122,8 +132,7 @@ create_index_statement::validate(service::storage_proxy& proxy, const service::c
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Origin TODO: we could lift that limitation
|
// Origin TODO: we could lift that limitation
|
||||||
if ((schema->is_dense() || !schema->thrift().has_compound_comparator()) &&
|
if ((schema->is_dense() || !schema->thrift().has_compound_comparator()) && cd->is_primary_key()) {
|
||||||
cd->kind != column_kind::regular_column) {
|
|
||||||
throw exceptions::invalid_request_exception(
|
throw exceptions::invalid_request_exception(
|
||||||
"Secondary indexes are not supported on PRIMARY KEY columns in COMPACT STORAGE tables");
|
"Secondary indexes are not supported on PRIMARY KEY columns in COMPACT STORAGE tables");
|
||||||
}
|
}
|
||||||
@@ -137,10 +146,15 @@ create_index_statement::validate(service::storage_proxy& proxy, const service::c
|
|||||||
|
|
||||||
bool is_map = dynamic_cast<const collection_type_impl *>(cd->type.get()) != nullptr
|
bool is_map = dynamic_cast<const collection_type_impl *>(cd->type.get()) != nullptr
|
||||||
&& dynamic_cast<const collection_type_impl *>(cd->type.get())->is_map();
|
&& dynamic_cast<const collection_type_impl *>(cd->type.get())->is_map();
|
||||||
bool is_frozen_collection = cd->type->is_collection() && !cd->type->is_multi_cell();
|
bool is_collection = cd->type->is_collection();
|
||||||
|
bool is_frozen_collection = is_collection && !cd->type->is_multi_cell();
|
||||||
|
|
||||||
if (is_frozen_collection) {
|
if (is_frozen_collection) {
|
||||||
validate_for_frozen_collection(target);
|
validate_for_frozen_collection(target);
|
||||||
|
} else if (is_collection) {
|
||||||
|
// NOTICE(sarna): should be lifted after #2962 (indexes on non-frozen collections) is implemented
|
||||||
|
throw exceptions::invalid_request_exception(
|
||||||
|
sprint("Cannot create secondary index on non-frozen collection column %s", cd->name_as_text()));
|
||||||
} else {
|
} else {
|
||||||
validate_not_full_index(target);
|
validate_not_full_index(target);
|
||||||
validate_is_values_index_if_target_column_not_collection(cd, target);
|
validate_is_values_index_if_target_column_not_collection(cd, target);
|
||||||
|
|||||||
@@ -84,7 +84,6 @@ create_view_statement::create_view_statement(
|
|||||||
, _clustering_keys{clustering_keys}
|
, _clustering_keys{clustering_keys}
|
||||||
, _if_not_exists{if_not_exists}
|
, _if_not_exists{if_not_exists}
|
||||||
{
|
{
|
||||||
service::get_local_storage_proxy().get_db().local().get_config().check_experimental("Creating materialized views");
|
|
||||||
if (!service::get_local_storage_service().cluster_supports_materialized_views()) {
|
if (!service::get_local_storage_service().cluster_supports_materialized_views()) {
|
||||||
throw exceptions::invalid_request_exception("Can't create materialized views until the whole cluster has been upgraded");
|
throw exceptions::invalid_request_exception("Can't create materialized views until the whole cluster has been upgraded");
|
||||||
}
|
}
|
||||||
@@ -315,6 +314,27 @@ future<shared_ptr<cql_transport::event::schema_change>> create_view_statement::a
|
|||||||
throw exceptions::invalid_request_exception(sprint("No columns are defined for Materialized View other than primary key"));
|
throw exceptions::invalid_request_exception(sprint("No columns are defined for Materialized View other than primary key"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// The unique feature of a filter by a non-key column is that the
|
||||||
|
// value of such column can be updated - and also be expired with TTL
|
||||||
|
// and cause the view row to appear and disappear. We don't currently
|
||||||
|
// support this case - see issue #3430, and neither does
|
||||||
|
// Cassandra - see CASSANDRA-13798 and CASSANDRA-13832.
|
||||||
|
// Actually, as CASSANDRA-13798 explains, the problem is "the liveness of
|
||||||
|
// view row is now depending on multiple base columns (multiple filtered
|
||||||
|
// non-pk base column + base column used in view pk)". When the filtered
|
||||||
|
// column *is* the base column added to the view pk, we don't have this
|
||||||
|
// problem. And this case actually works correctly.
|
||||||
|
auto non_pk_restrictions = restrictions->get_non_pk_restriction();
|
||||||
|
if (non_pk_restrictions.size() == 1 && has_non_pk_column &&
|
||||||
|
std::find(target_primary_keys.begin(), target_primary_keys.end(), non_pk_restrictions.cbegin()->first) != target_primary_keys.end()) {
|
||||||
|
// This case (filter by new PK column of the view) works, as explained above
|
||||||
|
} else if (!non_pk_restrictions.empty()) {
|
||||||
|
auto column_names = ::join(", ", non_pk_restrictions | boost::adaptors::map_keys | boost::adaptors::transformed(std::mem_fn(&column_definition::name_as_text)));
|
||||||
|
throw exceptions::invalid_request_exception(sprint(
|
||||||
|
"Non-primary key columns cannot be restricted in the SELECT statement used for materialized view %s creation (got restrictions on: %s)",
|
||||||
|
column_family(), column_names));
|
||||||
|
}
|
||||||
|
|
||||||
schema_builder builder{keyspace(), column_family()};
|
schema_builder builder{keyspace(), column_family()};
|
||||||
auto add_columns = [this, &builder] (std::vector<const column_definition*>& defs, column_kind kind) mutable {
|
auto add_columns = [this, &builder] (std::vector<const column_definition*>& defs, column_kind kind) mutable {
|
||||||
for (auto* def : defs) {
|
for (auto* def : defs) {
|
||||||
|
|||||||
@@ -49,7 +49,7 @@ void cql3::statements::index_prop_defs::validate() {
|
|||||||
property_definitions::validate(keywords);
|
property_definitions::validate(keywords);
|
||||||
|
|
||||||
if (is_custom && !custom_class) {
|
if (is_custom && !custom_class) {
|
||||||
throw exceptions::invalid_request_exception("CUSTOM index requires specifiying the index class");
|
throw exceptions::invalid_request_exception("CUSTOM index requires specifying the index class");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!is_custom && custom_class) {
|
if (!is_custom && custom_class) {
|
||||||
@@ -64,6 +64,16 @@ void cql3::statements::index_prop_defs::validate() {
|
|||||||
sprint("Cannot specify %s as a CUSTOM option",
|
sprint("Cannot specify %s as a CUSTOM option",
|
||||||
db::index::secondary_index::custom_index_option_name));
|
db::index::secondary_index::custom_index_option_name));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Currently, Scylla does not support *any* class of custom index
|
||||||
|
// implementation. If in the future we do (e.g., SASI, or something
|
||||||
|
// new), we'll need to check for valid values here.
|
||||||
|
if (is_custom && custom_class) {
|
||||||
|
throw exceptions::invalid_request_exception(
|
||||||
|
format("Unsupported CUSTOM INDEX class {}. Note that currently, Scylla does not support SASI or any other CUSTOM INDEX class.",
|
||||||
|
*custom_class));
|
||||||
|
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
index_options_map
|
index_options_map
|
||||||
|
|||||||
@@ -87,6 +87,7 @@ private:
|
|||||||
::shared_ptr<attributes::raw> _attrs;
|
::shared_ptr<attributes::raw> _attrs;
|
||||||
::shared_ptr<term::raw> _json_value;
|
::shared_ptr<term::raw> _json_value;
|
||||||
bool _if_not_exists;
|
bool _if_not_exists;
|
||||||
|
bool _default_unset;
|
||||||
public:
|
public:
|
||||||
/**
|
/**
|
||||||
* A parsed <code>INSERT JSON</code> statement.
|
* A parsed <code>INSERT JSON</code> statement.
|
||||||
@@ -95,7 +96,7 @@ public:
|
|||||||
* @param json_value JSON string representing names and values
|
* @param json_value JSON string representing names and values
|
||||||
* @param attrs additional attributes for statement (CL, timestamp, timeToLive)
|
* @param attrs additional attributes for statement (CL, timestamp, timeToLive)
|
||||||
*/
|
*/
|
||||||
insert_json_statement(::shared_ptr<cf_name> name, ::shared_ptr<attributes::raw> attrs, ::shared_ptr<term::raw> json_value, bool if_not_exists);
|
insert_json_statement(::shared_ptr<cf_name> name, ::shared_ptr<attributes::raw> attrs, ::shared_ptr<term::raw> json_value, bool if_not_exists, bool default_unset);
|
||||||
|
|
||||||
virtual ::shared_ptr<cql3::statements::modification_statement> prepare_internal(database& db, schema_ptr schema,
|
virtual ::shared_ptr<cql3::statements::modification_statement> prepare_internal(database& db, schema_ptr schema,
|
||||||
::shared_ptr<variable_specifications> bound_names, std::unique_ptr<attributes> attrs, cql_stats& stats) override;
|
::shared_ptr<variable_specifications> bound_names, std::unique_ptr<attributes> attrs, cql_stats& stats) override;
|
||||||
|
|||||||
@@ -141,6 +141,10 @@ private:
|
|||||||
/** If ALLOW FILTERING was not specified, this verifies that it is not needed */
|
/** If ALLOW FILTERING was not specified, this verifies that it is not needed */
|
||||||
void check_needs_filtering(::shared_ptr<restrictions::statement_restrictions> restrictions);
|
void check_needs_filtering(::shared_ptr<restrictions::statement_restrictions> restrictions);
|
||||||
|
|
||||||
|
void ensure_filtering_columns_retrieval(database& db,
|
||||||
|
::shared_ptr<selection::selection> selection,
|
||||||
|
::shared_ptr<restrictions::statement_restrictions> restrictions);
|
||||||
|
|
||||||
bool contains_alias(::shared_ptr<column_identifier> name);
|
bool contains_alias(::shared_ptr<column_identifier> name);
|
||||||
|
|
||||||
::shared_ptr<column_specification> limit_receiver();
|
::shared_ptr<column_specification> limit_receiver();
|
||||||
|
|||||||
@@ -383,8 +383,9 @@ select_statement::do_execute(service::storage_proxy& proxy,
|
|||||||
int32_t limit = get_limit(options);
|
int32_t limit = get_limit(options);
|
||||||
auto now = gc_clock::now();
|
auto now = gc_clock::now();
|
||||||
|
|
||||||
|
const bool restrictions_need_filtering = _restrictions->need_filtering();
|
||||||
++_stats.reads;
|
++_stats.reads;
|
||||||
_stats.filtered_reads += _restrictions->need_filtering();
|
_stats.filtered_reads += restrictions_need_filtering;
|
||||||
|
|
||||||
auto command = ::make_lw_shared<query::read_command>(_schema->id(), _schema->version(),
|
auto command = ::make_lw_shared<query::read_command>(_schema->id(), _schema->version(),
|
||||||
make_partition_slice(options), limit, now, tracing::make_trace_info(state.get_trace_state()), query::max_partitions, utils::UUID(), options.get_timestamp(state));
|
make_partition_slice(options), limit, now, tracing::make_trace_info(state.get_trace_state()), query::max_partitions, utils::UUID(), options.get_timestamp(state));
|
||||||
@@ -396,37 +397,41 @@ select_statement::do_execute(service::storage_proxy& proxy,
|
|||||||
// An aggregation query will never be paged for the user, but we always page it internally to avoid OOM.
|
// An aggregation query will never be paged for the user, but we always page it internally to avoid OOM.
|
||||||
// If the user provided a page_size we'll use that to page internally (because why not), otherwise we use our default
|
// If the user provided a page_size we'll use that to page internally (because why not), otherwise we use our default
|
||||||
// Note that if there are some nodes in the cluster with a version less than 2.0, we can't use paging (CASSANDRA-6707).
|
// Note that if there are some nodes in the cluster with a version less than 2.0, we can't use paging (CASSANDRA-6707).
|
||||||
auto aggregate = _selection->is_aggregate();
|
const bool aggregate = _selection->is_aggregate();
|
||||||
if (aggregate && page_size <= 0) {
|
const bool nonpaged_filtering = restrictions_need_filtering && page_size <= 0;
|
||||||
|
if (aggregate || nonpaged_filtering) {
|
||||||
page_size = DEFAULT_COUNT_PAGE_SIZE;
|
page_size = DEFAULT_COUNT_PAGE_SIZE;
|
||||||
}
|
}
|
||||||
|
|
||||||
auto key_ranges = _restrictions->get_partition_key_ranges(options);
|
auto key_ranges = _restrictions->get_partition_key_ranges(options);
|
||||||
|
|
||||||
if (!aggregate && (page_size <= 0
|
if (!aggregate && !restrictions_need_filtering && (page_size <= 0
|
||||||
|| !service::pager::query_pagers::may_need_paging(*_schema, page_size,
|
|| !service::pager::query_pagers::may_need_paging(*_schema, page_size,
|
||||||
*command, key_ranges))) {
|
*command, key_ranges))) {
|
||||||
return execute(proxy, command, std::move(key_ranges), state, options, now);
|
return execute(proxy, command, std::move(key_ranges), state, options, now);
|
||||||
}
|
}
|
||||||
|
|
||||||
command->slice.options.set<query::partition_slice::option::allow_short_read>();
|
command->slice.options.set<query::partition_slice::option::allow_short_read>();
|
||||||
auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
|
auto timeout_duration = options.get_timeout_config().*get_timeout_config_selector();
|
||||||
auto p = service::pager::query_pagers::pager(_schema, _selection,
|
auto p = service::pager::query_pagers::pager(_schema, _selection,
|
||||||
state, options, command, std::move(key_ranges), _stats, _restrictions->need_filtering() ? _restrictions : nullptr);
|
state, options, command, std::move(key_ranges), _stats, restrictions_need_filtering ? _restrictions : nullptr);
|
||||||
|
|
||||||
if (aggregate) {
|
if (aggregate || nonpaged_filtering) {
|
||||||
return do_with(
|
return do_with(
|
||||||
cql3::selection::result_set_builder(*_selection, now,
|
cql3::selection::result_set_builder(*_selection, now,
|
||||||
options.get_cql_serialization_format()),
|
options.get_cql_serialization_format()),
|
||||||
[this, p, page_size, now, timeout](auto& builder) {
|
[this, p, page_size, now, timeout_duration, restrictions_need_filtering](auto& builder) {
|
||||||
return do_until([p] {return p->is_exhausted();},
|
return do_until([p] {return p->is_exhausted();},
|
||||||
[p, &builder, page_size, now, timeout] {
|
[p, &builder, page_size, now, timeout_duration] {
|
||||||
|
auto timeout = db::timeout_clock::now() + timeout_duration;
|
||||||
return p->fetch_page(builder, page_size, now, timeout);
|
return p->fetch_page(builder, page_size, now, timeout);
|
||||||
}
|
}
|
||||||
).then([this, &builder] {
|
).then([this, &builder, restrictions_need_filtering] {
|
||||||
auto rs = builder.build();
|
auto rs = builder.build();
|
||||||
|
if (restrictions_need_filtering) {
|
||||||
|
_stats.filtered_rows_matched_total += rs->size();
|
||||||
|
}
|
||||||
update_stats_rows_read(rs->size());
|
update_stats_rows_read(rs->size());
|
||||||
_stats.filtered_rows_matched_total += _restrictions->need_filtering() ? rs->size() : 0;
|
|
||||||
auto msg = ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
|
auto msg = ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
|
||||||
return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(std::move(msg));
|
return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(std::move(msg));
|
||||||
});
|
});
|
||||||
@@ -439,7 +444,8 @@ select_statement::do_execute(service::storage_proxy& proxy,
|
|||||||
" you must either remove the ORDER BY or the IN and sort client side, or disable paging for this query");
|
" you must either remove the ORDER BY or the IN and sort client side, or disable paging for this query");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (_selection->is_trivial() && !_restrictions->need_filtering()) {
|
auto timeout = db::timeout_clock::now() + timeout_duration;
|
||||||
|
if (_selection->is_trivial() && !restrictions_need_filtering) {
|
||||||
return p->fetch_page_generator(page_size, now, timeout, _stats).then([this, p, limit] (result_generator generator) {
|
return p->fetch_page_generator(page_size, now, timeout, _stats).then([this, p, limit] (result_generator generator) {
|
||||||
auto meta = [&] () -> shared_ptr<const cql3::metadata> {
|
auto meta = [&] () -> shared_ptr<const cql3::metadata> {
|
||||||
if (!p->is_exhausted()) {
|
if (!p->is_exhausted()) {
|
||||||
@@ -458,14 +464,16 @@ select_statement::do_execute(service::storage_proxy& proxy,
|
|||||||
}
|
}
|
||||||
|
|
||||||
return p->fetch_page(page_size, now, timeout).then(
|
return p->fetch_page(page_size, now, timeout).then(
|
||||||
[this, p, &options, limit, now](std::unique_ptr<cql3::result_set> rs) {
|
[this, p, &options, now, restrictions_need_filtering](std::unique_ptr<cql3::result_set> rs) {
|
||||||
|
|
||||||
if (!p->is_exhausted()) {
|
if (!p->is_exhausted()) {
|
||||||
rs->get_metadata().set_paging_state(p->state());
|
rs->get_metadata().set_paging_state(p->state());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (restrictions_need_filtering) {
|
||||||
|
_stats.filtered_rows_matched_total += rs->size();
|
||||||
|
}
|
||||||
update_stats_rows_read(rs->size());
|
update_stats_rows_read(rs->size());
|
||||||
_stats.filtered_rows_matched_total += _restrictions->need_filtering() ? rs->size() : 0;
|
|
||||||
auto msg = ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
|
auto msg = ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
|
||||||
return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(std::move(msg));
|
return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(std::move(msg));
|
||||||
});
|
});
|
||||||
@@ -492,15 +500,9 @@ generate_base_key_from_index_pk(const partition_key& index_pk, const clustering_
|
|||||||
return KeyType::from_range(exploded_base_key);
|
return KeyType::from_range(exploded_base_key);
|
||||||
}
|
}
|
||||||
|
|
||||||
future<shared_ptr<cql_transport::messages::result_message>>
|
lw_shared_ptr<query::read_command>
|
||||||
indexed_table_select_statement::execute_base_query(
|
indexed_table_select_statement::prepare_command_for_base_query(const query_options& options, service::query_state& state, gc_clock::time_point now, bool use_paging) {
|
||||||
service::storage_proxy& proxy,
|
lw_shared_ptr<query::read_command> cmd = ::make_lw_shared<query::read_command>(
|
||||||
dht::partition_range_vector&& partition_ranges,
|
|
||||||
service::query_state& state,
|
|
||||||
const query_options& options,
|
|
||||||
gc_clock::time_point now,
|
|
||||||
::shared_ptr<const service::pager::paging_state> paging_state) {
|
|
||||||
auto cmd = ::make_lw_shared<query::read_command>(
|
|
||||||
_schema->id(),
|
_schema->id(),
|
||||||
_schema->version(),
|
_schema->version(),
|
||||||
make_partition_slice(options),
|
make_partition_slice(options),
|
||||||
@@ -510,9 +512,25 @@ indexed_table_select_statement::execute_base_query(
|
|||||||
query::max_partitions,
|
query::max_partitions,
|
||||||
utils::UUID(),
|
utils::UUID(),
|
||||||
options.get_timestamp(state));
|
options.get_timestamp(state));
|
||||||
if (options.get_page_size() > 0) {
|
if (use_paging) {
|
||||||
cmd->slice.options.set<query::partition_slice::option::allow_short_read>();
|
cmd->slice.options.set<query::partition_slice::option::allow_short_read>();
|
||||||
|
cmd->slice.options.set<query::partition_slice::option::send_partition_key>();
|
||||||
|
if (_schema->clustering_key_size() > 0) {
|
||||||
|
cmd->slice.options.set<query::partition_slice::option::send_clustering_key>();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
return cmd;
|
||||||
|
}
|
||||||
|
|
||||||
|
future<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>
|
||||||
|
indexed_table_select_statement::do_execute_base_query(
|
||||||
|
service::storage_proxy& proxy,
|
||||||
|
dht::partition_range_vector&& partition_ranges,
|
||||||
|
service::query_state& state,
|
||||||
|
const query_options& options,
|
||||||
|
gc_clock::time_point now,
|
||||||
|
::shared_ptr<const service::pager::paging_state> paging_state) {
|
||||||
|
auto cmd = prepare_command_for_base_query(options, state, now, bool(paging_state));
|
||||||
auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
|
auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
|
||||||
dht::partition_range_vector per_vnode_ranges;
|
dht::partition_range_vector per_vnode_ranges;
|
||||||
per_vnode_ranges.reserve(partition_ranges.size());
|
per_vnode_ranges.reserve(partition_ranges.size());
|
||||||
@@ -564,41 +582,34 @@ indexed_table_select_statement::execute_base_query(
|
|||||||
}).then([&merger]() {
|
}).then([&merger]() {
|
||||||
return merger.get();
|
return merger.get();
|
||||||
});
|
});
|
||||||
}).then([this, &proxy, &state, &options, now, cmd, paging_state = std::move(paging_state)] (foreign_ptr<lw_shared_ptr<query::result>> result) mutable {
|
}).then([cmd] (foreign_ptr<lw_shared_ptr<query::result>> result) mutable {
|
||||||
return this->process_base_query_results(std::move(result), cmd, proxy, state, options, now, std::move(paging_state));
|
return make_ready_future<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>(std::move(result), std::move(cmd));
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
// Function for fetching the selected columns from a list of clustering rows.
|
|
||||||
// It is currently used only in our Secondary Index implementation - ordinary
|
|
||||||
// CQL SELECT statements do not have the syntax to request a list of rows.
|
|
||||||
// FIXME: The current implementation is very inefficient - it requests each
|
|
||||||
// row separately (and, incrementally, in parallel). Even multiple rows from a single
|
|
||||||
// partition are requested separately. This last case can be easily improved,
|
|
||||||
// but to implement the general case (multiple rows from multiple partitions)
|
|
||||||
// efficiently, we will need more support from other layers.
|
|
||||||
// Keys are ordered in token order (see #3423)
|
|
||||||
future<shared_ptr<cql_transport::messages::result_message>>
|
future<shared_ptr<cql_transport::messages::result_message>>
|
||||||
indexed_table_select_statement::execute_base_query(
|
indexed_table_select_statement::execute_base_query(
|
||||||
|
service::storage_proxy& proxy,
|
||||||
|
dht::partition_range_vector&& partition_ranges,
|
||||||
|
service::query_state& state,
|
||||||
|
const query_options& options,
|
||||||
|
gc_clock::time_point now,
|
||||||
|
::shared_ptr<const service::pager::paging_state> paging_state) {
|
||||||
|
return do_execute_base_query(proxy, std::move(partition_ranges), state, options, now, paging_state).then(
|
||||||
|
[this, &proxy, &state, &options, now, paging_state = std::move(paging_state)] (foreign_ptr<lw_shared_ptr<query::result>> result, lw_shared_ptr<query::read_command> cmd) {
|
||||||
|
return process_base_query_results(std::move(result), std::move(cmd), proxy, state, options, now, std::move(paging_state));
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
future<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>
|
||||||
|
indexed_table_select_statement::do_execute_base_query(
|
||||||
service::storage_proxy& proxy,
|
service::storage_proxy& proxy,
|
||||||
std::vector<primary_key>&& primary_keys,
|
std::vector<primary_key>&& primary_keys,
|
||||||
service::query_state& state,
|
service::query_state& state,
|
||||||
const query_options& options,
|
const query_options& options,
|
||||||
gc_clock::time_point now,
|
gc_clock::time_point now,
|
||||||
::shared_ptr<const service::pager::paging_state> paging_state) {
|
::shared_ptr<const service::pager::paging_state> paging_state) {
|
||||||
auto cmd = make_lw_shared<query::read_command>(
|
auto cmd = prepare_command_for_base_query(options, state, now, bool(paging_state));
|
||||||
_schema->id(),
|
|
||||||
_schema->version(),
|
|
||||||
make_partition_slice(options),
|
|
||||||
get_limit(options),
|
|
||||||
now,
|
|
||||||
tracing::make_trace_info(state.get_trace_state()),
|
|
||||||
query::max_partitions,
|
|
||||||
utils::UUID(),
|
|
||||||
options.get_timestamp(state));
|
|
||||||
if (options.get_page_size() > 0) {
|
|
||||||
cmd->slice.options.set<query::partition_slice::option::allow_short_read>();
|
|
||||||
}
|
|
||||||
auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
|
auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
|
||||||
|
|
||||||
struct base_query_state {
|
struct base_query_state {
|
||||||
@@ -646,9 +657,23 @@ indexed_table_select_statement::execute_base_query(
|
|||||||
});
|
});
|
||||||
}).then([&merger] () {
|
}).then([&merger] () {
|
||||||
return merger.get();
|
return merger.get();
|
||||||
|
}).then([cmd] (foreign_ptr<lw_shared_ptr<query::result>> result) mutable {
|
||||||
|
return make_ready_future<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>(std::move(result), std::move(cmd));
|
||||||
});
|
});
|
||||||
}).then([this, &proxy, &state, &options, now, cmd, paging_state = std::move(paging_state)] (foreign_ptr<lw_shared_ptr<query::result>> result) mutable {
|
});
|
||||||
return this->process_base_query_results(std::move(result), cmd, proxy, state, options, now, std::move(paging_state));
|
}
|
||||||
|
|
||||||
|
future<shared_ptr<cql_transport::messages::result_message>>
|
||||||
|
indexed_table_select_statement::execute_base_query(
|
||||||
|
service::storage_proxy& proxy,
|
||||||
|
std::vector<primary_key>&& primary_keys,
|
||||||
|
service::query_state& state,
|
||||||
|
const query_options& options,
|
||||||
|
gc_clock::time_point now,
|
||||||
|
::shared_ptr<const service::pager::paging_state> paging_state) {
|
||||||
|
return do_execute_base_query(proxy, std::move(primary_keys), state, options, now, paging_state).then(
|
||||||
|
[this, &proxy, &state, &options, now, paging_state = std::move(paging_state)] (foreign_ptr<lw_shared_ptr<query::result>> result, lw_shared_ptr<query::read_command> cmd) {
|
||||||
|
return process_base_query_results(std::move(result), std::move(cmd), proxy, state, options, now, std::move(paging_state));
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -714,7 +739,8 @@ select_statement::process_results(foreign_ptr<lw_shared_ptr<query::result>> resu
|
|||||||
const query_options& options,
|
const query_options& options,
|
||||||
gc_clock::time_point now)
|
gc_clock::time_point now)
|
||||||
{
|
{
|
||||||
bool fast_path = !needs_post_query_ordering() && _selection->is_trivial() && !_restrictions->need_filtering();
|
const bool restrictions_need_filtering = _restrictions->need_filtering();
|
||||||
|
const bool fast_path = !needs_post_query_ordering() && _selection->is_trivial() && !restrictions_need_filtering;
|
||||||
if (fast_path) {
|
if (fast_path) {
|
||||||
return make_shared<cql_transport::messages::result_message::rows>(result(
|
return make_shared<cql_transport::messages::result_message::rows>(result(
|
||||||
result_generator(_schema, std::move(results), std::move(cmd), _selection, _stats),
|
result_generator(_schema, std::move(results), std::move(cmd), _selection, _stats),
|
||||||
@@ -724,12 +750,12 @@ select_statement::process_results(foreign_ptr<lw_shared_ptr<query::result>> resu
|
|||||||
|
|
||||||
cql3::selection::result_set_builder builder(*_selection, now,
|
cql3::selection::result_set_builder builder(*_selection, now,
|
||||||
options.get_cql_serialization_format());
|
options.get_cql_serialization_format());
|
||||||
if (_restrictions->need_filtering()) {
|
if (restrictions_need_filtering) {
|
||||||
results->ensure_counts();
|
results->ensure_counts();
|
||||||
_stats.filtered_rows_read_total += *results->row_count();
|
_stats.filtered_rows_read_total += *results->row_count();
|
||||||
query::result_view::consume(*results, cmd->slice,
|
query::result_view::consume(*results, cmd->slice,
|
||||||
cql3::selection::result_set_builder::visitor(builder, *_schema,
|
cql3::selection::result_set_builder::visitor(builder, *_schema,
|
||||||
*_selection, cql3::selection::result_set_builder::restrictions_filter(_restrictions, options)));
|
*_selection, cql3::selection::result_set_builder::restrictions_filter(_restrictions, options, cmd->row_limit)));
|
||||||
} else {
|
} else {
|
||||||
query::result_view::consume(*results, cmd->slice,
|
query::result_view::consume(*results, cmd->slice,
|
||||||
cql3::selection::result_set_builder::visitor(builder, *_schema,
|
cql3::selection::result_set_builder::visitor(builder, *_schema,
|
||||||
@@ -745,7 +771,7 @@ select_statement::process_results(foreign_ptr<lw_shared_ptr<query::result>> resu
|
|||||||
rs->trim(cmd->row_limit);
|
rs->trim(cmd->row_limit);
|
||||||
}
|
}
|
||||||
update_stats_rows_read(rs->size());
|
update_stats_rows_read(rs->size());
|
||||||
_stats.filtered_rows_matched_total += _restrictions->need_filtering() ? rs->size() : 0;
|
_stats.filtered_rows_matched_total += restrictions_need_filtering ? rs->size() : 0;
|
||||||
return ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
|
return ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -774,7 +800,8 @@ indexed_table_select_statement::prepare(database& db,
|
|||||||
ordering_comparator_type ordering_comparator,
|
ordering_comparator_type ordering_comparator,
|
||||||
::shared_ptr<term> limit, cql_stats &stats)
|
::shared_ptr<term> limit, cql_stats &stats)
|
||||||
{
|
{
|
||||||
auto index_opt = find_idx(db, schema, restrictions);
|
auto& sim = db.find_column_family(schema).get_index_manager();
|
||||||
|
auto index_opt = restrictions->find_idx(sim);
|
||||||
if (!index_opt) {
|
if (!index_opt) {
|
||||||
throw std::runtime_error("No index found.");
|
throw std::runtime_error("No index found.");
|
||||||
}
|
}
|
||||||
@@ -798,24 +825,6 @@ indexed_table_select_statement::prepare(database& db,
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
stdx::optional<secondary_index::index> indexed_table_select_statement::find_idx(database& db,
|
|
||||||
schema_ptr schema,
|
|
||||||
::shared_ptr<restrictions::statement_restrictions> restrictions)
|
|
||||||
{
|
|
||||||
auto& sim = db.find_column_family(schema).get_index_manager();
|
|
||||||
for (::shared_ptr<cql3::restrictions::restrictions> restriction : restrictions->index_restrictions()) {
|
|
||||||
for (const auto& cdef : restriction->get_column_defs()) {
|
|
||||||
for (auto index : sim.list_indexes()) {
|
|
||||||
if (index.depends_on(*cdef)) {
|
|
||||||
return stdx::make_optional<secondary_index::index>(std::move(index));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return stdx::nullopt;
|
|
||||||
}
|
|
||||||
|
|
||||||
indexed_table_select_statement::indexed_table_select_statement(schema_ptr schema, uint32_t bound_terms,
|
indexed_table_select_statement::indexed_table_select_statement(schema_ptr schema, uint32_t bound_terms,
|
||||||
::shared_ptr<parameters> parameters,
|
::shared_ptr<parameters> parameters,
|
||||||
::shared_ptr<selection::selection> selection,
|
::shared_ptr<selection::selection> selection,
|
||||||
@@ -882,7 +891,6 @@ static void append_base_key_to_index_ck(std::vector<bytes_view>& exploded_index_
|
|||||||
auto paging_state_copy = ::make_shared<service::pager::paging_state>(service::pager::paging_state(*paging_state));
|
auto paging_state_copy = ::make_shared<service::pager::paging_state>(service::pager::paging_state(*paging_state));
|
||||||
paging_state_copy->set_partition_key(std::move(index_pk));
|
paging_state_copy->set_partition_key(std::move(index_pk));
|
||||||
paging_state_copy->set_clustering_key(std::move(index_ck));
|
paging_state_copy->set_clustering_key(std::move(index_ck));
|
||||||
paging_state_copy->set_remaining(query::max_rows);
|
|
||||||
return std::move(paging_state_copy);
|
return std::move(paging_state_copy);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -940,6 +948,60 @@ indexed_table_select_statement::do_execute(service::storage_proxy& proxy,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Aggregated and paged filtering needs to aggregate the results from all pages
|
||||||
|
// in order to avoid returning partial per-page results (issue #4540).
|
||||||
|
// It's a little bit more complicated than regular aggregation, because each paging state
|
||||||
|
// needs to be translated between the base table and the underlying view.
|
||||||
|
// The routine below keeps fetching pages from the underlying view, which are then
|
||||||
|
// used to fetch base rows, which go straight to the result set builder.
|
||||||
|
// A local, internal copy of query_options is kept in order to keep updating
|
||||||
|
// the paging state between requesting data from replicas.
|
||||||
|
const bool aggregate = _selection->is_aggregate();
|
||||||
|
if (aggregate) {
|
||||||
|
const bool restrictions_need_filtering = _restrictions->need_filtering();
|
||||||
|
return do_with(cql3::selection::result_set_builder(*_selection, now, options.get_cql_serialization_format()), std::make_unique<cql3::query_options>(cql3::query_options(options)),
|
||||||
|
[this, &options, &proxy, &state, now, whole_partitions, partition_slices, restrictions_need_filtering] (cql3::selection::result_set_builder& builder, std::unique_ptr<cql3::query_options>& internal_options) {
|
||||||
|
// page size is set to the internal count page size, regardless of the user-provided value
|
||||||
|
internal_options.reset(new cql3::query_options(std::move(internal_options), options.get_paging_state(), DEFAULT_COUNT_PAGE_SIZE));
|
||||||
|
return repeat([this, &builder, &options, &internal_options, &proxy, &state, now, whole_partitions, partition_slices, restrictions_need_filtering] () {
|
||||||
|
auto consume_results = [this, &builder, &options, &internal_options, restrictions_need_filtering] (foreign_ptr<lw_shared_ptr<query::result>> results, lw_shared_ptr<query::read_command> cmd) {
|
||||||
|
if (restrictions_need_filtering) {
|
||||||
|
query::result_view::consume(*results, cmd->slice, cql3::selection::result_set_builder::visitor(builder, *_schema, *_selection,
|
||||||
|
cql3::selection::result_set_builder::restrictions_filter(_restrictions, options, cmd->row_limit)));
|
||||||
|
} else {
|
||||||
|
query::result_view::consume(*results, cmd->slice, cql3::selection::result_set_builder::visitor(builder, *_schema, *_selection));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
if (whole_partitions || partition_slices) {
|
||||||
|
return find_index_partition_ranges(proxy, state, *internal_options).then(
|
||||||
|
[this, now, &state, &internal_options, &proxy, consume_results = std::move(consume_results)] (dht::partition_range_vector partition_ranges, ::shared_ptr<const service::pager::paging_state> paging_state) {
|
||||||
|
bool has_more_pages = paging_state && paging_state->get_remaining() > 0;
|
||||||
|
internal_options.reset(new cql3::query_options(std::move(internal_options), paging_state ? ::make_shared<service::pager::paging_state>(*paging_state) : nullptr));
|
||||||
|
return do_execute_base_query(proxy, std::move(partition_ranges), state, *internal_options, now, std::move(paging_state)).then(consume_results).then([has_more_pages] {
|
||||||
|
return stop_iteration(!has_more_pages);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
return find_index_clustering_rows(proxy, state, *internal_options).then(
|
||||||
|
[this, now, &state, &internal_options, &proxy, consume_results = std::move(consume_results)] (std::vector<primary_key> primary_keys, ::shared_ptr<const service::pager::paging_state> paging_state) {
|
||||||
|
bool has_more_pages = paging_state && paging_state->get_remaining() > 0;
|
||||||
|
internal_options.reset(new cql3::query_options(std::move(internal_options), paging_state ? ::make_shared<service::pager::paging_state>(*paging_state) : nullptr));
|
||||||
|
return this->do_execute_base_query(proxy, std::move(primary_keys), state, *internal_options, now, std::move(paging_state)).then(consume_results).then([has_more_pages] {
|
||||||
|
return stop_iteration(!has_more_pages);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}).then([this, &builder, restrictions_need_filtering] () {
|
||||||
|
auto rs = builder.build();
|
||||||
|
update_stats_rows_read(rs->size());
|
||||||
|
_stats.filtered_rows_matched_total += restrictions_need_filtering ? rs->size() : 0;
|
||||||
|
auto msg = ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
|
||||||
|
return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(std::move(msg));
|
||||||
|
});
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
if (whole_partitions || partition_slices) {
|
if (whole_partitions || partition_slices) {
|
||||||
// In this case, can use our normal query machinery, which retrieves
|
// In this case, can use our normal query machinery, which retrieves
|
||||||
// entire partitions or the same slice for many partitions.
|
// entire partitions or the same slice for many partitions.
|
||||||
@@ -1219,6 +1281,7 @@ std::unique_ptr<prepared_statement> select_statement::prepare(database& db, cql_
|
|||||||
}
|
}
|
||||||
|
|
||||||
check_needs_filtering(restrictions);
|
check_needs_filtering(restrictions);
|
||||||
|
ensure_filtering_columns_retrieval(db, selection, restrictions);
|
||||||
|
|
||||||
::shared_ptr<cql3::statements::select_statement> stmt;
|
::shared_ptr<cql3::statements::select_statement> stmt;
|
||||||
if (restrictions->uses_secondary_indexing()) {
|
if (restrictions->uses_secondary_indexing()) {
|
||||||
@@ -1357,7 +1420,7 @@ select_statement::get_ordering_comparator(schema_ptr schema,
|
|||||||
}
|
}
|
||||||
auto index = selection->index_of(*def);
|
auto index = selection->index_of(*def);
|
||||||
if (index < 0) {
|
if (index < 0) {
|
||||||
index = selection->add_column_for_ordering(*def);
|
index = selection->add_column_for_post_processing(*def);
|
||||||
}
|
}
|
||||||
|
|
||||||
sorters.emplace_back(index, def->type);
|
sorters.emplace_back(index, def->type);
|
||||||
@@ -1444,6 +1507,23 @@ void select_statement::check_needs_filtering(::shared_ptr<restrictions::statemen
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Adds columns that are needed for the purpose of filtering to the selection.
|
||||||
|
* The columns that are added to the selection are columns that
|
||||||
|
* are needed for filtering on the coordinator but are not part of the selection.
|
||||||
|
* The columns are added with a meta-data indicating they are not to be returned
|
||||||
|
* to the user.
|
||||||
|
*/
|
||||||
|
void select_statement::ensure_filtering_columns_retrieval(database& db,
|
||||||
|
::shared_ptr<selection::selection> selection,
|
||||||
|
::shared_ptr<restrictions::statement_restrictions> restrictions) {
|
||||||
|
for (auto&& cdef : restrictions->get_column_defs_for_filtering(db)) {
|
||||||
|
if (!selection->has_column(*cdef)) {
|
||||||
|
selection->add_column_for_post_processing(*cdef);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
bool select_statement::contains_alias(::shared_ptr<column_identifier> name) {
|
bool select_statement::contains_alias(::shared_ptr<column_identifier> name) {
|
||||||
return std::any_of(_select_clause.begin(), _select_clause.end(), [name] (auto raw) {
|
return std::any_of(_select_clause.begin(), _select_clause.end(), [name] (auto raw) {
|
||||||
return raw->alias && *name == *raw->alias;
|
return raw->alias && *name == *raw->alias;
|
||||||
|
|||||||
@@ -67,8 +67,8 @@ class select_statement : public cql_statement {
|
|||||||
public:
|
public:
|
||||||
using parameters = raw::select_statement::parameters;
|
using parameters = raw::select_statement::parameters;
|
||||||
using ordering_comparator_type = raw::select_statement::ordering_comparator_type;
|
using ordering_comparator_type = raw::select_statement::ordering_comparator_type;
|
||||||
protected:
|
|
||||||
static constexpr int DEFAULT_COUNT_PAGE_SIZE = 10000;
|
static constexpr int DEFAULT_COUNT_PAGE_SIZE = 10000;
|
||||||
|
protected:
|
||||||
static thread_local const ::shared_ptr<parameters> _default_parameters;
|
static thread_local const ::shared_ptr<parameters> _default_parameters;
|
||||||
schema_ptr _schema;
|
schema_ptr _schema;
|
||||||
uint32_t _bound_terms;
|
uint32_t _bound_terms;
|
||||||
@@ -186,10 +186,6 @@ public:
|
|||||||
schema_ptr view_schema);
|
schema_ptr view_schema);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
static stdx::optional<secondary_index::index> find_idx(database& db,
|
|
||||||
schema_ptr schema,
|
|
||||||
::shared_ptr<restrictions::statement_restrictions> restrictions);
|
|
||||||
|
|
||||||
virtual future<::shared_ptr<cql_transport::messages::result_message>> do_execute(service::storage_proxy& proxy,
|
virtual future<::shared_ptr<cql_transport::messages::result_message>> do_execute(service::storage_proxy& proxy,
|
||||||
service::query_state& state, const query_options& options) override;
|
service::query_state& state, const query_options& options) override;
|
||||||
|
|
||||||
@@ -214,6 +210,17 @@ private:
|
|||||||
gc_clock::time_point now,
|
gc_clock::time_point now,
|
||||||
::shared_ptr<const service::pager::paging_state> paging_state);
|
::shared_ptr<const service::pager::paging_state> paging_state);
|
||||||
|
|
||||||
|
lw_shared_ptr<query::read_command>
|
||||||
|
prepare_command_for_base_query(const query_options& options, service::query_state& state, gc_clock::time_point now, bool use_paging);
|
||||||
|
|
||||||
|
future<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>
|
||||||
|
do_execute_base_query(
|
||||||
|
service::storage_proxy& proxy,
|
||||||
|
dht::partition_range_vector&& partition_ranges,
|
||||||
|
service::query_state& state,
|
||||||
|
const query_options& options,
|
||||||
|
gc_clock::time_point now,
|
||||||
|
::shared_ptr<const service::pager::paging_state> paging_state);
|
||||||
future<shared_ptr<cql_transport::messages::result_message>>
|
future<shared_ptr<cql_transport::messages::result_message>>
|
||||||
execute_base_query(
|
execute_base_query(
|
||||||
service::storage_proxy& proxy,
|
service::storage_proxy& proxy,
|
||||||
@@ -223,6 +230,23 @@ private:
|
|||||||
gc_clock::time_point now,
|
gc_clock::time_point now,
|
||||||
::shared_ptr<const service::pager::paging_state> paging_state);
|
::shared_ptr<const service::pager::paging_state> paging_state);
|
||||||
|
|
||||||
|
// Function for fetching the selected columns from a list of clustering rows.
|
||||||
|
// It is currently used only in our Secondary Index implementation - ordinary
|
||||||
|
// CQL SELECT statements do not have the syntax to request a list of rows.
|
||||||
|
// FIXME: The current implementation is very inefficient - it requests each
|
||||||
|
// row separately (and, incrementally, in parallel). Even multiple rows from a single
|
||||||
|
// partition are requested separately. This last case can be easily improved,
|
||||||
|
// but to implement the general case (multiple rows from multiple partitions)
|
||||||
|
// efficiently, we will need more support from other layers.
|
||||||
|
// Keys are ordered in token order (see #3423)
|
||||||
|
future<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>
|
||||||
|
do_execute_base_query(
|
||||||
|
service::storage_proxy& proxy,
|
||||||
|
std::vector<primary_key>&& primary_keys,
|
||||||
|
service::query_state& state,
|
||||||
|
const query_options& options,
|
||||||
|
gc_clock::time_point now,
|
||||||
|
::shared_ptr<const service::pager::paging_state> paging_state);
|
||||||
future<shared_ptr<cql_transport::messages::result_message>>
|
future<shared_ptr<cql_transport::messages::result_message>>
|
||||||
execute_base_query(
|
execute_base_query(
|
||||||
service::storage_proxy& proxy,
|
service::storage_proxy& proxy,
|
||||||
|
|||||||
@@ -84,8 +84,11 @@ parse(const sstring& json_string, const std::vector<column_definition>& expected
|
|||||||
for (const auto& def : expected_receivers) {
|
for (const auto& def : expected_receivers) {
|
||||||
sstring cql_name = def.name_as_text();
|
sstring cql_name = def.name_as_text();
|
||||||
auto value_it = prepared_map.find(cql_name);
|
auto value_it = prepared_map.find(cql_name);
|
||||||
if (value_it == prepared_map.end() || value_it->second.isNull()) {
|
if (value_it == prepared_map.end()) {
|
||||||
|
continue;
|
||||||
|
} else if (value_it->second.isNull()) {
|
||||||
json_map.emplace(std::move(cql_name), bytes_opt{});
|
json_map.emplace(std::move(cql_name), bytes_opt{});
|
||||||
|
prepared_map.erase(value_it);
|
||||||
} else {
|
} else {
|
||||||
json_map.emplace(std::move(cql_name), def.type->from_json_object(value_it->second, sf));
|
json_map.emplace(std::move(cql_name), def.type->from_json_object(value_it->second, sf));
|
||||||
prepared_map.erase(value_it);
|
prepared_map.erase(value_it);
|
||||||
@@ -255,8 +258,12 @@ void insert_prepared_json_statement::execute_operations_for_key(mutation& m, con
|
|||||||
throw exceptions::invalid_request_exception(sprint("Cannot set the value of counter column %s in JSON", def.name_as_text()));
|
throw exceptions::invalid_request_exception(sprint("Cannot set the value of counter column %s in JSON", def.name_as_text()));
|
||||||
}
|
}
|
||||||
|
|
||||||
auto value = json_cache->at(def.name_as_text());
|
auto it = json_cache->find(def.name_as_text());
|
||||||
execute_set_value(m, prefix, params, def, value);
|
if (it != json_cache->end()) {
|
||||||
|
execute_set_value(m, prefix, params, def, it->second);
|
||||||
|
} else if (!_default_unset) {
|
||||||
|
execute_set_value(m, prefix, params, def, bytes_opt{});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -322,12 +329,14 @@ insert_statement::prepare_internal(database& db, schema_ptr schema,
|
|||||||
insert_json_statement::insert_json_statement( ::shared_ptr<cf_name> name,
|
insert_json_statement::insert_json_statement( ::shared_ptr<cf_name> name,
|
||||||
::shared_ptr<attributes::raw> attrs,
|
::shared_ptr<attributes::raw> attrs,
|
||||||
::shared_ptr<term::raw> json_value,
|
::shared_ptr<term::raw> json_value,
|
||||||
bool if_not_exists)
|
bool if_not_exists,
|
||||||
|
bool default_unset)
|
||||||
: raw::modification_statement{name, attrs, conditions_vector{}, if_not_exists, false}
|
: raw::modification_statement{name, attrs, conditions_vector{}, if_not_exists, false}
|
||||||
, _name(name)
|
, _name(name)
|
||||||
, _attrs(attrs)
|
, _attrs(attrs)
|
||||||
, _json_value(json_value)
|
, _json_value(json_value)
|
||||||
, _if_not_exists(if_not_exists) { }
|
, _if_not_exists(if_not_exists)
|
||||||
|
, _default_unset(default_unset) { }
|
||||||
|
|
||||||
::shared_ptr<cql3::statements::modification_statement>
|
::shared_ptr<cql3::statements::modification_statement>
|
||||||
insert_json_statement::prepare_internal(database& db, schema_ptr schema,
|
insert_json_statement::prepare_internal(database& db, schema_ptr schema,
|
||||||
@@ -337,7 +346,7 @@ insert_json_statement::prepare_internal(database& db, schema_ptr schema,
|
|||||||
auto json_column_placeholder = ::make_shared<column_identifier>("", true);
|
auto json_column_placeholder = ::make_shared<column_identifier>("", true);
|
||||||
auto prepared_json_value = _json_value->prepare(db, "", ::make_shared<column_specification>("", "", json_column_placeholder, utf8_type));
|
auto prepared_json_value = _json_value->prepare(db, "", ::make_shared<column_specification>("", "", json_column_placeholder, utf8_type));
|
||||||
prepared_json_value->collect_marker_specification(bound_names);
|
prepared_json_value->collect_marker_specification(bound_names);
|
||||||
return ::make_shared<cql3::statements::insert_prepared_json_statement>(bound_names->size(), schema, std::move(attrs), &stats.inserts, std::move(prepared_json_value));
|
return ::make_shared<cql3::statements::insert_prepared_json_statement>(bound_names->size(), schema, std::move(attrs), &stats.inserts, std::move(prepared_json_value), _default_unset);
|
||||||
}
|
}
|
||||||
|
|
||||||
update_statement::update_statement( ::shared_ptr<cf_name> name,
|
update_statement::update_statement( ::shared_ptr<cf_name> name,
|
||||||
|
|||||||
@@ -82,9 +82,10 @@ private:
|
|||||||
*/
|
*/
|
||||||
class insert_prepared_json_statement : public update_statement {
|
class insert_prepared_json_statement : public update_statement {
|
||||||
::shared_ptr<term> _term;
|
::shared_ptr<term> _term;
|
||||||
|
bool _default_unset;
|
||||||
public:
|
public:
|
||||||
insert_prepared_json_statement(uint32_t bound_terms, schema_ptr s, std::unique_ptr<attributes> attrs, uint64_t* cql_stats_counter_ptr, ::shared_ptr<term> t)
|
insert_prepared_json_statement(uint32_t bound_terms, schema_ptr s, std::unique_ptr<attributes> attrs, uint64_t* cql_stats_counter_ptr, ::shared_ptr<term> t, bool default_unset)
|
||||||
: update_statement(statement_type::INSERT, bound_terms, s, std::move(attrs), cql_stats_counter_ptr), _term(t) {
|
: update_statement(statement_type::INSERT, bound_terms, s, std::move(attrs), cql_stats_counter_ptr), _term(t), _default_unset(default_unset) {
|
||||||
_restrictions = ::make_shared<restrictions::statement_restrictions>(s, false);
|
_restrictions = ::make_shared<restrictions::statement_restrictions>(s, false);
|
||||||
}
|
}
|
||||||
private:
|
private:
|
||||||
|
|||||||
@@ -54,7 +54,7 @@ public:
|
|||||||
column->ks_name,
|
column->ks_name,
|
||||||
column->cf_name,
|
column->cf_name,
|
||||||
::make_shared<column_identifier>(sprint("%s[%d]", column->name, component), true),
|
::make_shared<column_identifier>(sprint("%s[%d]", column->name, component), true),
|
||||||
static_pointer_cast<const tuple_type_impl>(column->type)->type(component));
|
static_pointer_cast<const tuple_type_impl>(column->type->underlying_type())->type(component));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -112,7 +112,7 @@ public:
|
|||||||
|
|
||||||
private:
|
private:
|
||||||
void validate_assignable_to(database& db, const sstring& keyspace, shared_ptr<column_specification> receiver) {
|
void validate_assignable_to(database& db, const sstring& keyspace, shared_ptr<column_specification> receiver) {
|
||||||
auto tt = dynamic_pointer_cast<const tuple_type_impl>(receiver->type);
|
auto tt = dynamic_pointer_cast<const tuple_type_impl>(receiver->type->underlying_type());
|
||||||
if (!tt) {
|
if (!tt) {
|
||||||
throw exceptions::invalid_request_exception(sprint("Invalid tuple type literal for %s of type %s", receiver->name, receiver->type->as_cql3_type()));
|
throw exceptions::invalid_request_exception(sprint("Invalid tuple type literal for %s of type %s", receiver->name, receiver->type->as_cql3_type()));
|
||||||
}
|
}
|
||||||
|
|||||||
197
database.cc
197
database.cc
@@ -76,6 +76,8 @@
|
|||||||
#include "sstables/compaction_manager.hh"
|
#include "sstables/compaction_manager.hh"
|
||||||
#include "sstables/compaction_backlog_manager.hh"
|
#include "sstables/compaction_backlog_manager.hh"
|
||||||
#include "sstables/progress_monitor.hh"
|
#include "sstables/progress_monitor.hh"
|
||||||
|
#include "auth/common.hh"
|
||||||
|
#include "tracing/trace_keyspace_helper.hh"
|
||||||
|
|
||||||
#include "checked-file-impl.hh"
|
#include "checked-file-impl.hh"
|
||||||
#include "disk-error-handler.hh"
|
#include "disk-error-handler.hh"
|
||||||
@@ -178,6 +180,18 @@ bool is_system_keyspace(const sstring& name) {
|
|||||||
return system_keyspaces.find(name) != system_keyspaces.end();
|
return system_keyspaces.find(name) != system_keyspaces.end();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static const std::unordered_set<sstring> internal_keyspaces = {
|
||||||
|
db::system_distributed_keyspace::NAME,
|
||||||
|
db::system_keyspace::NAME,
|
||||||
|
db::schema_tables::NAME,
|
||||||
|
auth::meta::AUTH_KS,
|
||||||
|
tracing::trace_keyspace_helper::KEYSPACE_NAME
|
||||||
|
};
|
||||||
|
|
||||||
|
bool is_internal_keyspace(const sstring& name) {
|
||||||
|
return internal_keyspaces.find(name) != internal_keyspaces.end();
|
||||||
|
}
|
||||||
|
|
||||||
// Used for tests where the CF exists without a database object. We need to pass a valid
|
// Used for tests where the CF exists without a database object. We need to pass a valid
|
||||||
// dirty_memory manager in that case.
|
// dirty_memory manager in that case.
|
||||||
thread_local dirty_memory_manager default_dirty_memory_manager;
|
thread_local dirty_memory_manager default_dirty_memory_manager;
|
||||||
@@ -684,9 +698,11 @@ table::make_reader(schema_ptr s,
|
|||||||
return make_combined_reader(s, std::move(readers), fwd, fwd_mr);
|
return make_combined_reader(s, std::move(readers), fwd, fwd_mr);
|
||||||
}
|
}
|
||||||
|
|
||||||
sstables::shared_sstable
|
sstables::shared_sstable table::make_streaming_sstable_for_write(std::optional<sstring> subdir) {
|
||||||
table::make_streaming_sstable_for_write() {
|
|
||||||
sstring dir = _config.datadir;
|
sstring dir = _config.datadir;
|
||||||
|
if (subdir) {
|
||||||
|
dir += "/" + *subdir;
|
||||||
|
}
|
||||||
auto newtab = sstables::make_sstable(_schema,
|
auto newtab = sstables::make_sstable(_schema,
|
||||||
dir, calculate_generation_for_new_table(),
|
dir, calculate_generation_for_new_table(),
|
||||||
get_highest_supported_format(),
|
get_highest_supported_format(),
|
||||||
@@ -826,7 +842,11 @@ void table::add_sstable(sstables::shared_sstable sstable, const std::vector<unsi
|
|||||||
new_sstables->insert(sstable);
|
new_sstables->insert(sstable);
|
||||||
_sstables = std::move(new_sstables);
|
_sstables = std::move(new_sstables);
|
||||||
update_stats_for_new_sstable(sstable->bytes_on_disk(), shards_for_the_sstable);
|
update_stats_for_new_sstable(sstable->bytes_on_disk(), shards_for_the_sstable);
|
||||||
_compaction_strategy.get_backlog_tracker().add_sstable(sstable);
|
if (sstable->is_staging()) {
|
||||||
|
_sstables_staging.emplace(sstable->generation(), sstable);
|
||||||
|
} else {
|
||||||
|
_compaction_strategy.get_backlog_tracker().add_sstable(sstable);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
future<>
|
future<>
|
||||||
@@ -1082,12 +1102,14 @@ table::start() {
|
|||||||
future<>
|
future<>
|
||||||
table::stop() {
|
table::stop() {
|
||||||
return _async_gate.close().then([this] {
|
return _async_gate.close().then([this] {
|
||||||
return when_all(_memtables->request_flush(), _streaming_memtables->request_flush()).discard_result().finally([this] {
|
return when_all(await_pending_writes(), await_pending_reads()).discard_result().finally([this] {
|
||||||
return _compaction_manager.remove(this).then([this] {
|
return when_all(_memtables->request_flush(), _streaming_memtables->request_flush()).discard_result().finally([this] {
|
||||||
// Nest, instead of using when_all, so we don't lose any exceptions.
|
return _compaction_manager.remove(this).then([this] {
|
||||||
return _streaming_flush_gate.close();
|
// Nest, instead of using when_all, so we don't lose any exceptions.
|
||||||
}).then([this] {
|
return _streaming_flush_gate.close();
|
||||||
return _sstable_deletion_gate.close();
|
}).then([this] {
|
||||||
|
return _sstable_deletion_gate.close();
|
||||||
|
});
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
@@ -1346,6 +1368,7 @@ table::on_compaction_completion(const std::vector<sstables::shared_sstable>& new
|
|||||||
|
|
||||||
// This is done in the background, so we can consider this compaction completed.
|
// This is done in the background, so we can consider this compaction completed.
|
||||||
seastar::with_gate(_sstable_deletion_gate, [this, sstables_to_remove] {
|
seastar::with_gate(_sstable_deletion_gate, [this, sstables_to_remove] {
|
||||||
|
return with_semaphore(_sstable_deletion_sem, 1, [this, sstables_to_remove = std::move(sstables_to_remove)] {
|
||||||
return sstables::delete_atomically(sstables_to_remove, *get_large_partition_handler()).then_wrapped([this, sstables_to_remove] (future<> f) {
|
return sstables::delete_atomically(sstables_to_remove, *get_large_partition_handler()).then_wrapped([this, sstables_to_remove] (future<> f) {
|
||||||
std::exception_ptr eptr;
|
std::exception_ptr eptr;
|
||||||
try {
|
try {
|
||||||
@@ -1369,6 +1392,7 @@ table::on_compaction_completion(const std::vector<sstables::shared_sstable>& new
|
|||||||
return make_exception_future<>(eptr);
|
return make_exception_future<>(eptr);
|
||||||
}
|
}
|
||||||
return make_ready_future<>();
|
return make_ready_future<>();
|
||||||
|
});
|
||||||
}).then([this] {
|
}).then([this] {
|
||||||
// refresh underlying data source in row cache to prevent it from holding reference
|
// refresh underlying data source in row cache to prevent it from holding reference
|
||||||
// to sstables files which were previously deleted.
|
// to sstables files which were previously deleted.
|
||||||
@@ -1489,7 +1513,8 @@ future<> table::cleanup_sstables(sstables::compaction_descriptor descriptor) {
|
|||||||
return with_semaphore(sem, 1, [this, &sst] {
|
return with_semaphore(sem, 1, [this, &sst] {
|
||||||
// release reference to sstables cleaned up, otherwise space usage from their data and index
|
// release reference to sstables cleaned up, otherwise space usage from their data and index
|
||||||
// components cannot be reclaimed until all of them are cleaned.
|
// components cannot be reclaimed until all of them are cleaned.
|
||||||
return this->compact_sstables(sstables::compaction_descriptor({ std::move(sst) }, sst->get_sstable_level()), true);
|
auto sstable_level = sst->get_sstable_level();
|
||||||
|
return this->compact_sstables(sstables::compaction_descriptor({ std::move(sst) }, sstable_level), true);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
@@ -1613,7 +1638,9 @@ std::vector<sstables::shared_sstable> table::select_sstables(const dht::partitio
|
|||||||
|
|
||||||
std::vector<sstables::shared_sstable> table::candidates_for_compaction() const {
|
std::vector<sstables::shared_sstable> table::candidates_for_compaction() const {
|
||||||
return boost::copy_range<std::vector<sstables::shared_sstable>>(*get_sstables()
|
return boost::copy_range<std::vector<sstables::shared_sstable>>(*get_sstables()
|
||||||
| boost::adaptors::filtered([this] (auto& sst) { return !_sstables_need_rewrite.count(sst->generation()); }));
|
| boost::adaptors::filtered([this] (auto& sst) {
|
||||||
|
return !_sstables_need_rewrite.count(sst->generation()) && !_sstables_staging.count(sst->generation());
|
||||||
|
}));
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<sstables::shared_sstable> table::sstables_need_rewrite() const {
|
std::vector<sstables::shared_sstable> table::sstables_need_rewrite() const {
|
||||||
@@ -1671,9 +1698,9 @@ future<> distributed_loader::open_sstable(distributed<database>& db, sstables::e
|
|||||||
// to distribute evenly the resource usage among all shards.
|
// to distribute evenly the resource usage among all shards.
|
||||||
|
|
||||||
return db.invoke_on(column_family::calculate_shard_from_sstable_generation(comps.generation),
|
return db.invoke_on(column_family::calculate_shard_from_sstable_generation(comps.generation),
|
||||||
[&db, comps = std::move(comps), func = std::move(func), pc] (database& local) {
|
[&db, comps = std::move(comps), func = std::move(func), &pc] (database& local) {
|
||||||
|
|
||||||
return with_semaphore(local.sstable_load_concurrency_sem(), 1, [&db, &local, comps = std::move(comps), func = std::move(func), pc] {
|
return with_semaphore(local.sstable_load_concurrency_sem(), 1, [&db, &local, comps = std::move(comps), func = std::move(func), &pc] {
|
||||||
auto& cf = local.find_column_family(comps.ks, comps.cf);
|
auto& cf = local.find_column_family(comps.ks, comps.cf);
|
||||||
|
|
||||||
auto f = sstables::sstable::load_shared_components(cf.schema(), comps.sstdir, comps.generation, comps.version, comps.format, pc);
|
auto f = sstables::sstable::load_shared_components(cf.schema(), comps.sstdir, comps.generation, comps.version, comps.format, pc);
|
||||||
@@ -1969,6 +1996,12 @@ future<sstables::entry_descriptor> distributed_loader::probe_file(distributed<da
|
|||||||
}
|
}
|
||||||
auto cf_sstable_open = [sstdir, comps, fname] (column_family& cf, sstables::foreign_sstable_open_info info) {
|
auto cf_sstable_open = [sstdir, comps, fname] (column_family& cf, sstables::foreign_sstable_open_info info) {
|
||||||
cf.update_sstables_known_generation(comps.generation);
|
cf.update_sstables_known_generation(comps.generation);
|
||||||
|
if (shared_sstable sst = cf.get_staging_sstable(comps.generation)) {
|
||||||
|
dblog.warn("SSTable {} is already present in staging/ directory. Moving from staging will be retried.", sst->get_filename());
|
||||||
|
return seastar::async([sst = std::move(sst), comps = std::move(comps)] () {
|
||||||
|
sst->move_to_new_dir_in_thread(comps.sstdir, comps.generation);
|
||||||
|
});
|
||||||
|
}
|
||||||
{
|
{
|
||||||
auto i = boost::range::find_if(*cf._sstables->all(), [gen = comps.generation] (sstables::shared_sstable sst) { return sst->generation() == gen; });
|
auto i = boost::range::find_if(*cf._sstables->all(), [gen = comps.generation] (sstables::shared_sstable sst) { return sst->generation() == gen; });
|
||||||
if (i != cf._sstables->all()->end()) {
|
if (i != cf._sstables->all()->end()) {
|
||||||
@@ -2154,9 +2187,6 @@ database::database(const db::config& cfg, database_config dbcfg)
|
|||||||
[this] {
|
[this] {
|
||||||
++_stats->sstable_read_queue_overloaded;
|
++_stats->sstable_read_queue_overloaded;
|
||||||
return std::make_exception_ptr(std::runtime_error("sstable inactive read queue overloaded"));
|
return std::make_exception_ptr(std::runtime_error("sstable inactive read queue overloaded"));
|
||||||
},
|
|
||||||
[this] {
|
|
||||||
return _querier_cache.evict_one();
|
|
||||||
})
|
})
|
||||||
// No timeouts or queue length limits - a failure here can kill an entire repair.
|
// No timeouts or queue length limits - a failure here can kill an entire repair.
|
||||||
// Trust the caller to limit concurrency.
|
// Trust the caller to limit concurrency.
|
||||||
@@ -2168,12 +2198,11 @@ database::database(const db::config& cfg, database_config dbcfg)
|
|||||||
, _version(empty_version)
|
, _version(empty_version)
|
||||||
, _compaction_manager(make_compaction_manager(*_cfg, dbcfg))
|
, _compaction_manager(make_compaction_manager(*_cfg, dbcfg))
|
||||||
, _enable_incremental_backups(cfg.incremental_backups())
|
, _enable_incremental_backups(cfg.incremental_backups())
|
||||||
, _querier_cache(dbcfg.available_memory * 0.04)
|
, _querier_cache(_read_concurrency_sem, dbcfg.available_memory * 0.04)
|
||||||
, _large_partition_handler(std::make_unique<db::cql_table_large_partition_handler>(_cfg->compaction_large_partition_warning_threshold_mb()*1024*1024))
|
, _large_partition_handler(std::make_unique<db::cql_table_large_partition_handler>(_cfg->compaction_large_partition_warning_threshold_mb()*1024*1024))
|
||||||
, _result_memory_limiter(dbcfg.available_memory / 10)
|
, _result_memory_limiter(dbcfg.available_memory / 10)
|
||||||
{
|
{
|
||||||
local_schema_registry().init(*this); // TODO: we're never unbound.
|
local_schema_registry().init(*this); // TODO: we're never unbound.
|
||||||
_compaction_manager->start();
|
|
||||||
setup_metrics();
|
setup_metrics();
|
||||||
|
|
||||||
_row_cache_tracker.set_compaction_scheduling_group(dbcfg.memory_compaction_scheduling_group);
|
_row_cache_tracker.set_compaction_scheduling_group(dbcfg.memory_compaction_scheduling_group);
|
||||||
@@ -2204,6 +2233,10 @@ void backlog_controller::adjust() {
|
|||||||
|
|
||||||
float backlog_controller::backlog_of_shares(float shares) const {
|
float backlog_controller::backlog_of_shares(float shares) const {
|
||||||
size_t idx = 1;
|
size_t idx = 1;
|
||||||
|
// No control points means the controller is disabled.
|
||||||
|
if (_control_points.size() == 0) {
|
||||||
|
return 1.0f;
|
||||||
|
}
|
||||||
while ((idx < _control_points.size() - 1) && (_control_points[idx].output < shares)) {
|
while ((idx < _control_points.size() - 1) && (_control_points[idx].output < shares)) {
|
||||||
idx++;
|
idx++;
|
||||||
}
|
}
|
||||||
@@ -2299,6 +2332,9 @@ database::setup_metrics() {
|
|||||||
sm::description("Counts sstables that survived the clustering key filtering. "
|
sm::description("Counts sstables that survived the clustering key filtering. "
|
||||||
"High value indicates that bloom filter is not very efficient and still have to access a lot of sstables to get data.")),
|
"High value indicates that bloom filter is not very efficient and still have to access a lot of sstables to get data.")),
|
||||||
|
|
||||||
|
sm::make_derive("dropped_view_updates", _cf_stats.dropped_view_updates,
|
||||||
|
sm::description("Counts the number of view updates that have been dropped due to cluster overload. ")),
|
||||||
|
|
||||||
sm::make_derive("total_writes", _stats->total_writes,
|
sm::make_derive("total_writes", _stats->total_writes,
|
||||||
sm::description("Counts the total number of successful write operations performed by this shard.")),
|
sm::description("Counts the total number of successful write operations performed by this shard.")),
|
||||||
|
|
||||||
@@ -2316,6 +2352,9 @@ database::setup_metrics() {
|
|||||||
sm::description("Counts the total number of failed read operations. "
|
sm::description("Counts the total number of failed read operations. "
|
||||||
"Add the total_reads to this value to get the total amount of reads issued on this shard.")),
|
"Add the total_reads to this value to get the total amount of reads issued on this shard.")),
|
||||||
|
|
||||||
|
sm::make_current_bytes("view_update_backlog", [this] { return get_view_update_backlog().current; },
|
||||||
|
sm::description("Holds the current size in bytes of the pending view updates for all tables")),
|
||||||
|
|
||||||
sm::make_derive("querier_cache_lookups", _querier_cache.get_stats().lookups,
|
sm::make_derive("querier_cache_lookups", _querier_cache.get_stats().lookups,
|
||||||
sm::description("Counts querier cache lookups (paging queries)")),
|
sm::description("Counts querier cache lookups (paging queries)")),
|
||||||
|
|
||||||
@@ -2420,6 +2459,9 @@ database::setup_metrics() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
database::~database() {
|
database::~database() {
|
||||||
|
_read_concurrency_sem.clear_inactive_reads();
|
||||||
|
_streaming_concurrency_sem.clear_inactive_reads();
|
||||||
|
_system_read_concurrency_sem.clear_inactive_reads();
|
||||||
}
|
}
|
||||||
|
|
||||||
void database::update_version(const utils::UUID& version) {
|
void database::update_version(const utils::UUID& version) {
|
||||||
@@ -2450,6 +2492,8 @@ future<> distributed_loader::populate_keyspace(distributed<database>& db, sstrin
|
|||||||
auto sstdir = ks.column_family_directory(ksdir, cfname, uuid);
|
auto sstdir = ks.column_family_directory(ksdir, cfname, uuid);
|
||||||
dblog.info("Keyspace {}: Reading CF {} id={} version={}", ks_name, cfname, uuid, s->version());
|
dblog.info("Keyspace {}: Reading CF {} id={} version={}", ks_name, cfname, uuid, s->version());
|
||||||
return ks.make_directory_for_column_family(cfname, uuid).then([&db, sstdir, uuid, ks_name, cfname] {
|
return ks.make_directory_for_column_family(cfname, uuid).then([&db, sstdir, uuid, ks_name, cfname] {
|
||||||
|
return distributed_loader::populate_column_family(db, sstdir + "/staging", ks_name, cfname);
|
||||||
|
}).then([&db, sstdir, uuid, ks_name, cfname] {
|
||||||
return distributed_loader::populate_column_family(db, sstdir, ks_name, cfname);
|
return distributed_loader::populate_column_family(db, sstdir, ks_name, cfname);
|
||||||
}).handle_exception([ks_name, cfname, sstdir](std::exception_ptr eptr) {
|
}).handle_exception([ks_name, cfname, sstdir](std::exception_ptr eptr) {
|
||||||
std::string msg =
|
std::string msg =
|
||||||
@@ -2903,6 +2947,7 @@ keyspace::make_column_family_config(const schema& s, const db::config& db_config
|
|||||||
cfg.enable_metrics_reporting = db_config.enable_keyspace_column_family_metrics();
|
cfg.enable_metrics_reporting = db_config.enable_keyspace_column_family_metrics();
|
||||||
cfg.large_partition_handler = lp_handler;
|
cfg.large_partition_handler = lp_handler;
|
||||||
cfg.view_update_concurrency_semaphore = _config.view_update_concurrency_semaphore;
|
cfg.view_update_concurrency_semaphore = _config.view_update_concurrency_semaphore;
|
||||||
|
cfg.view_update_concurrency_semaphore_limit = _config.view_update_concurrency_semaphore_limit;
|
||||||
|
|
||||||
return cfg;
|
return cfg;
|
||||||
}
|
}
|
||||||
@@ -2930,6 +2975,7 @@ keyspace::make_directory_for_column_family(const sstring& name, utils::UUID uuid
|
|||||||
io_check(recursive_touch_directory, cfdir).get();
|
io_check(recursive_touch_directory, cfdir).get();
|
||||||
}
|
}
|
||||||
io_check(touch_directory, cfdirs[0] + "/upload").get();
|
io_check(touch_directory, cfdirs[0] + "/upload").get();
|
||||||
|
io_check(touch_directory, cfdirs[0] + "/staging").get();
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -3699,6 +3745,7 @@ database::make_keyspace_config(const keyspace_metadata& ksm) {
|
|||||||
cfg.enable_metrics_reporting = _cfg->enable_keyspace_column_family_metrics();
|
cfg.enable_metrics_reporting = _cfg->enable_keyspace_column_family_metrics();
|
||||||
|
|
||||||
cfg.view_update_concurrency_semaphore = &_view_update_concurrency_sem;
|
cfg.view_update_concurrency_semaphore = &_view_update_concurrency_sem;
|
||||||
|
cfg.view_update_concurrency_semaphore_limit = max_memory_pending_view_updates();
|
||||||
return cfg;
|
return cfg;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -3796,6 +3843,8 @@ database::stop() {
|
|||||||
return parallel_for_each(_column_families, [this] (auto& val_pair) {
|
return parallel_for_each(_column_families, [this] (auto& val_pair) {
|
||||||
return val_pair.second->stop();
|
return val_pair.second->stop();
|
||||||
});
|
});
|
||||||
|
}).then([this] {
|
||||||
|
return _view_update_concurrency_sem.wait(max_memory_pending_view_updates());
|
||||||
}).then([this] {
|
}).then([this] {
|
||||||
if (_commitlog != nullptr) {
|
if (_commitlog != nullptr) {
|
||||||
return _commitlog->release();
|
return _commitlog->release();
|
||||||
@@ -4051,6 +4100,7 @@ seal_snapshot(sstring jsondir) {
|
|||||||
|
|
||||||
future<> table::snapshot(sstring name) {
|
future<> table::snapshot(sstring name) {
|
||||||
return flush().then([this, name = std::move(name)]() {
|
return flush().then([this, name = std::move(name)]() {
|
||||||
|
return with_semaphore(_sstable_deletion_sem, 1, [this, name = std::move(name)]() {
|
||||||
auto tables = boost::copy_range<std::vector<sstables::shared_sstable>>(*_sstables->all());
|
auto tables = boost::copy_range<std::vector<sstables::shared_sstable>>(*_sstables->all());
|
||||||
return do_with(std::move(tables), [this, name](std::vector<sstables::shared_sstable> & tables) {
|
return do_with(std::move(tables), [this, name](std::vector<sstables::shared_sstable> & tables) {
|
||||||
auto jsondir = _config.datadir + "/snapshots/" + name;
|
auto jsondir = _config.datadir + "/snapshots/" + name;
|
||||||
@@ -4110,6 +4160,7 @@ future<> table::snapshot(sstring name) {
|
|||||||
});
|
});
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
});
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -4239,6 +4290,7 @@ future<> table::fail_streaming_mutations(utils::UUID plan_id) {
|
|||||||
_streaming_memtables_big.erase(it);
|
_streaming_memtables_big.erase(it);
|
||||||
return entry->flush_in_progress.close().then([this, entry] {
|
return entry->flush_in_progress.close().then([this, entry] {
|
||||||
for (auto&& sst : entry->sstables) {
|
for (auto&& sst : entry->sstables) {
|
||||||
|
sst.monitor->write_failed();
|
||||||
sst.sstable->mark_for_deletion();
|
sst.sstable->mark_for_deletion();
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
@@ -4309,6 +4361,8 @@ future<int64_t>
|
|||||||
table::disable_sstable_write() {
|
table::disable_sstable_write() {
|
||||||
_sstable_writes_disabled_at = std::chrono::steady_clock::now();
|
_sstable_writes_disabled_at = std::chrono::steady_clock::now();
|
||||||
return _sstables_lock.write_lock().then([this] {
|
return _sstables_lock.write_lock().then([this] {
|
||||||
|
// _sstable_deletion_sem must be acquired after _sstables_lock.write_lock
|
||||||
|
return _sstable_deletion_sem.wait().then([this] {
|
||||||
if (_sstables->all()->empty()) {
|
if (_sstables->all()->empty()) {
|
||||||
return make_ready_future<int64_t>(0);
|
return make_ready_future<int64_t>(0);
|
||||||
}
|
}
|
||||||
@@ -4317,9 +4371,19 @@ table::disable_sstable_write() {
|
|||||||
max = std::max(max, s->generation());
|
max = std::max(max, s->generation());
|
||||||
}
|
}
|
||||||
return make_ready_future<int64_t>(max);
|
return make_ready_future<int64_t>(max);
|
||||||
|
});
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::chrono::steady_clock::duration table::enable_sstable_write(int64_t new_generation) {
|
||||||
|
if (new_generation != -1) {
|
||||||
|
update_sstables_known_generation(new_generation);
|
||||||
|
}
|
||||||
|
_sstable_deletion_sem.signal();
|
||||||
|
_sstables_lock.write_unlock();
|
||||||
|
return std::chrono::steady_clock::now() - _sstable_writes_disabled_at;
|
||||||
|
}
|
||||||
|
|
||||||
std::ostream& operator<<(std::ostream& os, const user_types_metadata& m) {
|
std::ostream& operator<<(std::ostream& os, const user_types_metadata& m) {
|
||||||
os << "org.apache.cassandra.config.UTMetaData@" << &m;
|
os << "org.apache.cassandra.config.UTMetaData@" << &m;
|
||||||
return os;
|
return os;
|
||||||
@@ -4417,6 +4481,14 @@ std::vector<view_ptr> table::affected_views(const schema_ptr& base, const mutati
|
|||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static size_t memory_usage_of(const std::vector<frozen_mutation_and_schema>& ms) {
|
||||||
|
// Overhead of sending a view mutation, in terms of data structures used by the storage_proxy.
|
||||||
|
constexpr size_t base_overhead_bytes = 256;
|
||||||
|
return boost::accumulate(ms | boost::adaptors::transformed([] (const frozen_mutation_and_schema& m) {
|
||||||
|
return m.fm.representation().size();
|
||||||
|
}), size_t{base_overhead_bytes * ms.size()});
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Given some updates on the base table and the existing values for the rows affected by that update, generates the
|
* Given some updates on the base table and the existing values for the rows affected by that update, generates the
|
||||||
* mutations to be applied to the base table's views, and sends them to the paired view replicas.
|
* mutations to be applied to the base table's views, and sends them to the paired view replicas.
|
||||||
@@ -4433,75 +4505,15 @@ std::vector<view_ptr> table::affected_views(const schema_ptr& base, const mutati
|
|||||||
future<> table::generate_and_propagate_view_updates(const schema_ptr& base,
|
future<> table::generate_and_propagate_view_updates(const schema_ptr& base,
|
||||||
std::vector<view_ptr>&& views,
|
std::vector<view_ptr>&& views,
|
||||||
mutation&& m,
|
mutation&& m,
|
||||||
flat_mutation_reader_opt existings,
|
flat_mutation_reader_opt existings) const {
|
||||||
db::timeout_clock::time_point timeout) const {
|
|
||||||
auto base_token = m.token();
|
auto base_token = m.token();
|
||||||
return db::view::generate_view_updates(base,
|
return db::view::generate_view_updates(
|
||||||
std::move(views),
|
base,
|
||||||
flat_mutation_reader_from_mutations({std::move(m)}),
|
std::move(views),
|
||||||
std::move(existings)).then([this, timeout, base_token = std::move(base_token)] (auto&& updates) mutable {
|
flat_mutation_reader_from_mutations({std::move(m)}),
|
||||||
return seastar::get_units(*_config.view_update_concurrency_semaphore, 1, timeout).then(
|
std::move(existings)).then([this, base_token = std::move(base_token)] (std::vector<frozen_mutation_and_schema>&& updates) mutable {
|
||||||
[this, base_token = std::move(base_token), updates = std::move(updates)] (auto units) mutable {
|
auto units = seastar::consume_units(*_config.view_update_concurrency_semaphore, memory_usage_of(updates));
|
||||||
db::view::mutate_MV(std::move(base_token), std::move(updates), _view_stats).handle_exception([units = std::move(units)] (auto ignored) { });
|
db::view::mutate_MV(std::move(base_token), std::move(updates), _view_stats, std::move(units)).handle_exception([] (auto ignored) { });
|
||||||
});
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Given an update for the base table, calculates the set of potentially affected views,
|
|
||||||
* generates the relevant updates, and sends them to the paired view replicas.
|
|
||||||
*/
|
|
||||||
future<row_locker::lock_holder> table::push_view_replica_updates(const schema_ptr& s, const frozen_mutation& fm, db::timeout_clock::time_point timeout) const {
|
|
||||||
//FIXME: Avoid unfreezing here.
|
|
||||||
auto m = fm.unfreeze(s);
|
|
||||||
auto& base = schema();
|
|
||||||
m.upgrade(base);
|
|
||||||
auto views = affected_views(base, m);
|
|
||||||
if (views.empty()) {
|
|
||||||
return make_ready_future<row_locker::lock_holder>();
|
|
||||||
}
|
|
||||||
auto cr_ranges = db::view::calculate_affected_clustering_ranges(*base, m.decorated_key(), m.partition(), views);
|
|
||||||
if (cr_ranges.empty()) {
|
|
||||||
return generate_and_propagate_view_updates(base, std::move(views), std::move(m), { }, timeout).then([] {
|
|
||||||
// In this case we are not doing a read-before-write, just a
|
|
||||||
// write, so no lock is needed.
|
|
||||||
return make_ready_future<row_locker::lock_holder>();
|
|
||||||
});
|
|
||||||
}
|
|
||||||
// We read the whole set of regular columns in case the update now causes a base row to pass
|
|
||||||
// a view's filters, and a view happens to include columns that have no value in this update.
|
|
||||||
// Also, one of those columns can determine the lifetime of the base row, if it has a TTL.
|
|
||||||
auto columns = boost::copy_range<std::vector<column_id>>(
|
|
||||||
base->regular_columns() | boost::adaptors::transformed(std::mem_fn(&column_definition::id)));
|
|
||||||
query::partition_slice::option_set opts;
|
|
||||||
opts.set(query::partition_slice::option::send_partition_key);
|
|
||||||
opts.set(query::partition_slice::option::send_clustering_key);
|
|
||||||
opts.set(query::partition_slice::option::send_timestamp);
|
|
||||||
opts.set(query::partition_slice::option::send_ttl);
|
|
||||||
auto slice = query::partition_slice(
|
|
||||||
std::move(cr_ranges), { }, std::move(columns), std::move(opts), { }, cql_serialization_format::internal(), query::max_rows);
|
|
||||||
// Take the shard-local lock on the base-table row or partition as needed.
|
|
||||||
// We'll return this lock to the caller, which will release it after
|
|
||||||
// writing the base-table update.
|
|
||||||
future<row_locker::lock_holder> lockf = local_base_lock(base, m.decorated_key(), slice.default_row_ranges(), timeout);
|
|
||||||
return lockf.then([m = std::move(m), slice = std::move(slice), views = std::move(views), base, this, timeout] (row_locker::lock_holder lock) {
|
|
||||||
return do_with(
|
|
||||||
dht::partition_range::make_singular(m.decorated_key()),
|
|
||||||
std::move(slice),
|
|
||||||
std::move(m),
|
|
||||||
[base, views = std::move(views), lock = std::move(lock), this, timeout] (auto& pk, auto& slice, auto& m) mutable {
|
|
||||||
auto reader = this->make_reader(
|
|
||||||
base,
|
|
||||||
pk,
|
|
||||||
slice,
|
|
||||||
service::get_local_sstable_query_read_priority());
|
|
||||||
return this->generate_and_propagate_view_updates(base, std::move(views), std::move(m), std::move(reader), timeout).then([lock = std::move(lock)] () mutable {
|
|
||||||
// return the local partition/row lock we have taken so it
|
|
||||||
// remains locked until the caller is done modifying this
|
|
||||||
// partition/row and destroys the lock object.
|
|
||||||
return std::move(lock);
|
|
||||||
});
|
|
||||||
});
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -4606,8 +4618,17 @@ future<> table::populate_views(
|
|||||||
schema,
|
schema,
|
||||||
std::move(views),
|
std::move(views),
|
||||||
std::move(reader),
|
std::move(reader),
|
||||||
{ }).then([base_token = std::move(base_token), this] (auto&& updates) {
|
{ }).then([base_token = std::move(base_token), this] (std::vector<frozen_mutation_and_schema>&& updates) mutable {
|
||||||
return db::view::mutate_MV(std::move(base_token), std::move(updates), _view_stats);
|
size_t update_size = memory_usage_of(updates);
|
||||||
|
size_t units_to_wait_for = std::min(_config.view_update_concurrency_semaphore_limit, update_size);
|
||||||
|
return seastar::get_units(*_config.view_update_concurrency_semaphore, units_to_wait_for).then(
|
||||||
|
[base_token = std::move(base_token),
|
||||||
|
updates = std::move(updates),
|
||||||
|
units_to_consume = update_size - units_to_wait_for,
|
||||||
|
this] (db::timeout_semaphore_units&& units) mutable {
|
||||||
|
units.adopt(seastar::consume_units(*_config.view_update_concurrency_semaphore, units_to_consume));
|
||||||
|
return db::view::mutate_MV(std::move(base_token), std::move(updates), _view_stats, std::move(units));
|
||||||
|
});
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
70
database.hh
70
database.hh
@@ -77,6 +77,7 @@
|
|||||||
#include <seastar/core/metrics_registration.hh>
|
#include <seastar/core/metrics_registration.hh>
|
||||||
#include "tracing/trace_state.hh"
|
#include "tracing/trace_state.hh"
|
||||||
#include "db/view/view.hh"
|
#include "db/view/view.hh"
|
||||||
|
#include "db/view/view_update_backlog.hh"
|
||||||
#include "db/view/row_locking.hh"
|
#include "db/view/row_locking.hh"
|
||||||
#include "lister.hh"
|
#include "lister.hh"
|
||||||
#include "utils/phased_barrier.hh"
|
#include "utils/phased_barrier.hh"
|
||||||
@@ -279,6 +280,9 @@ struct cf_stats {
|
|||||||
int64_t clustering_filter_fast_path_count = 0;
|
int64_t clustering_filter_fast_path_count = 0;
|
||||||
// how many sstables survived the clustering key checks
|
// how many sstables survived the clustering key checks
|
||||||
int64_t surviving_sstables_after_clustering_filter = 0;
|
int64_t surviving_sstables_after_clustering_filter = 0;
|
||||||
|
|
||||||
|
// How many view updates were dropped due to overload.
|
||||||
|
int64_t dropped_view_updates = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
class cache_temperature {
|
class cache_temperature {
|
||||||
@@ -298,6 +302,8 @@ public:
|
|||||||
class table;
|
class table;
|
||||||
using column_family = table;
|
using column_family = table;
|
||||||
|
|
||||||
|
class database_sstable_write_monitor;
|
||||||
|
|
||||||
class table : public enable_lw_shared_from_this<table> {
|
class table : public enable_lw_shared_from_this<table> {
|
||||||
public:
|
public:
|
||||||
struct config {
|
struct config {
|
||||||
@@ -323,6 +329,7 @@ public:
|
|||||||
bool enable_metrics_reporting = false;
|
bool enable_metrics_reporting = false;
|
||||||
db::large_partition_handler* large_partition_handler;
|
db::large_partition_handler* large_partition_handler;
|
||||||
db::timeout_semaphore* view_update_concurrency_semaphore;
|
db::timeout_semaphore* view_update_concurrency_semaphore;
|
||||||
|
size_t view_update_concurrency_semaphore_limit;
|
||||||
};
|
};
|
||||||
struct no_commitlog {};
|
struct no_commitlog {};
|
||||||
struct stats {
|
struct stats {
|
||||||
@@ -395,7 +402,7 @@ private:
|
|||||||
// plan memtables and the resulting sstables are not made visible until
|
// plan memtables and the resulting sstables are not made visible until
|
||||||
// the streaming is complete.
|
// the streaming is complete.
|
||||||
struct monitored_sstable {
|
struct monitored_sstable {
|
||||||
std::unique_ptr<sstables::write_monitor> monitor;
|
std::unique_ptr<database_sstable_write_monitor> monitor;
|
||||||
sstables::shared_sstable sstable;
|
sstables::shared_sstable sstable;
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -432,8 +439,16 @@ private:
|
|||||||
// but for correct compaction we need to start the compaction only after
|
// but for correct compaction we need to start the compaction only after
|
||||||
// reading all sstables.
|
// reading all sstables.
|
||||||
std::unordered_map<uint64_t, sstables::shared_sstable> _sstables_need_rewrite;
|
std::unordered_map<uint64_t, sstables::shared_sstable> _sstables_need_rewrite;
|
||||||
|
// sstables that should not be compacted (e.g. because they need to be used
|
||||||
|
// to generate view updates later)
|
||||||
|
std::unordered_map<uint64_t, sstables::shared_sstable> _sstables_staging;
|
||||||
// Control background fibers waiting for sstables to be deleted
|
// Control background fibers waiting for sstables to be deleted
|
||||||
seastar::gate _sstable_deletion_gate;
|
seastar::gate _sstable_deletion_gate;
|
||||||
|
// This semaphore ensures that an operation like snapshot won't have its selected
|
||||||
|
// sstables deleted by compaction in parallel, a race condition which could
|
||||||
|
// easily result in failure.
|
||||||
|
// Locking order: must be acquired either independently or after _sstables_lock
|
||||||
|
seastar::semaphore _sstable_deletion_sem = {1};
|
||||||
// There are situations in which we need to stop writing sstables. Flushers will take
|
// There are situations in which we need to stop writing sstables. Flushers will take
|
||||||
// the read lock, and the ones that wish to stop that process will take the write lock.
|
// the read lock, and the ones that wish to stop that process will take the write lock.
|
||||||
rwlock _sstables_lock;
|
rwlock _sstables_lock;
|
||||||
@@ -485,6 +500,11 @@ private:
|
|||||||
utils::phased_barrier _pending_reads_phaser;
|
utils::phased_barrier _pending_reads_phaser;
|
||||||
public:
|
public:
|
||||||
future<> add_sstable_and_update_cache(sstables::shared_sstable sst);
|
future<> add_sstable_and_update_cache(sstables::shared_sstable sst);
|
||||||
|
void move_sstable_from_staging_in_thread(sstables::shared_sstable sst);
|
||||||
|
sstables::shared_sstable get_staging_sstable(uint64_t generation) {
|
||||||
|
auto it = _sstables_staging.find(generation);
|
||||||
|
return it != _sstables_staging.end() ? it->second : nullptr;
|
||||||
|
}
|
||||||
private:
|
private:
|
||||||
void update_stats_for_new_sstable(uint64_t disk_space_used_by_sstable, const std::vector<unsigned>& shards_for_the_sstable) noexcept;
|
void update_stats_for_new_sstable(uint64_t disk_space_used_by_sstable, const std::vector<unsigned>& shards_for_the_sstable) noexcept;
|
||||||
// Adds new sstable to the set of sstables
|
// Adds new sstable to the set of sstables
|
||||||
@@ -618,6 +638,14 @@ public:
|
|||||||
tracing::trace_state_ptr trace_state = nullptr,
|
tracing::trace_state_ptr trace_state = nullptr,
|
||||||
streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no,
|
streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no,
|
||||||
mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes) const;
|
mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes) const;
|
||||||
|
flat_mutation_reader make_reader_excluding_sstable(schema_ptr schema,
|
||||||
|
sstables::shared_sstable sst,
|
||||||
|
const dht::partition_range& range,
|
||||||
|
const query::partition_slice& slice,
|
||||||
|
const io_priority_class& pc = default_priority_class(),
|
||||||
|
tracing::trace_state_ptr trace_state = nullptr,
|
||||||
|
streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no,
|
||||||
|
mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes) const;
|
||||||
|
|
||||||
flat_mutation_reader make_reader(schema_ptr schema, const dht::partition_range& range = query::full_partition_range) const {
|
flat_mutation_reader make_reader(schema_ptr schema, const dht::partition_range& range = query::full_partition_range) const {
|
||||||
auto& full_slice = schema->full_slice();
|
auto& full_slice = schema->full_slice();
|
||||||
@@ -632,9 +660,13 @@ public:
|
|||||||
flat_mutation_reader make_streaming_reader(schema_ptr schema,
|
flat_mutation_reader make_streaming_reader(schema_ptr schema,
|
||||||
const dht::partition_range_vector& ranges) const;
|
const dht::partition_range_vector& ranges) const;
|
||||||
|
|
||||||
sstables::shared_sstable make_streaming_sstable_for_write();
|
sstables::shared_sstable make_streaming_sstable_for_write(std::optional<sstring> subdir = {});
|
||||||
|
sstables::shared_sstable make_streaming_staging_sstable() {
|
||||||
|
return make_streaming_sstable_for_write("staging");
|
||||||
|
}
|
||||||
|
|
||||||
mutation_source as_mutation_source() const;
|
mutation_source as_mutation_source() const;
|
||||||
|
mutation_source as_mutation_source_excluding(sstables::shared_sstable sst) const;
|
||||||
|
|
||||||
void set_virtual_reader(mutation_source virtual_reader) {
|
void set_virtual_reader(mutation_source virtual_reader) {
|
||||||
_virtual_reader = std::move(virtual_reader);
|
_virtual_reader = std::move(virtual_reader);
|
||||||
@@ -706,13 +738,7 @@ public:
|
|||||||
|
|
||||||
// SSTable writes are now allowed again, and generation is updated to new_generation if != -1
|
// SSTable writes are now allowed again, and generation is updated to new_generation if != -1
|
||||||
// returns the amount of microseconds elapsed since we disabled writes.
|
// returns the amount of microseconds elapsed since we disabled writes.
|
||||||
std::chrono::steady_clock::duration enable_sstable_write(int64_t new_generation) {
|
std::chrono::steady_clock::duration enable_sstable_write(int64_t new_generation);
|
||||||
if (new_generation != -1) {
|
|
||||||
update_sstables_known_generation(new_generation);
|
|
||||||
}
|
|
||||||
_sstables_lock.write_unlock();
|
|
||||||
return std::chrono::steady_clock::now() - _sstable_writes_disabled_at;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Make sure the generation numbers are sequential, starting from "start".
|
// Make sure the generation numbers are sequential, starting from "start".
|
||||||
// Generations before "start" are left untouched.
|
// Generations before "start" are left untouched.
|
||||||
@@ -842,6 +868,8 @@ public:
|
|||||||
void clear_views();
|
void clear_views();
|
||||||
const std::vector<view_ptr>& views() const;
|
const std::vector<view_ptr>& views() const;
|
||||||
future<row_locker::lock_holder> push_view_replica_updates(const schema_ptr& s, const frozen_mutation& fm, db::timeout_clock::time_point timeout) const;
|
future<row_locker::lock_holder> push_view_replica_updates(const schema_ptr& s, const frozen_mutation& fm, db::timeout_clock::time_point timeout) const;
|
||||||
|
future<row_locker::lock_holder> push_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout) const;
|
||||||
|
future<row_locker::lock_holder> stream_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout, sstables::shared_sstable excluded_sstable) const;
|
||||||
void add_coordinator_read_latency(utils::estimated_histogram::duration latency);
|
void add_coordinator_read_latency(utils::estimated_histogram::duration latency);
|
||||||
std::chrono::milliseconds get_coordinator_read_latency_percentile(double percentile);
|
std::chrono::milliseconds get_coordinator_read_latency_percentile(double percentile);
|
||||||
|
|
||||||
@@ -859,13 +887,17 @@ public:
|
|||||||
dht::token base_token,
|
dht::token base_token,
|
||||||
flat_mutation_reader&&);
|
flat_mutation_reader&&);
|
||||||
|
|
||||||
|
reader_concurrency_semaphore& read_concurrency_semaphore() {
|
||||||
|
return *_config.read_concurrency_semaphore;
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
future<row_locker::lock_holder> do_push_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout, mutation_source&& source, const io_priority_class& io_priority) const;
|
||||||
std::vector<view_ptr> affected_views(const schema_ptr& base, const mutation& update) const;
|
std::vector<view_ptr> affected_views(const schema_ptr& base, const mutation& update) const;
|
||||||
future<> generate_and_propagate_view_updates(const schema_ptr& base,
|
future<> generate_and_propagate_view_updates(const schema_ptr& base,
|
||||||
std::vector<view_ptr>&& views,
|
std::vector<view_ptr>&& views,
|
||||||
mutation&& m,
|
mutation&& m,
|
||||||
flat_mutation_reader_opt existings,
|
flat_mutation_reader_opt existings) const;
|
||||||
db::timeout_clock::time_point timeout) const;
|
|
||||||
|
|
||||||
mutable row_locker _row_locker;
|
mutable row_locker _row_locker;
|
||||||
future<row_locker::lock_holder> local_base_lock(
|
future<row_locker::lock_holder> local_base_lock(
|
||||||
@@ -1055,6 +1087,7 @@ public:
|
|||||||
seastar::scheduling_group streaming_scheduling_group;
|
seastar::scheduling_group streaming_scheduling_group;
|
||||||
bool enable_metrics_reporting = false;
|
bool enable_metrics_reporting = false;
|
||||||
db::timeout_semaphore* view_update_concurrency_semaphore = nullptr;
|
db::timeout_semaphore* view_update_concurrency_semaphore = nullptr;
|
||||||
|
size_t view_update_concurrency_semaphore_limit;
|
||||||
};
|
};
|
||||||
private:
|
private:
|
||||||
std::unique_ptr<locator::abstract_replication_strategy> _replication_strategy;
|
std::unique_ptr<locator::abstract_replication_strategy> _replication_strategy;
|
||||||
@@ -1156,6 +1189,7 @@ private:
|
|||||||
static const size_t max_count_system_concurrent_reads{10};
|
static const size_t max_count_system_concurrent_reads{10};
|
||||||
size_t max_memory_system_concurrent_reads() { return _dbcfg.available_memory * 0.02; };
|
size_t max_memory_system_concurrent_reads() { return _dbcfg.available_memory * 0.02; };
|
||||||
static constexpr size_t max_concurrent_sstable_loads() { return 3; }
|
static constexpr size_t max_concurrent_sstable_loads() { return 3; }
|
||||||
|
size_t max_memory_pending_view_updates() const { return _dbcfg.available_memory * 0.1; }
|
||||||
|
|
||||||
struct db_stats {
|
struct db_stats {
|
||||||
uint64_t total_writes = 0;
|
uint64_t total_writes = 0;
|
||||||
@@ -1192,7 +1226,7 @@ private:
|
|||||||
|
|
||||||
semaphore _sstable_load_concurrency_sem{max_concurrent_sstable_loads()};
|
semaphore _sstable_load_concurrency_sem{max_concurrent_sstable_loads()};
|
||||||
|
|
||||||
db::timeout_semaphore _view_update_concurrency_sem{100}; // Stand-in hack for #2538
|
db::timeout_semaphore _view_update_concurrency_sem{max_memory_pending_view_updates()};
|
||||||
|
|
||||||
cache_tracker _row_cache_tracker;
|
cache_tracker _row_cache_tracker;
|
||||||
|
|
||||||
@@ -1399,6 +1433,12 @@ public:
|
|||||||
std::unordered_set<sstring> get_initial_tokens();
|
std::unordered_set<sstring> get_initial_tokens();
|
||||||
std::experimental::optional<gms::inet_address> get_replace_address();
|
std::experimental::optional<gms::inet_address> get_replace_address();
|
||||||
bool is_replacing();
|
bool is_replacing();
|
||||||
|
reader_concurrency_semaphore& user_read_concurrency_sem() {
|
||||||
|
return _read_concurrency_sem;
|
||||||
|
}
|
||||||
|
reader_concurrency_semaphore& streaming_read_concurrency_sem() {
|
||||||
|
return _streaming_concurrency_sem;
|
||||||
|
}
|
||||||
reader_concurrency_semaphore& system_keyspace_read_concurrency_sem() {
|
reader_concurrency_semaphore& system_keyspace_read_concurrency_sem() {
|
||||||
return _system_read_concurrency_sem;
|
return _system_read_concurrency_sem;
|
||||||
}
|
}
|
||||||
@@ -1423,11 +1463,17 @@ public:
|
|||||||
return _querier_cache;
|
return _querier_cache;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
db::view::update_backlog get_view_update_backlog() const {
|
||||||
|
return {max_memory_pending_view_updates() - _view_update_concurrency_sem.current(), max_memory_pending_view_updates()};
|
||||||
|
}
|
||||||
|
|
||||||
friend class distributed_loader;
|
friend class distributed_loader;
|
||||||
};
|
};
|
||||||
|
|
||||||
future<> update_schema_version_and_announce(distributed<service::storage_proxy>& proxy);
|
future<> update_schema_version_and_announce(distributed<service::storage_proxy>& proxy);
|
||||||
|
|
||||||
|
bool is_internal_keyspace(const sstring& name);
|
||||||
|
|
||||||
class distributed_loader {
|
class distributed_loader {
|
||||||
public:
|
public:
|
||||||
static void reshard(distributed<database>& db, sstring ks_name, sstring cf_name);
|
static void reshard(distributed<database>& db, sstring ks_name, sstring cf_name);
|
||||||
|
|||||||
@@ -395,10 +395,8 @@ std::unordered_set<gms::inet_address> db::batchlog_manager::endpoint_filter(cons
|
|||||||
|
|
||||||
// grab a random member of up to two racks
|
// grab a random member of up to two racks
|
||||||
for (auto& rack : racks) {
|
for (auto& rack : racks) {
|
||||||
auto rack_members = validated.bucket(rack);
|
|
||||||
auto n = validated.bucket_size(rack_members);
|
|
||||||
auto cpy = boost::copy_range<std::vector<gms::inet_address>>(validated.equal_range(rack) | boost::adaptors::map_values);
|
auto cpy = boost::copy_range<std::vector<gms::inet_address>>(validated.equal_range(rack) | boost::adaptors::map_values);
|
||||||
std::uniform_int_distribution<size_t> rdist(0, n - 1);
|
std::uniform_int_distribution<size_t> rdist(0, cpy.size() - 1);
|
||||||
result.emplace(cpy[rdist(_e1)]);
|
result.emplace(cpy[rdist(_e1)]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -689,6 +689,8 @@ public:
|
|||||||
// but all previous write/flush pairs.
|
// but all previous write/flush pairs.
|
||||||
return _pending_ops.run_with_ordered_post_op(rp, [this, size, off, buf = std::move(buf)]() mutable { ///////////////////////////////////////////////////
|
return _pending_ops.run_with_ordered_post_op(rp, [this, size, off, buf = std::move(buf)]() mutable { ///////////////////////////////////////////////////
|
||||||
auto view = fragmented_temporary_buffer::view(buf);
|
auto view = fragmented_temporary_buffer::view(buf);
|
||||||
|
view.remove_suffix(buf.size_bytes() - size);
|
||||||
|
assert(size == view.size_bytes());
|
||||||
return do_with(off, view, [&] (uint64_t& off, fragmented_temporary_buffer::view& view) {
|
return do_with(off, view, [&] (uint64_t& off, fragmented_temporary_buffer::view& view) {
|
||||||
if (view.empty()) {
|
if (view.empty()) {
|
||||||
return make_ready_future<>();
|
return make_ready_future<>();
|
||||||
@@ -1187,6 +1189,34 @@ void db::commitlog::segment_manager::flush_segments(bool force) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// \brief Helper for ensuring a file is closed if an exception is thrown.
|
||||||
|
///
|
||||||
|
/// The file provided by the file_fut future is passed to func.
|
||||||
|
/// * If func throws an exception E, the file is closed and we return
|
||||||
|
/// a failed future with E.
|
||||||
|
/// * If func returns a value V, the file is not closed and we return
|
||||||
|
/// a future with V.
|
||||||
|
/// Note that when an exception is not thrown, it is the
|
||||||
|
/// responsibility of func to make sure the file will be closed. It
|
||||||
|
/// can close the file itself, return it, or store it somewhere.
|
||||||
|
///
|
||||||
|
/// \tparam Func The type of function this wraps
|
||||||
|
/// \param file_fut A future that produces a file
|
||||||
|
/// \param func A function that uses a file
|
||||||
|
/// \return A future that passes the file produced by file_fut to func
|
||||||
|
/// and closes it if func fails
|
||||||
|
template <typename Func>
|
||||||
|
static auto close_on_failure(future<file> file_fut, Func func) {
|
||||||
|
return file_fut.then([func = std::move(func)](file f) {
|
||||||
|
return futurize_apply(func, f).handle_exception([f] (std::exception_ptr e) mutable {
|
||||||
|
return f.close().then_wrapped([f, e = std::move(e)] (future<> x) {
|
||||||
|
using futurator = futurize<std::result_of_t<Func(file)>>;
|
||||||
|
return futurator::make_exception_future(e);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager::allocate_segment(bool active) {
|
future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager::allocate_segment(bool active) {
|
||||||
static const auto flags = open_flags::wo | open_flags::create;
|
static const auto flags = open_flags::wo | open_flags::create;
|
||||||
|
|
||||||
@@ -1217,7 +1247,7 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
|
|||||||
return fut;
|
return fut;
|
||||||
});
|
});
|
||||||
|
|
||||||
return fut.then([this, d, active, filename](file f) {
|
return close_on_failure(std::move(fut), [this, d, active, filename] (file f) {
|
||||||
f = make_checked_file(commit_error_handler, f);
|
f = make_checked_file(commit_error_handler, f);
|
||||||
// xfs doesn't like files extended betond eof, so enlarge the file
|
// xfs doesn't like files extended betond eof, so enlarge the file
|
||||||
return f.truncate(max_size).then([this, d, active, f, filename] () mutable {
|
return f.truncate(max_size).then([this, d, active, f, filename] () mutable {
|
||||||
@@ -1673,14 +1703,14 @@ const db::commitlog::config& db::commitlog::active_config() const {
|
|||||||
// No commit_io_check needed in the log reader since the database will fail
|
// No commit_io_check needed in the log reader since the database will fail
|
||||||
// on error at startup if required
|
// on error at startup if required
|
||||||
future<std::unique_ptr<subscription<temporary_buffer<char>, db::replay_position>>>
|
future<std::unique_ptr<subscription<temporary_buffer<char>, db::replay_position>>>
|
||||||
db::commitlog::read_log_file(const sstring& filename, commit_load_reader_func next, position_type off, const db::extensions* exts) {
|
db::commitlog::read_log_file(const sstring& filename, seastar::io_priority_class read_io_prio_class, commit_load_reader_func next, position_type off, const db::extensions* exts) {
|
||||||
struct work {
|
struct work {
|
||||||
private:
|
private:
|
||||||
file_input_stream_options make_file_input_stream_options() {
|
file_input_stream_options make_file_input_stream_options(seastar::io_priority_class read_io_prio_class) {
|
||||||
file_input_stream_options fo;
|
file_input_stream_options fo;
|
||||||
fo.buffer_size = db::commitlog::segment::default_size;
|
fo.buffer_size = db::commitlog::segment::default_size;
|
||||||
fo.read_ahead = 10;
|
fo.read_ahead = 10;
|
||||||
fo.io_priority_class = service::get_local_commitlog_priority();
|
fo.io_priority_class = read_io_prio_class;
|
||||||
return fo;
|
return fo;
|
||||||
}
|
}
|
||||||
public:
|
public:
|
||||||
@@ -1699,8 +1729,8 @@ db::commitlog::read_log_file(const sstring& filename, commit_load_reader_func ne
|
|||||||
bool header = true;
|
bool header = true;
|
||||||
bool failed = false;
|
bool failed = false;
|
||||||
|
|
||||||
work(file f, position_type o = 0)
|
work(file f, seastar::io_priority_class read_io_prio_class, position_type o = 0)
|
||||||
: f(f), fin(make_file_input_stream(f, 0, make_file_input_stream_options())), start_off(o) {
|
: f(f), fin(make_file_input_stream(f, 0, make_file_input_stream_options(read_io_prio_class))), start_off(o) {
|
||||||
}
|
}
|
||||||
work(work&&) = default;
|
work(work&&) = default;
|
||||||
|
|
||||||
@@ -1755,7 +1785,7 @@ db::commitlog::read_log_file(const sstring& filename, commit_load_reader_func ne
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (magic != segment::segment_magic) {
|
if (magic != segment::segment_magic) {
|
||||||
throw std::invalid_argument("Not a scylla format commitlog file");
|
throw invalid_segment_format();
|
||||||
}
|
}
|
||||||
crc32_nbo crc;
|
crc32_nbo crc;
|
||||||
crc.process(ver);
|
crc.process(ver);
|
||||||
@@ -1764,7 +1794,7 @@ db::commitlog::read_log_file(const sstring& filename, commit_load_reader_func ne
|
|||||||
|
|
||||||
auto cs = crc.checksum();
|
auto cs = crc.checksum();
|
||||||
if (cs != checksum) {
|
if (cs != checksum) {
|
||||||
throw std::runtime_error("Checksum error in file header");
|
throw header_checksum_error();
|
||||||
}
|
}
|
||||||
|
|
||||||
this->id = id;
|
this->id = id;
|
||||||
@@ -1918,9 +1948,9 @@ db::commitlog::read_log_file(const sstring& filename, commit_load_reader_func ne
|
|||||||
return fut;
|
return fut;
|
||||||
});
|
});
|
||||||
|
|
||||||
return fut.then([off, next](file f) {
|
return fut.then([off, next, read_io_prio_class] (file f) {
|
||||||
f = make_checked_file(commit_error_handler, std::move(f));
|
f = make_checked_file(commit_error_handler, std::move(f));
|
||||||
auto w = make_lw_shared<work>(std::move(f), off);
|
auto w = make_lw_shared<work>(std::move(f), read_io_prio_class, off);
|
||||||
auto ret = w->s.listen(next);
|
auto ret = w->s.listen(next);
|
||||||
|
|
||||||
w->s.started().then(std::bind(&work::read_file, w.get())).then([w] {
|
w->s.started().then(std::bind(&work::read_file, w.get())).then([w] {
|
||||||
|
|||||||
@@ -342,20 +342,42 @@ public:
|
|||||||
|
|
||||||
typedef std::function<future<>(temporary_buffer<char>, replay_position)> commit_load_reader_func;
|
typedef std::function<future<>(temporary_buffer<char>, replay_position)> commit_load_reader_func;
|
||||||
|
|
||||||
class segment_data_corruption_error: public std::runtime_error {
|
class segment_error : public std::exception {};
|
||||||
|
|
||||||
|
class segment_data_corruption_error: public segment_error {
|
||||||
|
std::string _msg;
|
||||||
public:
|
public:
|
||||||
segment_data_corruption_error(std::string msg, uint64_t s)
|
segment_data_corruption_error(std::string msg, uint64_t s)
|
||||||
: std::runtime_error(msg), _bytes(s) {
|
: _msg(std::move(msg)), _bytes(s) {
|
||||||
}
|
}
|
||||||
uint64_t bytes() const {
|
uint64_t bytes() const {
|
||||||
return _bytes;
|
return _bytes;
|
||||||
}
|
}
|
||||||
|
virtual const char* what() const noexcept {
|
||||||
|
return _msg.c_str();
|
||||||
|
}
|
||||||
private:
|
private:
|
||||||
uint64_t _bytes;
|
uint64_t _bytes;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
class invalid_segment_format : public segment_error {
|
||||||
|
static constexpr const char* _msg = "Not a scylla format commitlog file";
|
||||||
|
public:
|
||||||
|
virtual const char* what() const noexcept {
|
||||||
|
return _msg;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class header_checksum_error : public segment_error {
|
||||||
|
static constexpr const char* _msg = "Checksum error in file header";
|
||||||
|
public:
|
||||||
|
virtual const char* what() const noexcept {
|
||||||
|
return _msg;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
static future<std::unique_ptr<subscription<temporary_buffer<char>, replay_position>>> read_log_file(
|
static future<std::unique_ptr<subscription<temporary_buffer<char>, replay_position>>> read_log_file(
|
||||||
const sstring&, commit_load_reader_func, position_type = 0, const db::extensions* = nullptr);
|
const sstring&, seastar::io_priority_class read_io_prio_class, commit_load_reader_func, position_type = 0, const db::extensions* = nullptr);
|
||||||
private:
|
private:
|
||||||
commitlog(config);
|
commitlog(config);
|
||||||
|
|
||||||
|
|||||||
@@ -34,7 +34,8 @@ public:
|
|||||||
commitlog_entry(stdx::optional<column_mapping> mapping, frozen_mutation&& mutation)
|
commitlog_entry(stdx::optional<column_mapping> mapping, frozen_mutation&& mutation)
|
||||||
: _mapping(std::move(mapping)), _mutation(std::move(mutation)) { }
|
: _mapping(std::move(mapping)), _mutation(std::move(mutation)) { }
|
||||||
const stdx::optional<column_mapping>& mapping() const { return _mapping; }
|
const stdx::optional<column_mapping>& mapping() const { return _mapping; }
|
||||||
const frozen_mutation& mutation() const { return _mutation; }
|
const frozen_mutation& mutation() const & { return _mutation; }
|
||||||
|
frozen_mutation&& mutation() && { return std::move(_mutation); }
|
||||||
};
|
};
|
||||||
|
|
||||||
class commitlog_entry_writer {
|
class commitlog_entry_writer {
|
||||||
@@ -80,5 +81,6 @@ public:
|
|||||||
commitlog_entry_reader(const temporary_buffer<char>& buffer);
|
commitlog_entry_reader(const temporary_buffer<char>& buffer);
|
||||||
|
|
||||||
const stdx::optional<column_mapping>& get_column_mapping() const { return _ce.mapping(); }
|
const stdx::optional<column_mapping>& get_column_mapping() const { return _ce.mapping(); }
|
||||||
const frozen_mutation& mutation() const { return _ce.mutation(); }
|
const frozen_mutation& mutation() const & { return _ce.mutation(); }
|
||||||
|
frozen_mutation&& mutation() && { return std::move(_ce).mutation(); }
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -58,6 +58,7 @@
|
|||||||
#include "converting_mutation_partition_applier.hh"
|
#include "converting_mutation_partition_applier.hh"
|
||||||
#include "schema_registry.hh"
|
#include "schema_registry.hh"
|
||||||
#include "commitlog_entry.hh"
|
#include "commitlog_entry.hh"
|
||||||
|
#include "service/priority_manager.hh"
|
||||||
|
|
||||||
static logging::logger rlogger("commitlog_replayer");
|
static logging::logger rlogger("commitlog_replayer");
|
||||||
|
|
||||||
@@ -163,7 +164,7 @@ future<> db::commitlog_replayer::impl::init() {
|
|||||||
// Get all truncation records for the CF and initialize max rps if
|
// Get all truncation records for the CF and initialize max rps if
|
||||||
// present. Cannot do this on demand, as there may be no sstables to
|
// present. Cannot do this on demand, as there may be no sstables to
|
||||||
// mark the CF as "needed".
|
// mark the CF as "needed".
|
||||||
return db::system_keyspace::get_truncated_position(uuid).then([&map, &uuid](std::vector<db::replay_position> tpps) {
|
return db::system_keyspace::get_truncated_position(uuid).then([&map, uuid](std::vector<db::replay_position> tpps) {
|
||||||
for (auto& p : tpps) {
|
for (auto& p : tpps) {
|
||||||
rlogger.trace("CF {} truncated at {}", uuid, p);
|
rlogger.trace("CF {} truncated at {}", uuid, p);
|
||||||
auto& pp = map[p.shard_id()][uuid];
|
auto& pp = map[p.shard_id()][uuid];
|
||||||
@@ -223,7 +224,7 @@ db::commitlog_replayer::impl::recover(sstring file, const sstring& fname_prefix)
|
|||||||
auto s = make_lw_shared<stats>();
|
auto s = make_lw_shared<stats>();
|
||||||
auto& exts = _qp.local().db().local().get_config().extensions();
|
auto& exts = _qp.local().db().local().get_config().extensions();
|
||||||
|
|
||||||
return db::commitlog::read_log_file(file,
|
return db::commitlog::read_log_file(file, service::get_local_commitlog_priority(),
|
||||||
std::bind(&impl::process, this, s.get(), std::placeholders::_1,
|
std::bind(&impl::process, this, s.get(), std::placeholders::_1,
|
||||||
std::placeholders::_2), p, &exts).then([](auto s) {
|
std::placeholders::_2), p, &exts).then([](auto s) {
|
||||||
auto f = s->done();
|
auto f = s->done();
|
||||||
|
|||||||
@@ -102,6 +102,8 @@ db::config::config()
|
|||||||
db::config::~config()
|
db::config::~config()
|
||||||
{}
|
{}
|
||||||
|
|
||||||
|
const sstring db::config::default_tls_priority("SECURE128:-VERS-TLS1.0");
|
||||||
|
|
||||||
namespace utils {
|
namespace utils {
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
|
|||||||
16
db/config.hh
16
db/config.hh
@@ -155,6 +155,9 @@ public:
|
|||||||
val(hints_directory, sstring, "/var/lib/scylla/hints", Used, \
|
val(hints_directory, sstring, "/var/lib/scylla/hints", Used, \
|
||||||
"The directory where hints files are stored if hinted handoff is enabled." \
|
"The directory where hints files are stored if hinted handoff is enabled." \
|
||||||
) \
|
) \
|
||||||
|
val(view_hints_directory, sstring, "/var/lib/scylla/view_hints", Used, \
|
||||||
|
"The directory where materialized-view updates are stored while a view replica is unreachable." \
|
||||||
|
) \
|
||||||
val(saved_caches_directory, sstring, "/var/lib/scylla/saved_caches", Unused, \
|
val(saved_caches_directory, sstring, "/var/lib/scylla/saved_caches", Unused, \
|
||||||
"The directory location where table key and row caches are stored." \
|
"The directory location where table key and row caches are stored." \
|
||||||
) \
|
) \
|
||||||
@@ -453,7 +456,7 @@ public:
|
|||||||
"The maximum number of tombstones a query can scan before aborting." \
|
"The maximum number of tombstones a query can scan before aborting." \
|
||||||
) \
|
) \
|
||||||
/* Network timeout settings */ \
|
/* Network timeout settings */ \
|
||||||
val(range_request_timeout_in_ms, uint32_t, 10000, Unused, \
|
val(range_request_timeout_in_ms, uint32_t, 10000, Used, \
|
||||||
"The time in milliseconds that the coordinator waits for sequential or index scans to complete." \
|
"The time in milliseconds that the coordinator waits for sequential or index scans to complete." \
|
||||||
) \
|
) \
|
||||||
val(read_request_timeout_in_ms, uint32_t, 5000, Used, \
|
val(read_request_timeout_in_ms, uint32_t, 5000, Used, \
|
||||||
@@ -472,7 +475,7 @@ public:
|
|||||||
"The time in milliseconds that the coordinator waits for write operations to complete.\n" \
|
"The time in milliseconds that the coordinator waits for write operations to complete.\n" \
|
||||||
"Related information: About hinted handoff writes" \
|
"Related information: About hinted handoff writes" \
|
||||||
) \
|
) \
|
||||||
val(request_timeout_in_ms, uint32_t, 10000, Unused, \
|
val(request_timeout_in_ms, uint32_t, 10000, Used, \
|
||||||
"The default timeout for other, miscellaneous operations.\n" \
|
"The default timeout for other, miscellaneous operations.\n" \
|
||||||
"Related information: About hinted handoff writes" \
|
"Related information: About hinted handoff writes" \
|
||||||
) \
|
) \
|
||||||
@@ -578,7 +581,7 @@ public:
|
|||||||
val(dynamic_snitch_update_interval_in_ms, uint32_t, 100, Unused, \
|
val(dynamic_snitch_update_interval_in_ms, uint32_t, 100, Unused, \
|
||||||
"The time interval for how often the snitch calculates node scores. Because score calculation is CPU intensive, be careful when reducing this interval." \
|
"The time interval for how often the snitch calculates node scores. Because score calculation is CPU intensive, be careful when reducing this interval." \
|
||||||
) \
|
) \
|
||||||
val(hinted_handoff_enabled, sstring, "false", Used, \
|
val(hinted_handoff_enabled, sstring, "true", Used, \
|
||||||
"Enable or disable hinted handoff. To enable per data center, add data center list. For example: hinted_handoff_enabled: DC1,DC2. A hint indicates that the write needs to be replayed to an unavailable node. " \
|
"Enable or disable hinted handoff. To enable per data center, add data center list. For example: hinted_handoff_enabled: DC1,DC2. A hint indicates that the write needs to be replayed to an unavailable node. " \
|
||||||
"Related information: About hinted handoff writes" \
|
"Related information: About hinted handoff writes" \
|
||||||
) \
|
) \
|
||||||
@@ -621,7 +624,7 @@ public:
|
|||||||
val(thrift_framed_transport_size_in_mb, uint32_t, 15, Unused, \
|
val(thrift_framed_transport_size_in_mb, uint32_t, 15, Unused, \
|
||||||
"Frame size (maximum field length) for Thrift. The frame is the row or part of the row the application is inserting." \
|
"Frame size (maximum field length) for Thrift. The frame is the row or part of the row the application is inserting." \
|
||||||
) \
|
) \
|
||||||
val(thrift_max_message_length_in_mb, uint32_t, 16, Unused, \
|
val(thrift_max_message_length_in_mb, uint32_t, 16, Used, \
|
||||||
"The maximum length of a Thrift message in megabytes, including all fields and internal Thrift overhead (1 byte of overhead for each frame). Message length is usually used in conjunction with batches. A frame length greater than or equal to 24 accommodates a batch with four inserts, each of which is 24 bytes. The required message length is greater than or equal to 24+24+24+24+4 (number of frames)." \
|
"The maximum length of a Thrift message in megabytes, including all fields and internal Thrift overhead (1 byte of overhead for each frame). Message length is usually used in conjunction with batches. A frame length greater than or equal to 24 accommodates a batch with four inserts, each of which is 24 bytes. The required message length is greater than or equal to 24+24+24+24+4 (number of frames)." \
|
||||||
) \
|
) \
|
||||||
/* Security properties */ \
|
/* Security properties */ \
|
||||||
@@ -739,7 +742,8 @@ public:
|
|||||||
" Performance is affected to some extent as a result. Useful to help debugging problems that may arise at another layers.") \
|
" Performance is affected to some extent as a result. Useful to help debugging problems that may arise at another layers.") \
|
||||||
val(cpu_scheduler, bool, true, Used, "Enable cpu scheduling") \
|
val(cpu_scheduler, bool, true, Used, "Enable cpu scheduling") \
|
||||||
val(view_building, bool, true, Used, "Enable view building; should only be set to false when the node is experience issues due to view building") \
|
val(view_building, bool, true, Used, "Enable view building; should only be set to false when the node is experience issues due to view building") \
|
||||||
val(enable_sstables_mc_format, bool, false, Used, "Enable SSTables 'mc' format to be used as the default file format; FOR TESTING PURPOSES ONLY - TO BE REMOVED BEFORE RELEASE") \
|
val(enable_sstables_mc_format, bool, false, Used, "Enable SSTables 'mc' format to be used as the default file format") \
|
||||||
|
val(abort_on_internal_error, bool, false, Used, "Abort the server instead of throwing exception when internal invariants are violated.") \
|
||||||
/* done! */
|
/* done! */
|
||||||
|
|
||||||
#define _make_value_member(name, type, deflt, status, desc, ...) \
|
#define _make_value_member(name, type, deflt, status, desc, ...) \
|
||||||
@@ -753,6 +757,8 @@ public:
|
|||||||
add_options(boost::program_options::options_description_easy_init&);
|
add_options(boost::program_options::options_description_easy_init&);
|
||||||
|
|
||||||
const db::extensions& extensions() const;
|
const db::extensions& extensions() const;
|
||||||
|
|
||||||
|
static const sstring default_tls_priority;
|
||||||
private:
|
private:
|
||||||
template<typename T>
|
template<typename T>
|
||||||
struct log_legacy_value : public named_value<T, value_status::Used> {
|
struct log_legacy_value : public named_value<T, value_status::Used> {
|
||||||
|
|||||||
@@ -35,6 +35,7 @@
|
|||||||
#include "disk-error-handler.hh"
|
#include "disk-error-handler.hh"
|
||||||
#include "lister.hh"
|
#include "lister.hh"
|
||||||
#include "db/timeout_clock.hh"
|
#include "db/timeout_clock.hh"
|
||||||
|
#include "service/priority_manager.hh"
|
||||||
|
|
||||||
using namespace std::literals::chrono_literals;
|
using namespace std::literals::chrono_literals;
|
||||||
|
|
||||||
@@ -78,6 +79,12 @@ void manager::register_metrics(const sstring& group_name) {
|
|||||||
|
|
||||||
sm::make_derive("sent", _stats.sent,
|
sm::make_derive("sent", _stats.sent,
|
||||||
sm::description("Number of sent hints.")),
|
sm::description("Number of sent hints.")),
|
||||||
|
|
||||||
|
sm::make_derive("discarded", _stats.discarded,
|
||||||
|
sm::description("Number of hints that were discarded during sending (too old, schema changed, etc.).")),
|
||||||
|
|
||||||
|
sm::make_derive("corrupted_files", _stats.corrupted_files,
|
||||||
|
sm::description("Number of hints files that were discarded during sending because the file was corrupted.")),
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -95,6 +102,7 @@ future<> manager::start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr
|
|||||||
return compute_hints_dir_device_id();
|
return compute_hints_dir_device_id();
|
||||||
}).then([this] {
|
}).then([this] {
|
||||||
_strorage_service_anchor->register_subscriber(this);
|
_strorage_service_anchor->register_subscriber(this);
|
||||||
|
set_started();
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -105,12 +113,12 @@ future<> manager::stop() {
|
|||||||
_strorage_service_anchor->unregister_subscriber(this);
|
_strorage_service_anchor->unregister_subscriber(this);
|
||||||
}
|
}
|
||||||
|
|
||||||
_stopping = true;
|
set_stopping();
|
||||||
|
|
||||||
return _draining_eps_gate.close().finally([this] {
|
return _draining_eps_gate.close().finally([this] {
|
||||||
return parallel_for_each(_ep_managers, [] (auto& pair) {
|
return parallel_for_each(_ep_managers, [] (auto& pair) {
|
||||||
return pair.second.stop();
|
return pair.second.stop();
|
||||||
}).finally([this] {
|
}).finally([this] {
|
||||||
_ep_managers.clear();
|
_ep_managers.clear();
|
||||||
manager_logger.info("Stopped");
|
manager_logger.info("Stopped");
|
||||||
}).discard_result();
|
}).discard_result();
|
||||||
@@ -231,6 +239,8 @@ future<> manager::end_point_hints_manager::stop(drain should_drain) noexcept {
|
|||||||
manager::end_point_hints_manager::end_point_hints_manager(const key_type& key, manager& shard_manager)
|
manager::end_point_hints_manager::end_point_hints_manager(const key_type& key, manager& shard_manager)
|
||||||
: _key(key)
|
: _key(key)
|
||||||
, _shard_manager(shard_manager)
|
, _shard_manager(shard_manager)
|
||||||
|
, _file_update_mutex_ptr(make_lw_shared<seastar::shared_mutex>())
|
||||||
|
, _file_update_mutex(*_file_update_mutex_ptr)
|
||||||
, _state(state_set::of<state::stopped>())
|
, _state(state_set::of<state::stopped>())
|
||||||
, _hints_dir(_shard_manager.hints_dir() / format("{}", _key).c_str())
|
, _hints_dir(_shard_manager.hints_dir() / format("{}", _key).c_str())
|
||||||
, _sender(*this, _shard_manager.local_storage_proxy(), _shard_manager.local_db(), _shard_manager.local_gossiper())
|
, _sender(*this, _shard_manager.local_storage_proxy(), _shard_manager.local_db(), _shard_manager.local_gossiper())
|
||||||
@@ -239,6 +249,8 @@ manager::end_point_hints_manager::end_point_hints_manager(const key_type& key, m
|
|||||||
manager::end_point_hints_manager::end_point_hints_manager(end_point_hints_manager&& other)
|
manager::end_point_hints_manager::end_point_hints_manager(end_point_hints_manager&& other)
|
||||||
: _key(other._key)
|
: _key(other._key)
|
||||||
, _shard_manager(other._shard_manager)
|
, _shard_manager(other._shard_manager)
|
||||||
|
, _file_update_mutex_ptr(std::move(other._file_update_mutex_ptr))
|
||||||
|
, _file_update_mutex(*_file_update_mutex_ptr)
|
||||||
, _state(other._state)
|
, _state(other._state)
|
||||||
, _hints_dir(std::move(other._hints_dir))
|
, _hints_dir(std::move(other._hints_dir))
|
||||||
, _sender(other._sender, *this)
|
, _sender(other._sender, *this)
|
||||||
@@ -277,7 +289,7 @@ inline bool manager::have_ep_manager(ep_key_type ep) const noexcept {
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool manager::store_hint(ep_key_type ep, schema_ptr s, lw_shared_ptr<const frozen_mutation> fm, tracing::trace_state_ptr tr_state) noexcept {
|
bool manager::store_hint(ep_key_type ep, schema_ptr s, lw_shared_ptr<const frozen_mutation> fm, tracing::trace_state_ptr tr_state) noexcept {
|
||||||
if (_stopping || !can_hint_for(ep)) {
|
if (stopping() || !started() || !can_hint_for(ep)) {
|
||||||
manager_logger.trace("Can't store a hint to {}", ep);
|
manager_logger.trace("Can't store a hint to {}", ep);
|
||||||
++_stats.dropped;
|
++_stats.dropped;
|
||||||
return false;
|
return false;
|
||||||
@@ -380,7 +392,7 @@ future<timespec> manager::end_point_hints_manager::sender::get_last_file_modific
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
future<> manager::end_point_hints_manager::sender::do_send_one_mutation(mutation m, const std::vector<gms::inet_address>& natural_endpoints) noexcept {
|
future<> manager::end_point_hints_manager::sender::do_send_one_mutation(frozen_mutation_and_schema m, const std::vector<gms::inet_address>& natural_endpoints) noexcept {
|
||||||
return futurize_apply([this, m = std::move(m), &natural_endpoints] () mutable -> future<> {
|
return futurize_apply([this, m = std::move(m), &natural_endpoints] () mutable -> future<> {
|
||||||
// The fact that we send with CL::ALL in both cases below ensures that new hints are not going
|
// The fact that we send with CL::ALL in both cases below ensures that new hints are not going
|
||||||
// to be generated as a result of hints sending.
|
// to be generated as a result of hints sending.
|
||||||
@@ -392,7 +404,8 @@ future<> manager::end_point_hints_manager::sender::do_send_one_mutation(mutation
|
|||||||
// FIXME: using 1h as infinite timeout. If a node is down, we should get an
|
// FIXME: using 1h as infinite timeout. If a node is down, we should get an
|
||||||
// unavailable exception.
|
// unavailable exception.
|
||||||
auto timeout = db::timeout_clock::now() + 1h;
|
auto timeout = db::timeout_clock::now() + 1h;
|
||||||
return _proxy.mutate({std::move(m)}, consistency_level::ALL, timeout, nullptr);
|
//FIXME: Add required frozen_mutation overloads
|
||||||
|
return _proxy.mutate({m.fm.unfreeze(m.s)}, consistency_level::ALL, timeout, nullptr);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@@ -418,21 +431,19 @@ bool manager::end_point_hints_manager::sender::can_send() noexcept {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
mutation manager::end_point_hints_manager::sender::get_mutation(lw_shared_ptr<send_one_file_ctx> ctx_ptr, temporary_buffer<char>& buf) {
|
frozen_mutation_and_schema manager::end_point_hints_manager::sender::get_mutation(lw_shared_ptr<send_one_file_ctx> ctx_ptr, temporary_buffer<char>& buf) {
|
||||||
hint_entry_reader hr(buf);
|
hint_entry_reader hr(buf);
|
||||||
auto& fm = hr.mutation();
|
auto& fm = hr.mutation();
|
||||||
auto& cm = get_column_mapping(std::move(ctx_ptr), fm, hr);
|
auto& cm = get_column_mapping(std::move(ctx_ptr), fm, hr);
|
||||||
auto& cf = _db.find_column_family(fm.column_family_id());
|
auto schema = _db.find_schema(fm.column_family_id());
|
||||||
|
|
||||||
if (cf.schema()->version() != fm.schema_version()) {
|
if (schema->version() != fm.schema_version()) {
|
||||||
mutation m(cf.schema(), fm.decorated_key(*cf.schema()));
|
mutation m(schema, fm.decorated_key(*schema));
|
||||||
converting_mutation_partition_applier v(cm, *cf.schema(), m.partition());
|
converting_mutation_partition_applier v(cm, *schema, m.partition());
|
||||||
fm.partition().accept(cm, v);
|
fm.partition().accept(cm, v);
|
||||||
|
return {freeze(m), std::move(schema)};
|
||||||
return std::move(m);
|
|
||||||
} else {
|
|
||||||
return fm.unfreeze(cf.schema());
|
|
||||||
}
|
}
|
||||||
|
return {std::move(hr).mutation(), std::move(schema)};
|
||||||
}
|
}
|
||||||
|
|
||||||
const column_mapping& manager::end_point_hints_manager::sender::get_column_mapping(lw_shared_ptr<send_one_file_ctx> ctx_ptr, const frozen_mutation& fm, const hint_entry_reader& hr) {
|
const column_mapping& manager::end_point_hints_manager::sender::get_column_mapping(lw_shared_ptr<send_one_file_ctx> ctx_ptr, const frozen_mutation& fm, const hint_entry_reader& hr) {
|
||||||
@@ -502,35 +513,42 @@ bool manager::check_dc_for(ep_key_type ep) const noexcept {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void manager::drain_for(gms::inet_address endpoint) {
|
void manager::drain_for(gms::inet_address endpoint) {
|
||||||
if (_stopping) {
|
if (stopping()) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
manager_logger.trace("on_leave_cluster: {} is removed/decommissioned", endpoint);
|
manager_logger.trace("on_leave_cluster: {} is removed/decommissioned", endpoint);
|
||||||
|
|
||||||
with_gate(_draining_eps_gate, [this, endpoint] {
|
with_gate(_draining_eps_gate, [this, endpoint] {
|
||||||
return futurize_apply([this, endpoint] () {
|
return with_semaphore(drain_lock(), 1, [this, endpoint] {
|
||||||
if (utils::fb_utilities::is_me(endpoint)) {
|
return futurize_apply([this, endpoint] () {
|
||||||
return parallel_for_each(_ep_managers, [] (auto& pair) {
|
if (utils::fb_utilities::is_me(endpoint)) {
|
||||||
return pair.second.stop(drain::yes).finally([&pair] {
|
return parallel_for_each(_ep_managers, [] (auto& pair) {
|
||||||
return remove_file(pair.second.hints_dir().c_str());
|
return pair.second.stop(drain::yes).finally([&pair] {
|
||||||
|
return with_file_update_mutex(pair.second, [&pair] {
|
||||||
|
return remove_file(pair.second.hints_dir().c_str());
|
||||||
|
});
|
||||||
|
});
|
||||||
|
}).finally([this] {
|
||||||
|
_ep_managers.clear();
|
||||||
});
|
});
|
||||||
}).finally([this] {
|
} else {
|
||||||
_ep_managers.clear();
|
ep_managers_map_type::iterator ep_manager_it = find_ep_manager(endpoint);
|
||||||
});
|
if (ep_manager_it != ep_managers_end()) {
|
||||||
} else {
|
return ep_manager_it->second.stop(drain::yes).finally([this, endpoint, &ep_man = ep_manager_it->second] {
|
||||||
ep_managers_map_type::iterator ep_manager_it = find_ep_manager(endpoint);
|
return with_file_update_mutex(ep_man, [&ep_man] {
|
||||||
if (ep_manager_it != ep_managers_end()) {
|
return remove_file(ep_man.hints_dir().c_str());
|
||||||
return ep_manager_it->second.stop(drain::yes).finally([this, endpoint, hints_dir = ep_manager_it->second.hints_dir()] {
|
}).finally([this, endpoint] {
|
||||||
_ep_managers.erase(endpoint);
|
_ep_managers.erase(endpoint);
|
||||||
return remove_file(hints_dir.c_str());
|
});
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
return make_ready_future<>();
|
return make_ready_future<>();
|
||||||
}
|
}
|
||||||
}).handle_exception([endpoint] (auto eptr) {
|
}).handle_exception([endpoint] (auto eptr) {
|
||||||
manager_logger.error("Exception when draining {}: {}", endpoint, eptr);
|
manager_logger.error("Exception when draining {}: {}", endpoint, eptr);
|
||||||
|
});
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@@ -543,6 +561,7 @@ manager::end_point_hints_manager::sender::sender(end_point_hints_manager& parent
|
|||||||
, _resource_manager(_shard_manager._resource_manager)
|
, _resource_manager(_shard_manager._resource_manager)
|
||||||
, _proxy(local_storage_proxy)
|
, _proxy(local_storage_proxy)
|
||||||
, _db(local_db)
|
, _db(local_db)
|
||||||
|
, _hints_cpu_sched_group(_db.get_streaming_scheduling_group())
|
||||||
, _gossiper(local_gossiper)
|
, _gossiper(local_gossiper)
|
||||||
, _file_update_mutex(_ep_manager.file_update_mutex())
|
, _file_update_mutex(_ep_manager.file_update_mutex())
|
||||||
{}
|
{}
|
||||||
@@ -555,6 +574,7 @@ manager::end_point_hints_manager::sender::sender(const sender& other, end_point_
|
|||||||
, _resource_manager(_shard_manager._resource_manager)
|
, _resource_manager(_shard_manager._resource_manager)
|
||||||
, _proxy(other._proxy)
|
, _proxy(other._proxy)
|
||||||
, _db(other._db)
|
, _db(other._db)
|
||||||
|
, _hints_cpu_sched_group(other._hints_cpu_sched_group)
|
||||||
, _gossiper(other._gossiper)
|
, _gossiper(other._gossiper)
|
||||||
, _file_update_mutex(_ep_manager.file_update_mutex())
|
, _file_update_mutex(_ep_manager.file_update_mutex())
|
||||||
{}
|
{}
|
||||||
@@ -610,7 +630,10 @@ manager::end_point_hints_manager::sender::clock::duration manager::end_point_hin
|
|||||||
}
|
}
|
||||||
|
|
||||||
void manager::end_point_hints_manager::sender::start() {
|
void manager::end_point_hints_manager::sender::start() {
|
||||||
_stopped = seastar::async([this] {
|
seastar::thread_attributes attr;
|
||||||
|
|
||||||
|
attr.sched_group = _hints_cpu_sched_group;
|
||||||
|
_stopped = seastar::async(std::move(attr), [this] {
|
||||||
manager_logger.trace("ep_manager({})::sender: started", end_point_key());
|
manager_logger.trace("ep_manager({})::sender: started", end_point_key());
|
||||||
while (!stopping()) {
|
while (!stopping()) {
|
||||||
try {
|
try {
|
||||||
@@ -630,10 +653,11 @@ void manager::end_point_hints_manager::sender::start() {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
future<> manager::end_point_hints_manager::sender::send_one_mutation(mutation m) {
|
future<> manager::end_point_hints_manager::sender::send_one_mutation(frozen_mutation_and_schema m) {
|
||||||
keyspace& ks = _db.find_keyspace(m.schema()->ks_name());
|
keyspace& ks = _db.find_keyspace(m.s->ks_name());
|
||||||
auto& rs = ks.get_replication_strategy();
|
auto& rs = ks.get_replication_strategy();
|
||||||
std::vector<gms::inet_address> natural_endpoints = rs.get_natural_endpoints(m.token());
|
auto token = dht::global_partitioner().get_token(*m.s, m.fm.key(*m.s));
|
||||||
|
std::vector<gms::inet_address> natural_endpoints = rs.get_natural_endpoints(std::move(token));
|
||||||
|
|
||||||
return do_send_one_mutation(std::move(m), natural_endpoints);
|
return do_send_one_mutation(std::move(m), natural_endpoints);
|
||||||
}
|
}
|
||||||
@@ -651,8 +675,8 @@ future<> manager::end_point_hints_manager::sender::send_one_hint(lw_shared_ptr<s
|
|||||||
return make_ready_future<>();
|
return make_ready_future<>();
|
||||||
}
|
}
|
||||||
|
|
||||||
mutation m = this->get_mutation(ctx_ptr, buf);
|
auto m = this->get_mutation(ctx_ptr, buf);
|
||||||
gc_clock::duration gc_grace_sec = m.schema()->gc_grace_seconds();
|
gc_clock::duration gc_grace_sec = m.s->gc_grace_seconds();
|
||||||
|
|
||||||
// The hint is too old - drop it.
|
// The hint is too old - drop it.
|
||||||
//
|
//
|
||||||
@@ -673,10 +697,13 @@ future<> manager::end_point_hints_manager::sender::send_one_hint(lw_shared_ptr<s
|
|||||||
// ignore these errors and move on - probably this hint is too old and the KS/CF has been deleted...
|
// ignore these errors and move on - probably this hint is too old and the KS/CF has been deleted...
|
||||||
} catch (no_such_column_family& e) {
|
} catch (no_such_column_family& e) {
|
||||||
manager_logger.debug("send_hints(): no_such_column_family: {}", e.what());
|
manager_logger.debug("send_hints(): no_such_column_family: {}", e.what());
|
||||||
|
++this->shard_stats().discarded;
|
||||||
} catch (no_such_keyspace& e) {
|
} catch (no_such_keyspace& e) {
|
||||||
manager_logger.debug("send_hints(): no_such_keyspace: {}", e.what());
|
manager_logger.debug("send_hints(): no_such_keyspace: {}", e.what());
|
||||||
|
++this->shard_stats().discarded;
|
||||||
} catch (no_column_mapping& e) {
|
} catch (no_column_mapping& e) {
|
||||||
manager_logger.debug("send_hints(): {}: {}", fname, e.what());
|
manager_logger.debug("send_hints(): {} at {}: {}", fname, rp, e.what());
|
||||||
|
++this->shard_stats().discarded;
|
||||||
}
|
}
|
||||||
return make_ready_future<>();
|
return make_ready_future<>();
|
||||||
}).finally([units = std::move(units), ctx_ptr] {});
|
}).finally([units = std::move(units), ctx_ptr] {});
|
||||||
@@ -690,10 +717,10 @@ future<> manager::end_point_hints_manager::sender::send_one_hint(lw_shared_ptr<s
|
|||||||
bool manager::end_point_hints_manager::sender::send_one_file(const sstring& fname) {
|
bool manager::end_point_hints_manager::sender::send_one_file(const sstring& fname) {
|
||||||
timespec last_mod = get_last_file_modification(fname).get0();
|
timespec last_mod = get_last_file_modification(fname).get0();
|
||||||
gc_clock::duration secs_since_file_mod = std::chrono::seconds(last_mod.tv_sec);
|
gc_clock::duration secs_since_file_mod = std::chrono::seconds(last_mod.tv_sec);
|
||||||
lw_shared_ptr<send_one_file_ctx> ctx_ptr = make_lw_shared<send_one_file_ctx>();
|
lw_shared_ptr<send_one_file_ctx> ctx_ptr = make_lw_shared<send_one_file_ctx>(_last_schema_ver_to_column_mapping);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
auto s = commitlog::read_log_file(fname, [this, secs_since_file_mod, &fname, ctx_ptr] (temporary_buffer<char> buf, db::replay_position rp) mutable {
|
auto s = commitlog::read_log_file(fname, service::get_local_streaming_read_priority(), [this, secs_since_file_mod, &fname, ctx_ptr] (temporary_buffer<char> buf, db::replay_position rp) mutable {
|
||||||
// Check that we can still send the next hint. Don't try to send it if the destination host
|
// Check that we can still send the next hint. Don't try to send it if the destination host
|
||||||
// is DOWN or if we have already failed to send some of the previous hints.
|
// is DOWN or if we have already failed to send some of the previous hints.
|
||||||
if (!draining() && ctx_ptr->state.contains(send_state::segment_replay_failed)) {
|
if (!draining() && ctx_ptr->state.contains(send_state::segment_replay_failed)) {
|
||||||
@@ -712,6 +739,10 @@ bool manager::end_point_hints_manager::sender::send_one_file(const sstring& fnam
|
|||||||
}, _last_not_complete_rp.pos, &_db.get_config().extensions()).get0();
|
}, _last_not_complete_rp.pos, &_db.get_config().extensions()).get0();
|
||||||
|
|
||||||
s->done().get();
|
s->done().get();
|
||||||
|
} catch (db::commitlog::segment_error& ex) {
|
||||||
|
manager_logger.error("{}: {}. Dropping...", fname, ex.what());
|
||||||
|
ctx_ptr->state.remove(send_state::segment_replay_failed);
|
||||||
|
++this->shard_stats().corrupted_files;
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
manager_logger.trace("sending of {} failed: {}", fname, std::current_exception());
|
manager_logger.trace("sending of {} failed: {}", fname, std::current_exception());
|
||||||
ctx_ptr->state.set(send_state::segment_replay_failed);
|
ctx_ptr->state.set(send_state::segment_replay_failed);
|
||||||
@@ -747,6 +778,7 @@ bool manager::end_point_hints_manager::sender::send_one_file(const sstring& fnam
|
|||||||
|
|
||||||
// clear the replay position - we are going to send the next segment...
|
// clear the replay position - we are going to send the next segment...
|
||||||
_last_not_complete_rp = replay_position();
|
_last_not_complete_rp = replay_position();
|
||||||
|
_last_schema_ver_to_column_mapping.clear();
|
||||||
manager_logger.trace("send_one_file(): segment {} was sent in full and deleted", fname);
|
manager_logger.trace("send_one_file(): segment {} was sent in full and deleted", fname);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@@ -759,7 +791,7 @@ void manager::end_point_hints_manager::sender::send_hints_maybe() noexcept {
|
|||||||
int replayed_segments_count = 0;
|
int replayed_segments_count = 0;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
while (have_segments()) {
|
while (replay_allowed() && have_segments()) {
|
||||||
if (!send_one_file(*_segments_to_replay.begin())) {
|
if (!send_one_file(*_segments_to_replay.begin())) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@@ -784,14 +816,24 @@ void manager::end_point_hints_manager::sender::send_hints_maybe() noexcept {
|
|||||||
manager_logger.trace("send_hints(): we handled {} segments", replayed_segments_count);
|
manager_logger.trace("send_hints(): we handled {} segments", replayed_segments_count);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<typename Func>
|
||||||
|
static future<> scan_for_hints_dirs(const sstring& hints_directory, Func&& f) {
|
||||||
|
return lister::scan_dir(hints_directory, { directory_entry_type::directory }, [f = std::forward<Func>(f)] (lister::path dir, directory_entry de) {
|
||||||
|
try {
|
||||||
|
return f(std::move(dir), std::move(de), std::stoi(de.name.c_str()));
|
||||||
|
} catch (std::invalid_argument& ex) {
|
||||||
|
manager_logger.debug("Ignore invalid directory {}", de.name);
|
||||||
|
return make_ready_future<>();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
// runs in seastar::async context
|
// runs in seastar::async context
|
||||||
manager::hints_segments_map manager::get_current_hints_segments(const sstring& hints_directory) {
|
manager::hints_segments_map manager::get_current_hints_segments(const sstring& hints_directory) {
|
||||||
hints_segments_map current_hints_segments;
|
hints_segments_map current_hints_segments;
|
||||||
|
|
||||||
// shards level
|
// shards level
|
||||||
lister::scan_dir(hints_directory, { directory_entry_type::directory }, [&current_hints_segments] (lister::path dir, directory_entry de) {
|
scan_for_hints_dirs(hints_directory, [&current_hints_segments] (lister::path dir, directory_entry de, unsigned shard_id) {
|
||||||
unsigned shard_id = std::stoi(de.name.c_str());
|
|
||||||
|
|
||||||
manager_logger.trace("shard_id = {}", shard_id);
|
manager_logger.trace("shard_id = {}", shard_id);
|
||||||
// IPs level
|
// IPs level
|
||||||
return lister::scan_dir(dir / de.name.c_str(), { directory_entry_type::directory }, [&current_hints_segments, shard_id] (lister::path dir, directory_entry de) {
|
return lister::scan_dir(dir / de.name.c_str(), { directory_entry_type::directory }, [&current_hints_segments, shard_id] (lister::path dir, directory_entry de) {
|
||||||
@@ -908,9 +950,7 @@ void manager::rebalance_segments_for(
|
|||||||
// runs in seastar::async context
|
// runs in seastar::async context
|
||||||
void manager::remove_irrelevant_shards_directories(const sstring& hints_directory) {
|
void manager::remove_irrelevant_shards_directories(const sstring& hints_directory) {
|
||||||
// shards level
|
// shards level
|
||||||
lister::scan_dir(hints_directory, { directory_entry_type::directory }, [] (lister::path dir, directory_entry de) {
|
scan_for_hints_dirs(hints_directory, [] (lister::path dir, directory_entry de, unsigned shard_id) {
|
||||||
unsigned shard_id = std::stoi(de.name.c_str());
|
|
||||||
|
|
||||||
if (shard_id >= smp::count) {
|
if (shard_id >= smp::count) {
|
||||||
// IPs level
|
// IPs level
|
||||||
return lister::scan_dir(dir / de.name.c_str(), { directory_entry_type::directory, directory_entry_type::regular }, lister::show_hidden::yes, [] (lister::path dir, directory_entry de) {
|
return lister::scan_dir(dir / de.name.c_str(), { directory_entry_type::directory, directory_entry_type::regular }, lister::show_hidden::yes, [] (lister::path dir, directory_entry de) {
|
||||||
@@ -936,5 +976,13 @@ future<> manager::rebalance(sstring hints_directory) {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void manager::update_backlog(size_t backlog, size_t max_backlog) {
|
||||||
|
if (backlog < max_backlog) {
|
||||||
|
allow_hints();
|
||||||
|
} else {
|
||||||
|
forbid_hints_for_eps_with_pending_hints();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -59,6 +59,8 @@ private:
|
|||||||
uint64_t errors = 0;
|
uint64_t errors = 0;
|
||||||
uint64_t dropped = 0;
|
uint64_t dropped = 0;
|
||||||
uint64_t sent = 0;
|
uint64_t sent = 0;
|
||||||
|
uint64_t discarded = 0;
|
||||||
|
uint64_t corrupted_files = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
// map: shard -> segments
|
// map: shard -> segments
|
||||||
@@ -69,6 +71,8 @@ private:
|
|||||||
class drain_tag {};
|
class drain_tag {};
|
||||||
using drain = seastar::bool_class<drain_tag>;
|
using drain = seastar::bool_class<drain_tag>;
|
||||||
|
|
||||||
|
friend class space_watchdog;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
class end_point_hints_manager {
|
class end_point_hints_manager {
|
||||||
public:
|
public:
|
||||||
@@ -100,7 +104,10 @@ public:
|
|||||||
send_state::restart_segment>>;
|
send_state::restart_segment>>;
|
||||||
|
|
||||||
struct send_one_file_ctx {
|
struct send_one_file_ctx {
|
||||||
std::unordered_map<table_schema_version, column_mapping> schema_ver_to_column_mapping;
|
send_one_file_ctx(std::unordered_map<table_schema_version, column_mapping>& last_schema_ver_to_column_mapping)
|
||||||
|
: schema_ver_to_column_mapping(last_schema_ver_to_column_mapping)
|
||||||
|
{}
|
||||||
|
std::unordered_map<table_schema_version, column_mapping>& schema_ver_to_column_mapping;
|
||||||
seastar::gate file_send_gate;
|
seastar::gate file_send_gate;
|
||||||
std::unordered_set<db::replay_position> rps_set; // number of elements in this set is never going to be greater than the maximum send queue length
|
std::unordered_set<db::replay_position> rps_set; // number of elements in this set is never going to be greater than the maximum send queue length
|
||||||
send_state_set state;
|
send_state_set state;
|
||||||
@@ -109,6 +116,7 @@ public:
|
|||||||
private:
|
private:
|
||||||
std::list<sstring> _segments_to_replay;
|
std::list<sstring> _segments_to_replay;
|
||||||
replay_position _last_not_complete_rp;
|
replay_position _last_not_complete_rp;
|
||||||
|
std::unordered_map<table_schema_version, column_mapping> _last_schema_ver_to_column_mapping;
|
||||||
state_set _state;
|
state_set _state;
|
||||||
future<> _stopped;
|
future<> _stopped;
|
||||||
clock::time_point _next_flush_tp;
|
clock::time_point _next_flush_tp;
|
||||||
@@ -119,6 +127,7 @@ public:
|
|||||||
resource_manager& _resource_manager;
|
resource_manager& _resource_manager;
|
||||||
service::storage_proxy& _proxy;
|
service::storage_proxy& _proxy;
|
||||||
database& _db;
|
database& _db;
|
||||||
|
seastar::scheduling_group _hints_cpu_sched_group;
|
||||||
gms::gossiper& _gossiper;
|
gms::gossiper& _gossiper;
|
||||||
seastar::shared_mutex& _file_update_mutex;
|
seastar::shared_mutex& _file_update_mutex;
|
||||||
|
|
||||||
@@ -179,6 +188,10 @@ public:
|
|||||||
return _state.contains(state::stopping);
|
return _state.contains(state::stopping);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool replay_allowed() const noexcept {
|
||||||
|
return _ep_manager.replay_allowed();
|
||||||
|
}
|
||||||
|
|
||||||
/// \brief Try to send one hint read from the file.
|
/// \brief Try to send one hint read from the file.
|
||||||
/// - Limit the maximum memory size of hints "in the air" and the maximum total number of hints "in the air".
|
/// - Limit the maximum memory size of hints "in the air" and the maximum total number of hints "in the air".
|
||||||
/// - Discard the hints that are older than the grace seconds value of the corresponding table.
|
/// - Discard the hints that are older than the grace seconds value of the corresponding table.
|
||||||
@@ -210,7 +223,7 @@ public:
|
|||||||
/// \param ctx_ptr pointer to the send context
|
/// \param ctx_ptr pointer to the send context
|
||||||
/// \param buf hints file entry
|
/// \param buf hints file entry
|
||||||
/// \return The mutation object representing the original mutation stored in the hints file.
|
/// \return The mutation object representing the original mutation stored in the hints file.
|
||||||
mutation get_mutation(lw_shared_ptr<send_one_file_ctx> ctx_ptr, temporary_buffer<char>& buf);
|
frozen_mutation_and_schema get_mutation(lw_shared_ptr<send_one_file_ctx> ctx_ptr, temporary_buffer<char>& buf);
|
||||||
|
|
||||||
/// \brief Get a reference to the column_mapping object for a given frozen mutation.
|
/// \brief Get a reference to the column_mapping object for a given frozen mutation.
|
||||||
/// \param ctx_ptr pointer to the send context
|
/// \param ctx_ptr pointer to the send context
|
||||||
@@ -227,13 +240,13 @@ public:
|
|||||||
/// \param m mutation to send
|
/// \param m mutation to send
|
||||||
/// \param natural_endpoints current replicas for the given mutation
|
/// \param natural_endpoints current replicas for the given mutation
|
||||||
/// \return future that resolves when the operation is complete
|
/// \return future that resolves when the operation is complete
|
||||||
future<> do_send_one_mutation(mutation m, const std::vector<gms::inet_address>& natural_endpoints) noexcept;
|
future<> do_send_one_mutation(frozen_mutation_and_schema m, const std::vector<gms::inet_address>& natural_endpoints) noexcept;
|
||||||
|
|
||||||
/// \brief Send one mutation out.
|
/// \brief Send one mutation out.
|
||||||
///
|
///
|
||||||
/// \param m mutation to send
|
/// \param m mutation to send
|
||||||
/// \return future that resolves when the mutation sending processing is complete.
|
/// \return future that resolves when the mutation sending processing is complete.
|
||||||
future<> send_one_mutation(mutation m);
|
future<> send_one_mutation(frozen_mutation_and_schema m);
|
||||||
|
|
||||||
/// \brief Get the last modification time stamp for a given file.
|
/// \brief Get the last modification time stamp for a given file.
|
||||||
/// \param fname File name
|
/// \param fname File name
|
||||||
@@ -262,7 +275,8 @@ public:
|
|||||||
manager& _shard_manager;
|
manager& _shard_manager;
|
||||||
hints_store_ptr _hints_store_anchor;
|
hints_store_ptr _hints_store_anchor;
|
||||||
seastar::gate _store_gate;
|
seastar::gate _store_gate;
|
||||||
seastar::shared_mutex _file_update_mutex;
|
lw_shared_ptr<seastar::shared_mutex> _file_update_mutex_ptr;
|
||||||
|
seastar::shared_mutex& _file_update_mutex;
|
||||||
|
|
||||||
enum class state {
|
enum class state {
|
||||||
can_hint, // hinting is currently allowed (used by the space_watchdog)
|
can_hint, // hinting is currently allowed (used by the space_watchdog)
|
||||||
@@ -328,6 +342,10 @@ public:
|
|||||||
return _hints_in_progress;
|
return _hints_in_progress;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool replay_allowed() const noexcept {
|
||||||
|
return _shard_manager.replay_allowed();
|
||||||
|
}
|
||||||
|
|
||||||
bool can_hint() const noexcept {
|
bool can_hint() const noexcept {
|
||||||
return _state.contains(state::can_hint);
|
return _state.contains(state::can_hint);
|
||||||
}
|
}
|
||||||
@@ -360,8 +378,20 @@ public:
|
|||||||
return _state.contains(state::stopped);
|
return _state.contains(state::stopped);
|
||||||
}
|
}
|
||||||
|
|
||||||
seastar::shared_mutex& file_update_mutex() {
|
/// \brief Safely runs a given functor under the file_update_mutex of \ref ep_man
|
||||||
return _file_update_mutex;
|
///
|
||||||
|
/// Runs a given functor under the file_update_mutex of the given end_point_hints_manager instance.
|
||||||
|
/// This function is safe even if \ref ep_man gets destroyed before the future this function returns resolves
|
||||||
|
/// (as long as the \ref func call itself is safe).
|
||||||
|
///
|
||||||
|
/// \tparam Func Functor type.
|
||||||
|
/// \param ep_man end_point_hints_manager instance which file_update_mutex we want to lock.
|
||||||
|
/// \param func Functor to run under the lock.
|
||||||
|
/// \return Whatever \ref func returns.
|
||||||
|
template <typename Func>
|
||||||
|
friend inline auto with_file_update_mutex(end_point_hints_manager& ep_man, Func&& func) {
|
||||||
|
lw_shared_ptr<seastar::shared_mutex> lock_ptr = ep_man._file_update_mutex_ptr;
|
||||||
|
return with_lock(*lock_ptr, std::forward<Func>(func)).finally([lock_ptr] {});
|
||||||
}
|
}
|
||||||
|
|
||||||
const boost::filesystem::path& hints_dir() const noexcept {
|
const boost::filesystem::path& hints_dir() const noexcept {
|
||||||
@@ -369,6 +399,10 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
seastar::shared_mutex& file_update_mutex() noexcept {
|
||||||
|
return _file_update_mutex;
|
||||||
|
}
|
||||||
|
|
||||||
/// \brief Creates a new hints store object.
|
/// \brief Creates a new hints store object.
|
||||||
///
|
///
|
||||||
/// - Creates a hints store directory if doesn't exist: <shard_hints_dir>/<ep_key>
|
/// - Creates a hints store directory if doesn't exist: <shard_hints_dir>/<ep_key>
|
||||||
@@ -393,6 +427,17 @@ public:
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
enum class state {
|
||||||
|
started, // hinting is currently allowed (start() call is complete)
|
||||||
|
replay_allowed, // replaying (hints sending) is allowed
|
||||||
|
stopping // hinting is not allowed - stopping is in progress (stop() method has been called)
|
||||||
|
};
|
||||||
|
|
||||||
|
using state_set = enum_set<super_enum<state,
|
||||||
|
state::started,
|
||||||
|
state::replay_allowed,
|
||||||
|
state::stopping>>;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
using ep_key_type = typename end_point_hints_manager::key_type;
|
using ep_key_type = typename end_point_hints_manager::key_type;
|
||||||
using ep_managers_map_type = std::unordered_map<ep_key_type, end_point_hints_manager>;
|
using ep_managers_map_type = std::unordered_map<ep_key_type, end_point_hints_manager>;
|
||||||
@@ -403,6 +448,7 @@ public:
|
|||||||
static const std::chrono::seconds hint_file_write_timeout;
|
static const std::chrono::seconds hint_file_write_timeout;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
state_set _state;
|
||||||
const boost::filesystem::path _hints_dir;
|
const boost::filesystem::path _hints_dir;
|
||||||
dev_t _hints_dir_device_id = 0;
|
dev_t _hints_dir_device_id = 0;
|
||||||
|
|
||||||
@@ -414,7 +460,7 @@ private:
|
|||||||
locator::snitch_ptr& _local_snitch_ptr;
|
locator::snitch_ptr& _local_snitch_ptr;
|
||||||
int64_t _max_hint_window_us = 0;
|
int64_t _max_hint_window_us = 0;
|
||||||
database& _local_db;
|
database& _local_db;
|
||||||
bool _stopping = false;
|
|
||||||
seastar::gate _draining_eps_gate; // gate used to control the progress of ep_managers stopping not in the context of manager::stop() call
|
seastar::gate _draining_eps_gate; // gate used to control the progress of ep_managers stopping not in the context of manager::stop() call
|
||||||
|
|
||||||
resource_manager& _resource_manager;
|
resource_manager& _resource_manager;
|
||||||
@@ -423,10 +469,13 @@ private:
|
|||||||
stats _stats;
|
stats _stats;
|
||||||
seastar::metrics::metric_groups _metrics;
|
seastar::metrics::metric_groups _metrics;
|
||||||
std::unordered_set<ep_key_type> _eps_with_pending_hints;
|
std::unordered_set<ep_key_type> _eps_with_pending_hints;
|
||||||
|
seastar::semaphore _drain_lock = {1};
|
||||||
|
|
||||||
public:
|
public:
|
||||||
manager(sstring hints_directory, std::vector<sstring> hinted_dcs, int64_t max_hint_window_ms, resource_manager&res_manager, distributed<database>& db);
|
manager(sstring hints_directory, std::vector<sstring> hinted_dcs, int64_t max_hint_window_ms, resource_manager&res_manager, distributed<database>& db);
|
||||||
virtual ~manager();
|
virtual ~manager();
|
||||||
|
manager(manager&&) = delete;
|
||||||
|
manager& operator=(manager&&) = delete;
|
||||||
void register_metrics(const sstring& group_name);
|
void register_metrics(const sstring& group_name);
|
||||||
future<> start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr<gms::gossiper> gossiper_ptr, shared_ptr<service::storage_service> ss_ptr);
|
future<> start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr<gms::gossiper> gossiper_ptr, shared_ptr<service::storage_service> ss_ptr);
|
||||||
future<> stop();
|
future<> stop();
|
||||||
@@ -499,10 +548,18 @@ public:
|
|||||||
return _hints_dir_device_id;
|
return _hints_dir_device_id;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
seastar::semaphore& drain_lock() noexcept {
|
||||||
|
return _drain_lock;
|
||||||
|
}
|
||||||
|
|
||||||
void allow_hints();
|
void allow_hints();
|
||||||
void forbid_hints();
|
void forbid_hints();
|
||||||
void forbid_hints_for_eps_with_pending_hints();
|
void forbid_hints_for_eps_with_pending_hints();
|
||||||
|
|
||||||
|
void allow_replaying() noexcept {
|
||||||
|
_state.set(state::replay_allowed);
|
||||||
|
}
|
||||||
|
|
||||||
/// \brief Rebalance hints segments among all present shards.
|
/// \brief Rebalance hints segments among all present shards.
|
||||||
///
|
///
|
||||||
/// The difference between the number of segments on every two shard will be not greater than 1 after the
|
/// The difference between the number of segments on every two shard will be not greater than 1 after the
|
||||||
@@ -616,6 +673,28 @@ private:
|
|||||||
/// \param endpoint node that left the cluster
|
/// \param endpoint node that left the cluster
|
||||||
void drain_for(gms::inet_address endpoint);
|
void drain_for(gms::inet_address endpoint);
|
||||||
|
|
||||||
|
void update_backlog(size_t backlog, size_t max_backlog);
|
||||||
|
|
||||||
|
bool stopping() const noexcept {
|
||||||
|
return _state.contains(state::stopping);
|
||||||
|
}
|
||||||
|
|
||||||
|
void set_stopping() noexcept {
|
||||||
|
_state.set(state::stopping);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool started() const noexcept {
|
||||||
|
return _state.contains(state::started);
|
||||||
|
}
|
||||||
|
|
||||||
|
void set_started() noexcept {
|
||||||
|
_state.set(state::started);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool replay_allowed() const noexcept {
|
||||||
|
return _state.contains(state::replay_allowed);
|
||||||
|
}
|
||||||
|
|
||||||
public:
|
public:
|
||||||
ep_managers_map_type::iterator find_ep_manager(ep_key_type ep_key) noexcept {
|
ep_managers_map_type::iterator find_ep_manager(ep_key_type ep_key) noexcept {
|
||||||
return _ep_managers.find(ep_key);
|
return _ep_managers.find(ep_key);
|
||||||
|
|||||||
@@ -27,6 +27,7 @@
|
|||||||
#include "lister.hh"
|
#include "lister.hh"
|
||||||
#include "disk-error-handler.hh"
|
#include "disk-error-handler.hh"
|
||||||
#include "seastarx.hh"
|
#include "seastarx.hh"
|
||||||
|
#include <seastar/core/sleep.hh>
|
||||||
|
|
||||||
namespace db {
|
namespace db {
|
||||||
namespace hints {
|
namespace hints {
|
||||||
@@ -65,112 +66,111 @@ const std::chrono::seconds space_watchdog::_watchdog_period = std::chrono::secon
|
|||||||
space_watchdog::space_watchdog(shard_managers_set& managers, per_device_limits_map& per_device_limits_map)
|
space_watchdog::space_watchdog(shard_managers_set& managers, per_device_limits_map& per_device_limits_map)
|
||||||
: _shard_managers(managers)
|
: _shard_managers(managers)
|
||||||
, _per_device_limits_map(per_device_limits_map)
|
, _per_device_limits_map(per_device_limits_map)
|
||||||
, _timer([this] { on_timer(); })
|
|
||||||
{}
|
{}
|
||||||
|
|
||||||
void space_watchdog::start() {
|
void space_watchdog::start() {
|
||||||
_timer.arm(timer_clock_type::now());
|
_started = seastar::async([this] {
|
||||||
|
while (!_as.abort_requested()) {
|
||||||
|
try {
|
||||||
|
on_timer();
|
||||||
|
} catch (...) {
|
||||||
|
resource_manager_logger.trace("space_watchdog: unexpected exception - stop all hints generators");
|
||||||
|
// Stop all hint generators if space_watchdog callback failed
|
||||||
|
for (manager& shard_manager : _shard_managers) {
|
||||||
|
shard_manager.forbid_hints();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
seastar::sleep_abortable(_watchdog_period, _as).get();
|
||||||
|
}
|
||||||
|
}).handle_exception_type([] (const seastar::sleep_aborted& ignored) { });
|
||||||
}
|
}
|
||||||
|
|
||||||
future<> space_watchdog::stop() noexcept {
|
future<> space_watchdog::stop() noexcept {
|
||||||
try {
|
_as.request_abort();
|
||||||
return _gate.close().finally([this] { _timer.cancel(); });
|
return std::move(_started);
|
||||||
} catch (...) {
|
|
||||||
return make_exception_future<>(std::current_exception());
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Called under the end_point_hints_manager::file_update_mutex() of the corresponding end_point_hints_manager instance.
|
||||||
future<> space_watchdog::scan_one_ep_dir(boost::filesystem::path path, manager& shard_manager, ep_key_type ep_key) {
|
future<> space_watchdog::scan_one_ep_dir(boost::filesystem::path path, manager& shard_manager, ep_key_type ep_key) {
|
||||||
return lister::scan_dir(path, { directory_entry_type::regular }, [this, ep_key, &shard_manager] (lister::path dir, directory_entry de) {
|
return do_with(std::move(path), [this, ep_key, &shard_manager] (boost::filesystem::path& path) {
|
||||||
// Put the current end point ID to state.eps_with_pending_hints when we see the second hints file in its directory
|
// It may happen that we get here and the directory has already been deleted in the context of manager::drain_for().
|
||||||
if (_files_count == 1) {
|
// In this case simply bail out.
|
||||||
shard_manager.add_ep_with_pending_hints(ep_key);
|
return engine().file_exists(path.native()).then([this, ep_key, &shard_manager, &path] (bool exists) {
|
||||||
}
|
if (!exists) {
|
||||||
++_files_count;
|
return make_ready_future<>();
|
||||||
|
} else {
|
||||||
|
return lister::scan_dir(path, { directory_entry_type::regular }, [this, ep_key, &shard_manager] (lister::path dir, directory_entry de) {
|
||||||
|
// Put the current end point ID to state.eps_with_pending_hints when we see the second hints file in its directory
|
||||||
|
if (_files_count == 1) {
|
||||||
|
shard_manager.add_ep_with_pending_hints(ep_key);
|
||||||
|
}
|
||||||
|
++_files_count;
|
||||||
|
|
||||||
return io_check(file_size, (dir / de.name.c_str()).c_str()).then([this] (uint64_t fsize) {
|
return io_check(file_size, (dir / de.name.c_str()).c_str()).then([this] (uint64_t fsize) {
|
||||||
_total_size += fsize;
|
_total_size += fsize;
|
||||||
|
});
|
||||||
|
});
|
||||||
|
}
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Called from the context of a seastar::thread.
|
||||||
void space_watchdog::on_timer() {
|
void space_watchdog::on_timer() {
|
||||||
with_gate(_gate, [this] {
|
// The hints directories are organized as follows:
|
||||||
return futurize_apply([this] {
|
// <hints root>
|
||||||
_total_size = 0;
|
// |- <shard1 ID>
|
||||||
|
// | |- <EP1 address>
|
||||||
|
// | |- <hints file1>
|
||||||
|
// | |- <hints file2>
|
||||||
|
// | |- ...
|
||||||
|
// | |- <EP2 address>
|
||||||
|
// | |- ...
|
||||||
|
// | |-...
|
||||||
|
// |- <shard2 ID>
|
||||||
|
// | |- ...
|
||||||
|
// ...
|
||||||
|
// |- <shardN ID>
|
||||||
|
// | |- ...
|
||||||
|
//
|
||||||
|
|
||||||
return do_for_each(_shard_managers, [this] (manager& shard_manager) {
|
for (auto& per_device_limits : _per_device_limits_map | boost::adaptors::map_values) {
|
||||||
shard_manager.clear_eps_with_pending_hints();
|
_total_size = 0;
|
||||||
|
for (manager& shard_manager : per_device_limits.managers) {
|
||||||
// The hints directories are organized as follows:
|
shard_manager.clear_eps_with_pending_hints();
|
||||||
// <hints root>
|
lister::scan_dir(shard_manager.hints_dir(), {directory_entry_type::directory}, [this, &shard_manager] (lister::path dir, directory_entry de) {
|
||||||
// |- <shard1 ID>
|
_files_count = 0;
|
||||||
// | |- <EP1 address>
|
// Let's scan per-end-point directories and enumerate hints files...
|
||||||
// | |- <hints file1>
|
|
||||||
// | |- <hints file2>
|
|
||||||
// | |- ...
|
|
||||||
// | |- <EP2 address>
|
|
||||||
// | |- ...
|
|
||||||
// | |-...
|
|
||||||
// |- <shard2 ID>
|
|
||||||
// | |- ...
|
|
||||||
// ...
|
|
||||||
// |- <shardN ID>
|
|
||||||
// | |- ...
|
|
||||||
//
|
//
|
||||||
return lister::scan_dir(shard_manager.hints_dir(), {directory_entry_type::directory}, [this, &shard_manager] (lister::path dir, directory_entry de) {
|
// Let's check if there is a corresponding end point manager (may not exist if the corresponding DC is
|
||||||
_files_count = 0;
|
// not hintable).
|
||||||
// Let's scan per-end-point directories and enumerate hints files...
|
// If exists - let's take a file update lock so that files are not changed under our feet. Otherwise, simply
|
||||||
//
|
// continue to enumeration - there is no one to change them.
|
||||||
// Let's check if there is a corresponding end point manager (may not exist if the corresponding DC is
|
auto it = shard_manager.find_ep_manager(de.name);
|
||||||
// not hintable).
|
if (it != shard_manager.ep_managers_end()) {
|
||||||
// If exists - let's take a file update lock so that files are not changed under our feet. Otherwise, simply
|
return with_file_update_mutex(it->second, [this, &shard_manager, dir = std::move(dir), ep_name = std::move(de.name)] () mutable {
|
||||||
// continue to enumeration - there is no one to change them.
|
return scan_one_ep_dir(dir / ep_name.c_str(), shard_manager, ep_key_type(ep_name));
|
||||||
auto it = shard_manager.find_ep_manager(de.name);
|
|
||||||
if (it != shard_manager.ep_managers_end()) {
|
|
||||||
return with_lock(it->second.file_update_mutex(), [this, &shard_manager, dir = std::move(dir), ep_name = std::move(de.name)]() mutable {
|
|
||||||
return scan_one_ep_dir(dir / ep_name.c_str(), shard_manager, ep_key_type(ep_name));
|
|
||||||
});
|
|
||||||
} else {
|
|
||||||
return scan_one_ep_dir(dir / de.name.c_str(), shard_manager, ep_key_type(de.name));
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}).then([this] {
|
|
||||||
return do_for_each(_per_device_limits_map, [this](per_device_limits_map::value_type& per_device_limits_entry) {
|
|
||||||
space_watchdog::per_device_limits& per_device_limits = per_device_limits_entry.second;
|
|
||||||
|
|
||||||
size_t adjusted_quota = 0;
|
|
||||||
size_t delta = boost::accumulate(per_device_limits.managers, 0, [] (size_t sum, manager& shard_manager) {
|
|
||||||
return sum + shard_manager.ep_managers_size() * resource_manager::hint_segment_size_in_mb * 1024 * 1024;
|
|
||||||
});
|
});
|
||||||
if (per_device_limits.max_shard_disk_space_size > delta) {
|
} else {
|
||||||
adjusted_quota = per_device_limits.max_shard_disk_space_size - delta;
|
return scan_one_ep_dir(dir / de.name.c_str(), shard_manager, ep_key_type(de.name));
|
||||||
}
|
}
|
||||||
|
}).get();
|
||||||
|
}
|
||||||
|
|
||||||
bool can_hint = _total_size < adjusted_quota;
|
// Adjust the quota to take into account the space we guarantee to every end point manager
|
||||||
resource_manager_logger.trace("space_watchdog: total_size ({}) {} max_shard_disk_space_size ({})", _total_size, can_hint ? "<" : ">=", adjusted_quota);
|
size_t adjusted_quota = 0;
|
||||||
|
size_t delta = boost::accumulate(per_device_limits.managers, 0, [] (size_t sum, manager& shard_manager) {
|
||||||
if (!can_hint) {
|
return sum + shard_manager.ep_managers_size() * resource_manager::hint_segment_size_in_mb * 1024 * 1024;
|
||||||
for (manager& shard_manager : per_device_limits.managers) {
|
|
||||||
shard_manager.forbid_hints_for_eps_with_pending_hints();
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
for (manager& shard_manager : per_device_limits.managers) {
|
|
||||||
shard_manager.allow_hints();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
|
||||||
});
|
|
||||||
}).handle_exception([this] (auto eptr) {
|
|
||||||
resource_manager_logger.trace("space_watchdog: unexpected exception - stop all hints generators");
|
|
||||||
// Stop all hint generators if space_watchdog callback failed
|
|
||||||
for (manager& shard_manager : _shard_managers) {
|
|
||||||
shard_manager.forbid_hints();
|
|
||||||
}
|
|
||||||
}).finally([this] {
|
|
||||||
_timer.arm(_watchdog_period);
|
|
||||||
});
|
});
|
||||||
});
|
if (per_device_limits.max_shard_disk_space_size > delta) {
|
||||||
|
adjusted_quota = per_device_limits.max_shard_disk_space_size - delta;
|
||||||
|
}
|
||||||
|
|
||||||
|
resource_manager_logger.trace("space_watchdog: consuming {}/{} bytes", _total_size, adjusted_quota);
|
||||||
|
for (manager& shard_manager : per_device_limits.managers) {
|
||||||
|
shard_manager.update_backlog(_total_size, adjusted_quota);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
future<> resource_manager::start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr<gms::gossiper> gossiper_ptr, shared_ptr<service::storage_service> ss_ptr) {
|
future<> resource_manager::start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr<gms::gossiper> gossiper_ptr, shared_ptr<service::storage_service> ss_ptr) {
|
||||||
@@ -183,6 +183,10 @@ future<> resource_manager::start(shared_ptr<service::storage_proxy> proxy_ptr, s
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void resource_manager::allow_replaying() noexcept {
|
||||||
|
boost::for_each(_shard_managers, [] (manager& m) { m.allow_replaying(); });
|
||||||
|
}
|
||||||
|
|
||||||
future<> resource_manager::stop() noexcept {
|
future<> resource_manager::stop() noexcept {
|
||||||
return parallel_for_each(_shard_managers, [](manager& m) {
|
return parallel_for_each(_shard_managers, [](manager& m) {
|
||||||
return m.stop();
|
return m.stop();
|
||||||
@@ -201,14 +205,18 @@ future<> resource_manager::prepare_per_device_limits() {
|
|||||||
auto it = _per_device_limits_map.find(device_id);
|
auto it = _per_device_limits_map.find(device_id);
|
||||||
if (it == _per_device_limits_map.end()) {
|
if (it == _per_device_limits_map.end()) {
|
||||||
return is_mountpoint(shard_manager.hints_dir().parent_path()).then([this, device_id, &shard_manager](bool is_mountpoint) {
|
return is_mountpoint(shard_manager.hints_dir().parent_path()).then([this, device_id, &shard_manager](bool is_mountpoint) {
|
||||||
// By default, give each group of managers 10% of the available disk space. Give each shard an equal share of the available space.
|
auto [it, inserted] = _per_device_limits_map.emplace(device_id, space_watchdog::per_device_limits{});
|
||||||
size_t max_size = boost::filesystem::space(shard_manager.hints_dir().c_str()).capacity / (10 * smp::count);
|
// Since we possibly deferred, we need to recheck the _per_device_limits_map.
|
||||||
// If hints directory is a mountpoint, we assume it's on dedicated (i.e. not shared with data/commitlog/etc) storage.
|
if (inserted) {
|
||||||
// Then, reserve 90% of all space instead of 10% above.
|
// By default, give each group of managers 10% of the available disk space. Give each shard an equal share of the available space.
|
||||||
if (is_mountpoint) {
|
it->second.max_shard_disk_space_size = boost::filesystem::space(shard_manager.hints_dir().c_str()).capacity / (10 * smp::count);
|
||||||
max_size *= 9;
|
// If hints directory is a mountpoint, we assume it's on dedicated (i.e. not shared with data/commitlog/etc) storage.
|
||||||
|
// Then, reserve 90% of all space instead of 10% above.
|
||||||
|
if (is_mountpoint) {
|
||||||
|
it->second.max_shard_disk_space_size *= 9;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
_per_device_limits_map.emplace(device_id, space_watchdog::per_device_limits{{std::ref(shard_manager)}, max_size});
|
it->second.managers.emplace_back(std::ref(shard_manager));
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
it->second.managers.emplace_back(std::ref(shard_manager));
|
it->second.managers.emplace_back(std::ref(shard_manager));
|
||||||
|
|||||||
@@ -22,6 +22,7 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
|
#include <seastar/core/abort_source.hh>
|
||||||
#include <seastar/core/semaphore.hh>
|
#include <seastar/core/semaphore.hh>
|
||||||
#include <seastar/core/gate.hh>
|
#include <seastar/core/gate.hh>
|
||||||
#include <seastar/core/memory.hh>
|
#include <seastar/core/memory.hh>
|
||||||
@@ -78,8 +79,8 @@ private:
|
|||||||
shard_managers_set& _shard_managers;
|
shard_managers_set& _shard_managers;
|
||||||
per_device_limits_map& _per_device_limits_map;
|
per_device_limits_map& _per_device_limits_map;
|
||||||
|
|
||||||
seastar::gate _gate;
|
future<> _started = make_ready_future<>();
|
||||||
seastar::timer<timer_clock_type> _timer;
|
seastar::abort_source _as;
|
||||||
int _files_count = 0;
|
int _files_count = 0;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
@@ -137,6 +138,9 @@ public:
|
|||||||
, _space_watchdog(_shard_managers, _per_device_limits_map)
|
, _space_watchdog(_shard_managers, _per_device_limits_map)
|
||||||
{}
|
{}
|
||||||
|
|
||||||
|
resource_manager(resource_manager&&) = delete;
|
||||||
|
resource_manager& operator=(resource_manager&&) = delete;
|
||||||
|
|
||||||
future<semaphore_units<semaphore_default_exception_factory>> get_send_units_for(size_t buf_size);
|
future<semaphore_units<semaphore_default_exception_factory>> get_send_units_for(size_t buf_size);
|
||||||
|
|
||||||
bool too_many_hints_in_progress() const {
|
bool too_many_hints_in_progress() const {
|
||||||
@@ -156,6 +160,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
future<> start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr<gms::gossiper> gossiper_ptr, shared_ptr<service::storage_service> ss_ptr);
|
future<> start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr<gms::gossiper> gossiper_ptr, shared_ptr<service::storage_service> ss_ptr);
|
||||||
|
void allow_replaying() noexcept;
|
||||||
future<> stop() noexcept;
|
future<> stop() noexcept;
|
||||||
void register_manager(manager& m);
|
void register_manager(manager& m);
|
||||||
future<> prepare_per_device_limits();
|
future<> prepare_per_device_limits();
|
||||||
|
|||||||
@@ -598,7 +598,7 @@ public:
|
|||||||
|
|
||||||
future<> flush_schemas() {
|
future<> flush_schemas() {
|
||||||
return _qp.proxy().get_db().invoke_on_all([this] (database& db) {
|
return _qp.proxy().get_db().invoke_on_all([this] (database& db) {
|
||||||
return parallel_for_each(db::schema_tables::ALL, [this, &db](const sstring& cf_name) {
|
return parallel_for_each(db::schema_tables::all_table_names(), [this, &db](const sstring& cf_name) {
|
||||||
auto& cf = db.find_column_family(db::schema_tables::NAME, cf_name);
|
auto& cf = db.find_column_family(db::schema_tables::NAME, cf_name);
|
||||||
return cf.flush();
|
return cf.flush();
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -143,10 +143,10 @@ struct qualified_name {
|
|||||||
static future<schema_mutations> read_table_mutations(distributed<service::storage_proxy>& proxy, const qualified_name& table, schema_ptr s);
|
static future<schema_mutations> read_table_mutations(distributed<service::storage_proxy>& proxy, const qualified_name& table, schema_ptr s);
|
||||||
|
|
||||||
static void merge_tables_and_views(distributed<service::storage_proxy>& proxy,
|
static void merge_tables_and_views(distributed<service::storage_proxy>& proxy,
|
||||||
std::map<qualified_name, schema_mutations>&& tables_before,
|
std::map<utils::UUID, schema_mutations>&& tables_before,
|
||||||
std::map<qualified_name, schema_mutations>&& tables_after,
|
std::map<utils::UUID, schema_mutations>&& tables_after,
|
||||||
std::map<qualified_name, schema_mutations>&& views_before,
|
std::map<utils::UUID, schema_mutations>&& views_before,
|
||||||
std::map<qualified_name, schema_mutations>&& views_after);
|
std::map<utils::UUID, schema_mutations>&& views_after);
|
||||||
|
|
||||||
struct user_types_to_drop final {
|
struct user_types_to_drop final {
|
||||||
seastar::noncopyable_function<void()> drop;
|
seastar::noncopyable_function<void()> drop;
|
||||||
@@ -194,8 +194,6 @@ static void prepare_builder_from_table_row(const schema_ctxt&, schema_builder&,
|
|||||||
|
|
||||||
using namespace v3;
|
using namespace v3;
|
||||||
|
|
||||||
std::vector<const char*> ALL { KEYSPACES, TABLES, SCYLLA_TABLES, COLUMNS, DROPPED_COLUMNS, TRIGGERS, VIEWS, TYPES, FUNCTIONS, AGGREGATES, INDEXES };
|
|
||||||
|
|
||||||
using days = std::chrono::duration<int, std::ratio<24 * 3600>>;
|
using days = std::chrono::duration<int, std::ratio<24 * 3600>>;
|
||||||
|
|
||||||
future<> save_system_schema(const sstring & ksname) {
|
future<> save_system_schema(const sstring & ksname) {
|
||||||
@@ -203,7 +201,7 @@ future<> save_system_schema(const sstring & ksname) {
|
|||||||
auto ksm = ks.metadata();
|
auto ksm = ks.metadata();
|
||||||
|
|
||||||
// delete old, possibly obsolete entries in schema tables
|
// delete old, possibly obsolete entries in schema tables
|
||||||
return parallel_for_each(ALL, [ksm] (sstring cf) {
|
return parallel_for_each(all_table_names(), [ksm] (sstring cf) {
|
||||||
auto deletion_timestamp = schema_creation_timestamp() - 1;
|
auto deletion_timestamp = schema_creation_timestamp() - 1;
|
||||||
return db::execute_cql(sprint("DELETE FROM %s.%s USING TIMESTAMP %s WHERE keyspace_name = ?", NAME, cf,
|
return db::execute_cql(sprint("DELETE FROM %s.%s USING TIMESTAMP %s WHERE keyspace_name = ?", NAME, cf,
|
||||||
deletion_timestamp), ksm->name()).discard_result();
|
deletion_timestamp), ksm->name()).discard_result();
|
||||||
@@ -598,7 +596,7 @@ future<utils::UUID> calculate_schema_digest(distributed<service::storage_proxy>&
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
return do_with(md5_hasher(), [map, reduce] (auto& hash) {
|
return do_with(md5_hasher(), [map, reduce] (auto& hash) {
|
||||||
return do_for_each(ALL.begin(), ALL.end(), [&hash, map, reduce] (auto& table) {
|
return do_for_each(all_table_names(), [&hash, map, reduce] (auto& table) {
|
||||||
return map(table).then([&hash, reduce] (auto&& mutations) {
|
return map(table).then([&hash, reduce] (auto&& mutations) {
|
||||||
reduce(hash, mutations);
|
reduce(hash, mutations);
|
||||||
});
|
});
|
||||||
@@ -629,7 +627,7 @@ future<std::vector<frozen_mutation>> convert_schema_to_mutations(distributed<ser
|
|||||||
std::move(mutations.begin(), mutations.end(), std::back_inserter(result));
|
std::move(mutations.begin(), mutations.end(), std::back_inserter(result));
|
||||||
return std::move(result);
|
return std::move(result);
|
||||||
};
|
};
|
||||||
return map_reduce(ALL.begin(), ALL.end(), map, std::vector<frozen_mutation>{}, reduce);
|
return map_reduce(all_table_names(), map, std::vector<frozen_mutation>{}, reduce);
|
||||||
}
|
}
|
||||||
|
|
||||||
future<schema_result>
|
future<schema_result>
|
||||||
@@ -703,33 +701,7 @@ read_keyspace_mutation(distributed<service::storage_proxy>& proxy, const sstring
|
|||||||
static semaphore the_merge_lock {1};
|
static semaphore the_merge_lock {1};
|
||||||
|
|
||||||
future<> merge_lock() {
|
future<> merge_lock() {
|
||||||
// ref: #1088
|
return smp::submit_to(0, [] { return the_merge_lock.wait(); });
|
||||||
// to avoid deadlocks, we don't want long-standing calls to the shard 0
|
|
||||||
// as they can cause a deadlock:
|
|
||||||
//
|
|
||||||
// fiber1 fiber2
|
|
||||||
// merge_lock() (succeeds)
|
|
||||||
// merge_lock() (waits)
|
|
||||||
// invoke_on_all() (waits on merge_lock to relinquish smp::submit_to slot)
|
|
||||||
//
|
|
||||||
// so we issue the lock calls with a timeout; the slot will be relinquished, and invoke_on_all()
|
|
||||||
// can complete
|
|
||||||
return repeat([] () mutable {
|
|
||||||
return smp::submit_to(0, [] {
|
|
||||||
return the_merge_lock.try_wait();
|
|
||||||
}).then([] (bool result) {
|
|
||||||
if (result) {
|
|
||||||
return make_ready_future<stop_iteration>(stop_iteration::yes);
|
|
||||||
} else {
|
|
||||||
static thread_local auto rand_engine = std::default_random_engine();
|
|
||||||
auto dist = std::uniform_int_distribution<int>(0, 100);
|
|
||||||
auto to = std::chrono::microseconds(dist(rand_engine));
|
|
||||||
return sleep(to).then([] {
|
|
||||||
return make_ready_future<stop_iteration>(stop_iteration::no);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
});
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
future<> merge_unlock() {
|
future<> merge_unlock() {
|
||||||
@@ -777,16 +749,24 @@ static read_table_names_of_keyspace(distributed<service::storage_proxy>& proxy,
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static utils::UUID table_id_from_mutations(const schema_mutations& sm) {
|
||||||
|
auto table_rs = query::result_set(sm.columnfamilies_mutation());
|
||||||
|
query::result_set_row table_row = table_rs.row(0);
|
||||||
|
return table_row.get_nonnull<utils::UUID>("id");
|
||||||
|
}
|
||||||
|
|
||||||
// Call inside a seastar thread
|
// Call inside a seastar thread
|
||||||
static
|
static
|
||||||
std::map<qualified_name, schema_mutations>
|
std::map<utils::UUID, schema_mutations>
|
||||||
read_tables_for_keyspaces(distributed<service::storage_proxy>& proxy, const std::set<sstring>& keyspace_names, schema_ptr s)
|
read_tables_for_keyspaces(distributed<service::storage_proxy>& proxy, const std::set<sstring>& keyspace_names, schema_ptr s)
|
||||||
{
|
{
|
||||||
std::map<qualified_name, schema_mutations> result;
|
std::map<utils::UUID, schema_mutations> result;
|
||||||
for (auto&& keyspace_name : keyspace_names) {
|
for (auto&& keyspace_name : keyspace_names) {
|
||||||
for (auto&& table_name : read_table_names_of_keyspace(proxy, keyspace_name, s).get0()) {
|
for (auto&& table_name : read_table_names_of_keyspace(proxy, keyspace_name, s).get0()) {
|
||||||
auto qn = qualified_name(keyspace_name, table_name);
|
auto qn = qualified_name(keyspace_name, table_name);
|
||||||
result.emplace(qn, read_table_mutations(proxy, qn, s).get0());
|
auto muts = read_table_mutations(proxy, qn, s).get0();
|
||||||
|
auto id = table_id_from_mutations(muts);
|
||||||
|
result.emplace(std::move(id), std::move(muts));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
@@ -956,14 +936,14 @@ struct schema_diff {
|
|||||||
|
|
||||||
template<typename CreateSchema>
|
template<typename CreateSchema>
|
||||||
static schema_diff diff_table_or_view(distributed<service::storage_proxy>& proxy,
|
static schema_diff diff_table_or_view(distributed<service::storage_proxy>& proxy,
|
||||||
std::map<qualified_name, schema_mutations>&& before,
|
std::map<utils::UUID, schema_mutations>&& before,
|
||||||
std::map<qualified_name, schema_mutations>&& after,
|
std::map<utils::UUID, schema_mutations>&& after,
|
||||||
CreateSchema&& create_schema)
|
CreateSchema&& create_schema)
|
||||||
{
|
{
|
||||||
schema_diff d;
|
schema_diff d;
|
||||||
auto diff = difference(before, after);
|
auto diff = difference(before, after);
|
||||||
for (auto&& key : diff.entries_only_on_left) {
|
for (auto&& key : diff.entries_only_on_left) {
|
||||||
auto&& s = proxy.local().get_db().local().find_schema(key.keyspace_name, key.table_name);
|
auto&& s = proxy.local().get_db().local().find_schema(key);
|
||||||
slogger.info("Dropping {}.{} id={} version={}", s->ks_name(), s->cf_name(), s->id(), s->version());
|
slogger.info("Dropping {}.{} id={} version={}", s->ks_name(), s->cf_name(), s->id(), s->version());
|
||||||
d.dropped.emplace_back(schema_diff::dropped_schema{s});
|
d.dropped.emplace_back(schema_diff::dropped_schema{s});
|
||||||
}
|
}
|
||||||
@@ -986,10 +966,10 @@ static schema_diff diff_table_or_view(distributed<service::storage_proxy>& proxy
|
|||||||
// upon an alter table or alter type statement), then they are published together
|
// upon an alter table or alter type statement), then they are published together
|
||||||
// as well, without any deferring in-between.
|
// as well, without any deferring in-between.
|
||||||
static void merge_tables_and_views(distributed<service::storage_proxy>& proxy,
|
static void merge_tables_and_views(distributed<service::storage_proxy>& proxy,
|
||||||
std::map<qualified_name, schema_mutations>&& tables_before,
|
std::map<utils::UUID, schema_mutations>&& tables_before,
|
||||||
std::map<qualified_name, schema_mutations>&& tables_after,
|
std::map<utils::UUID, schema_mutations>&& tables_after,
|
||||||
std::map<qualified_name, schema_mutations>&& views_before,
|
std::map<utils::UUID, schema_mutations>&& views_before,
|
||||||
std::map<qualified_name, schema_mutations>&& views_after)
|
std::map<utils::UUID, schema_mutations>&& views_after)
|
||||||
{
|
{
|
||||||
auto tables_diff = diff_table_or_view(proxy, std::move(tables_before), std::move(tables_after), [&] (auto&& sm) {
|
auto tables_diff = diff_table_or_view(proxy, std::move(tables_before), std::move(tables_after), [&] (auto&& sm) {
|
||||||
return create_table_from_mutations(proxy, std::move(sm));
|
return create_table_from_mutations(proxy, std::move(sm));
|
||||||
@@ -1000,6 +980,10 @@ static void merge_tables_and_views(distributed<service::storage_proxy>& proxy,
|
|||||||
|
|
||||||
proxy.local().get_db().invoke_on_all([&] (database& db) {
|
proxy.local().get_db().invoke_on_all([&] (database& db) {
|
||||||
return seastar::async([&] {
|
return seastar::async([&] {
|
||||||
|
parallel_for_each(boost::range::join(tables_diff.dropped, views_diff.dropped), [&] (schema_diff::dropped_schema& dt) {
|
||||||
|
auto& s = *dt.schema.get();
|
||||||
|
return db.drop_column_family(s.ks_name(), s.cf_name(), [&] { return dt.jp.value(); });
|
||||||
|
}).get();
|
||||||
parallel_for_each(boost::range::join(tables_diff.created, views_diff.created), [&] (global_schema_ptr& gs) {
|
parallel_for_each(boost::range::join(tables_diff.created, views_diff.created), [&] (global_schema_ptr& gs) {
|
||||||
return db.add_column_family_and_make_directory(gs);
|
return db.add_column_family_and_make_directory(gs);
|
||||||
}).get();
|
}).get();
|
||||||
@@ -1011,10 +995,6 @@ static void merge_tables_and_views(distributed<service::storage_proxy>& proxy,
|
|||||||
for (auto&& gs : boost::range::join(tables_diff.altered, views_diff.altered)) {
|
for (auto&& gs : boost::range::join(tables_diff.altered, views_diff.altered)) {
|
||||||
columns_changed.push_back(db.update_column_family(gs));
|
columns_changed.push_back(db.update_column_family(gs));
|
||||||
}
|
}
|
||||||
parallel_for_each(boost::range::join(tables_diff.dropped, views_diff.dropped), [&] (schema_diff::dropped_schema& dt) {
|
|
||||||
auto& s = *dt.schema.get();
|
|
||||||
return db.drop_column_family(s.ks_name(), s.cf_name(), [&] { return dt.jp.value(); });
|
|
||||||
}).get();
|
|
||||||
|
|
||||||
auto& mm = service::get_local_migration_manager();
|
auto& mm = service::get_local_migration_manager();
|
||||||
auto it = columns_changed.begin();
|
auto it = columns_changed.begin();
|
||||||
@@ -2681,12 +2661,22 @@ data_type parse_type(sstring str)
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::vector<schema_ptr> all_tables() {
|
std::vector<schema_ptr> all_tables() {
|
||||||
|
// Don't forget to update this list when new schema tables are added.
|
||||||
|
// The listed schema tables are the ones synchronized between nodes,
|
||||||
|
// and forgetting one of them in this list can cause bugs like #4339.
|
||||||
return {
|
return {
|
||||||
keyspaces(), tables(), scylla_tables(), columns(), dropped_columns(), triggers(),
|
keyspaces(), tables(), scylla_tables(), columns(), dropped_columns(), triggers(),
|
||||||
views(), indexes(), types(), functions(), aggregates(), view_virtual_columns()
|
views(), indexes(), types(), functions(), aggregates(), view_virtual_columns()
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const std::vector<sstring>& all_table_names() {
|
||||||
|
static thread_local std::vector<sstring> all =
|
||||||
|
boost::copy_range<std::vector<sstring>>(all_tables() |
|
||||||
|
boost::adaptors::transformed([] (auto schema) { return schema->cf_name(); }));
|
||||||
|
return all;
|
||||||
|
}
|
||||||
|
|
||||||
namespace legacy {
|
namespace legacy {
|
||||||
|
|
||||||
table_schema_version schema_mutations::digest() const {
|
table_schema_version schema_mutations::digest() const {
|
||||||
|
|||||||
@@ -127,9 +127,8 @@ using namespace v3;
|
|||||||
// Replication of schema between nodes with different version is inhibited.
|
// Replication of schema between nodes with different version is inhibited.
|
||||||
extern const sstring version;
|
extern const sstring version;
|
||||||
|
|
||||||
extern std::vector<const char*> ALL;
|
|
||||||
|
|
||||||
std::vector<schema_ptr> all_tables();
|
std::vector<schema_ptr> all_tables();
|
||||||
|
const std::vector<sstring>& all_table_names();
|
||||||
|
|
||||||
// saves/creates "ks" + all tables etc, while first deleting all old schema entries (will be rewritten)
|
// saves/creates "ks" + all tables etc, while first deleting all old schema entries (will be rewritten)
|
||||||
future<> save_system_schema(const sstring & ks);
|
future<> save_system_schema(const sstring & ks);
|
||||||
|
|||||||
329
db/size_estimates_virtual_reader.cc
Normal file
329
db/size_estimates_virtual_reader.cc
Normal file
@@ -0,0 +1,329 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2019 ScyllaDB
|
||||||
|
*
|
||||||
|
* Modified by ScyllaDB
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This file is part of Scylla.
|
||||||
|
*
|
||||||
|
* Scylla is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* Scylla is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <boost/range/adaptor/indirected.hpp>
|
||||||
|
#include <boost/range/adaptor/map.hpp>
|
||||||
|
#include <boost/range/adaptor/transformed.hpp>
|
||||||
|
#include <boost/range/algorithm/find_if.hpp>
|
||||||
|
|
||||||
|
#include "clustering_bounds_comparator.hh"
|
||||||
|
#include "database.hh"
|
||||||
|
#include "db/system_keyspace.hh"
|
||||||
|
#include "dht/i_partitioner.hh"
|
||||||
|
#include "partition_range_compat.hh"
|
||||||
|
#include "range.hh"
|
||||||
|
#include "service/storage_service.hh"
|
||||||
|
#include "stdx.hh"
|
||||||
|
#include "mutation_fragment.hh"
|
||||||
|
#include "sstables/sstables.hh"
|
||||||
|
#include "db/timeout_clock.hh"
|
||||||
|
#include "database.hh"
|
||||||
|
|
||||||
|
#include "db/size_estimates_virtual_reader.hh"
|
||||||
|
|
||||||
|
namespace db {
|
||||||
|
|
||||||
|
namespace size_estimates {
|
||||||
|
|
||||||
|
struct virtual_row {
|
||||||
|
const bytes& cf_name;
|
||||||
|
const token_range& tokens;
|
||||||
|
clustering_key_prefix as_key() const {
|
||||||
|
return clustering_key_prefix::from_exploded(std::vector<bytes_view>{cf_name, tokens.start, tokens.end});
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct virtual_row_comparator {
|
||||||
|
schema_ptr _schema;
|
||||||
|
virtual_row_comparator(schema_ptr schema) : _schema(schema) { }
|
||||||
|
bool operator()(const clustering_key_prefix& key1, const clustering_key_prefix& key2) {
|
||||||
|
return clustering_key_prefix::prefix_equality_less_compare(*_schema)(key1, key2);
|
||||||
|
}
|
||||||
|
bool operator()(const virtual_row& row, const clustering_key_prefix& key) {
|
||||||
|
return operator()(row.as_key(), key);
|
||||||
|
}
|
||||||
|
bool operator()(const clustering_key_prefix& key, const virtual_row& row) {
|
||||||
|
return operator()(key, row.as_key());
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Iterating over the cartesian product of cf_names and token_ranges.
|
||||||
|
class virtual_row_iterator : public std::iterator<std::input_iterator_tag, const virtual_row> {
|
||||||
|
std::reference_wrapper<const std::vector<bytes>> _cf_names;
|
||||||
|
std::reference_wrapper<const std::vector<token_range>> _ranges;
|
||||||
|
size_t _cf_names_idx = 0;
|
||||||
|
size_t _ranges_idx = 0;
|
||||||
|
public:
|
||||||
|
struct end_iterator_tag {};
|
||||||
|
virtual_row_iterator(const std::vector<bytes>& cf_names, const std::vector<token_range>& ranges)
|
||||||
|
: _cf_names(std::ref(cf_names))
|
||||||
|
, _ranges(std::ref(ranges))
|
||||||
|
{ }
|
||||||
|
virtual_row_iterator(const std::vector<bytes>& cf_names, const std::vector<token_range>& ranges, end_iterator_tag)
|
||||||
|
: _cf_names(std::ref(cf_names))
|
||||||
|
, _ranges(std::ref(ranges))
|
||||||
|
, _cf_names_idx(cf_names.size())
|
||||||
|
, _ranges_idx(ranges.size())
|
||||||
|
{
|
||||||
|
if (cf_names.empty() || ranges.empty()) {
|
||||||
|
// The product of an empty range with any range is an empty range.
|
||||||
|
// In this case we want the end iterator to be equal to the begin iterator,
|
||||||
|
// which has_ranges_idx = _cf_names_idx = 0.
|
||||||
|
_ranges_idx = _cf_names_idx = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
virtual_row_iterator& operator++() {
|
||||||
|
if (++_ranges_idx == _ranges.get().size() && ++_cf_names_idx < _cf_names.get().size()) {
|
||||||
|
_ranges_idx = 0;
|
||||||
|
}
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
virtual_row_iterator operator++(int) {
|
||||||
|
virtual_row_iterator i(*this);
|
||||||
|
++(*this);
|
||||||
|
return i;
|
||||||
|
}
|
||||||
|
const value_type operator*() const {
|
||||||
|
return { _cf_names.get()[_cf_names_idx], _ranges.get()[_ranges_idx] };
|
||||||
|
}
|
||||||
|
bool operator==(const virtual_row_iterator& i) const {
|
||||||
|
return _cf_names_idx == i._cf_names_idx
|
||||||
|
&& _ranges_idx == i._ranges_idx;
|
||||||
|
}
|
||||||
|
bool operator!=(const virtual_row_iterator& i) const {
|
||||||
|
return !(*this == i);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the keyspaces, ordered by name, as selected by the partition_range.
|
||||||
|
*/
|
||||||
|
static std::vector<sstring> get_keyspaces(const schema& s, const database& db, dht::partition_range range) {
|
||||||
|
struct keyspace_less_comparator {
|
||||||
|
const schema& _s;
|
||||||
|
keyspace_less_comparator(const schema& s) : _s(s) { }
|
||||||
|
dht::ring_position as_ring_position(const sstring& ks) {
|
||||||
|
auto pkey = partition_key::from_single_value(_s, utf8_type->decompose(ks));
|
||||||
|
return dht::global_partitioner().decorate_key(_s, std::move(pkey));
|
||||||
|
}
|
||||||
|
bool operator()(const sstring& ks1, const sstring& ks2) {
|
||||||
|
return as_ring_position(ks1).less_compare(_s, as_ring_position(ks2));
|
||||||
|
}
|
||||||
|
bool operator()(const sstring& ks, const dht::ring_position& rp) {
|
||||||
|
return as_ring_position(ks).less_compare(_s, rp);
|
||||||
|
}
|
||||||
|
bool operator()(const dht::ring_position& rp, const sstring& ks) {
|
||||||
|
return rp.less_compare(_s, as_ring_position(ks));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
auto keyspaces = db.get_non_system_keyspaces();
|
||||||
|
auto cmp = keyspace_less_comparator(s);
|
||||||
|
boost::sort(keyspaces, cmp);
|
||||||
|
return boost::copy_range<std::vector<sstring>>(
|
||||||
|
range.slice(keyspaces, std::move(cmp)) | boost::adaptors::filtered([&s] (const auto& ks) {
|
||||||
|
// If this is a range query, results are divided between shards by the partition key (keyspace_name).
|
||||||
|
return shard_of(dht::global_partitioner().get_token(s,
|
||||||
|
partition_key::from_single_value(s, utf8_type->decompose(ks))))
|
||||||
|
== engine().cpu_id();
|
||||||
|
})
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Makes a wrapping range of ring_position from a nonwrapping range of token, used to select sstables.
|
||||||
|
*/
|
||||||
|
static dht::partition_range as_ring_position_range(dht::token_range& r) {
|
||||||
|
stdx::optional<range<dht::ring_position>::bound> start_bound, end_bound;
|
||||||
|
if (r.start()) {
|
||||||
|
start_bound = {{ dht::ring_position(r.start()->value(), dht::ring_position::token_bound::start), r.start()->is_inclusive() }};
|
||||||
|
}
|
||||||
|
if (r.end()) {
|
||||||
|
end_bound = {{ dht::ring_position(r.end()->value(), dht::ring_position::token_bound::end), r.end()->is_inclusive() }};
|
||||||
|
}
|
||||||
|
return dht::partition_range(std::move(start_bound), std::move(end_bound), r.is_singular());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Add a new range_estimates for the specified range, considering the sstables associated with `cf`.
|
||||||
|
*/
|
||||||
|
static system_keyspace::range_estimates estimate(const column_family& cf, const token_range& r) {
|
||||||
|
int64_t count{0};
|
||||||
|
utils::estimated_histogram hist{0};
|
||||||
|
auto from_bytes = [] (auto& b) {
|
||||||
|
return dht::global_partitioner().from_sstring(utf8_type->to_string(b));
|
||||||
|
};
|
||||||
|
dht::token_range_vector ranges;
|
||||||
|
::compat::unwrap_into(
|
||||||
|
wrapping_range<dht::token>({{ from_bytes(r.start), false }}, {{ from_bytes(r.end) }}),
|
||||||
|
dht::token_comparator(),
|
||||||
|
[&] (auto&& rng) { ranges.push_back(std::move(rng)); });
|
||||||
|
for (auto&& r : ranges) {
|
||||||
|
auto rp_range = as_ring_position_range(r);
|
||||||
|
for (auto&& sstable : cf.select_sstables(rp_range)) {
|
||||||
|
count += sstable->estimated_keys_for_range(r);
|
||||||
|
hist.merge(sstable->get_stats_metadata().estimated_row_size);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return {cf.schema(), r.start, r.end, count, count > 0 ? hist.mean() : 0};
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns the primary token ranges of the local node as textual (start, end) pairs,
// sorted by their start bound using utf8_type ordering.
//
// The primary ranges are obtained from the token metadata for the local tokens. If the
// set contains both a range open towards the minimum token and a different range open
// towards the maximum token, the two are merged into a single entry that runs from the
// start of the latter to the end of the former.
future<std::vector<token_range>> get_local_ranges() {
    auto& ss = service::get_local_storage_service();
    return ss.get_local_tokens().then([&ss] (auto&& tokens) {
        auto ranges = ss.get_token_metadata().get_primary_ranges_for(std::move(tokens));
        std::vector<token_range> local_ranges;
        auto to_bytes = [](const stdx::optional<dht::token_range::bound>& b) {
            assert(b);
            return utf8_type->decompose(dht::global_partitioner().to_sstring(b->value()));
        };
        // We merge the ranges to be compatible with how Cassandra shows its size estimates table.
        // All queries will be on that table, where all entries are text and there's no notion of
        // token ranges from the CQL point of view.
        //
        // Each predicate also requires the bound that the merged entry will use to be
        // present, so to_bytes() below never sees an empty optional for the merged range.
        auto left_inf = boost::find_if(ranges, [] (auto&& r) {
            return r.end() && (!r.start() || r.start()->value() == dht::minimum_token());
        });
        // The right-open range is recognised by its *end* bound. Inspecting the start
        // bound here would both test the wrong side and dereference an empty optional
        // for a range that has no start.
        auto right_inf = boost::find_if(ranges, [] (auto&& r) {
            return r.start() && (!r.end() || r.end()->value() == dht::maximum_token());
        });
        if (left_inf != right_inf && left_inf != ranges.end() && right_inf != ranges.end()) {
            local_ranges.push_back(token_range{to_bytes(right_inf->start()), to_bytes(left_inf->end())});
            // Erasing from a vector invalidates iterators at and after the erased
            // position, so always remove the later element first.
            if (left_inf < right_inf) {
                ranges.erase(right_inf);
                ranges.erase(left_inf);
            } else {
                ranges.erase(left_inf);
                ranges.erase(right_inf);
            }
        }
        for (auto&& r : ranges) {
            local_ranges.push_back(token_range{to_bytes(r.start()), to_bytes(r.end())});
        }
        boost::sort(local_ranges, [] (auto&& tr1, auto&& tr2) {
            return utf8_type->less(tr1.start, tr2.start);
        });
        return local_ranges;
    });
}
|
||||||
|
|
||||||
|
// Constructs a reader over the size-estimates virtual table.
// The partition range is kept by pointer and the slice by reference (neither is copied),
// so both must stay alive for as long as the reader uses them.
size_estimates_mutation_reader::size_estimates_mutation_reader(
        schema_ptr schema,
        const dht::partition_range& prange,
        const query::partition_slice& slice,
        streamed_mutation::forwarding fwd)
    : impl(schema)                  // the base class receives its own copy of the schema pointer
    , _schema(std::move(schema))
    , _prange(&prange)
    , _slice(slice)
    , _fwd(fwd)
{ }
|
||||||
|
|
||||||
|
// Prepares the sub-reader for the next keyspace (one keyspace == one partition).
// On first use, or after the keyspace list has been reset, the list of keyspaces is
// computed for the current partition range. When all keyspaces have been consumed,
// the reader is marked as being at end of stream.
future<> size_estimates_mutation_reader::get_next_partition() {
    auto& db = service::get_local_storage_proxy().get_db().local();

    if (!_keyspaces) {
        _keyspaces = get_keyspaces(*_schema, db, *_prange);
        _current_partition = _keyspaces->begin();
    }

    const bool exhausted = (_current_partition == _keyspaces->end());
    if (exhausted) {
        _end_of_stream = true;
        return make_ready_future<>();
    }

    return get_local_ranges().then([this, &db] (auto&& local_ranges) {
        auto ks_estimates = this->estimates_for_current_keyspace(db, std::move(local_ranges));
        auto partition_mutation = db::system_keyspace::make_size_estimates_mutation(*_current_partition, std::move(ks_estimates));
        // Move on to the next keyspace before handing the mutation to the sub-reader.
        ++_current_partition;
        std::vector<mutation> single;
        single.emplace_back(std::move(partition_mutation));
        _partition_reader = flat_mutation_reader_from_mutations(std::move(single), _fwd);
    });
}
|
||||||
|
|
||||||
|
// Fills this reader's buffer until it is full or the stream ends.
// Each step either sets up the sub-reader for the next keyspace (when there is none)
// or drains fragments from the current sub-reader into the buffer; a sub-reader that
// is at end of stream with an empty buffer is dropped so the next step advances.
future<> size_estimates_mutation_reader::fill_buffer(db::timeout_clock::time_point timeout) {
    auto finished = [this] {
        return is_end_of_stream() || is_buffer_full();
    };

    auto step = [this, timeout] {
        if (!_partition_reader) {
            return get_next_partition();
        }
        auto push_fragment = [this] (mutation_fragment mf) {
            push_mutation_fragment(std::move(mf));
            return stop_iteration(is_buffer_full());
        };
        return _partition_reader->consume_pausable(std::move(push_fragment), timeout).then([this] {
            const bool drained = _partition_reader->is_end_of_stream() && _partition_reader->is_buffer_empty();
            if (drained) {
                _partition_reader = stdx::nullopt;
            }
        });
    };

    return do_until(std::move(finished), std::move(step));
}
|
||||||
|
|
||||||
|
void size_estimates_mutation_reader::next_partition() {
|
||||||
|
clear_buffer_to_next_partition();
|
||||||
|
if (is_buffer_empty()) {
|
||||||
|
_partition_reader = stdx::nullopt;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fast-forwards the reader to a new partition range.
// All buffered fragments and per-range state are discarded: the keyspace list is reset
// so it gets recomputed for `pr`, and the current sub-reader is dropped.
// Only the address of `pr` is stored, so the range must outlive its use by the reader.
// `timeout` is not used because nothing here waits.
future<> size_estimates_mutation_reader::fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) {
    clear_buffer();
    _prange = &pr;
    _keyspaces = stdx::nullopt;
    _partition_reader = stdx::nullopt;
    _end_of_stream = false;
    return make_ready_future<>();
}
|
||||||
|
|
||||||
|
// Fast-forwards within the current partition to the given position range.
// The buffer is forwarded to the start of `pr`, the end-of-stream flag is cleared and,
// if a sub-reader exists, the request is passed on to it.
future<> size_estimates_mutation_reader::fast_forward_to(position_range pr, db::timeout_clock::time_point timeout) {
    forward_buffer_to(pr.start());
    _end_of_stream = false;
    if (!_partition_reader) {
        return make_ready_future<>();
    }
    return _partition_reader->fast_forward_to(std::move(pr), timeout);
}
|
||||||
|
|
||||||
|
size_t size_estimates_mutation_reader::buffer_size() const {
|
||||||
|
if (_partition_reader) {
|
||||||
|
return flat_mutation_reader::impl::buffer_size() + _partition_reader->buffer_size();
|
||||||
|
}
|
||||||
|
return flat_mutation_reader::impl::buffer_size();
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<db::system_keyspace::range_estimates>
|
||||||
|
size_estimates_mutation_reader::estimates_for_current_keyspace(const database& db, std::vector<token_range> local_ranges) const {
|
||||||
|
// For each specified range, estimate (crudely) mean partition size and partitions count.
|
||||||
|
auto pkey = partition_key::from_single_value(*_schema, utf8_type->decompose(*_current_partition));
|
||||||
|
auto cfs = db.find_keyspace(*_current_partition).metadata()->cf_meta_data();
|
||||||
|
auto cf_names = boost::copy_range<std::vector<bytes>>(cfs | boost::adaptors::transformed([] (auto&& cf) {
|
||||||
|
return utf8_type->decompose(cf.first);
|
||||||
|
}));
|
||||||
|
boost::sort(cf_names, [] (auto&& n1, auto&& n2) {
|
||||||
|
return utf8_type->less(n1, n2);
|
||||||
|
});
|
||||||
|
std::vector<db::system_keyspace::range_estimates> estimates;
|
||||||
|
for (auto& range : _slice.row_ranges(*_schema, pkey)) {
|
||||||
|
auto rows = boost::make_iterator_range(
|
||||||
|
virtual_row_iterator(cf_names, local_ranges),
|
||||||
|
virtual_row_iterator(cf_names, local_ranges, virtual_row_iterator::end_iterator_tag()));
|
||||||
|
auto rows_to_estimate = range.slice(rows, virtual_row_comparator(_schema));
|
||||||
|
for (auto&& r : rows_to_estimate) {
|
||||||
|
auto& cf = db.find_column_family(*_current_partition, utf8_type->to_string(r.cf_name));
|
||||||
|
estimates.push_back(estimate(cf, r.tokens));
|
||||||
|
if (estimates.size() >= _slice.partition_row_limit()) {
|
||||||
|
return estimates;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return estimates;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace size_estimates
|
||||||
|
|
||||||
|
} // namespace db
|
||||||
@@ -21,33 +21,19 @@
|
|||||||
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <boost/range/adaptor/indirected.hpp>
|
|
||||||
#include <boost/range/adaptor/map.hpp>
|
|
||||||
#include <boost/range/adaptor/transformed.hpp>
|
|
||||||
#include <boost/range/algorithm/find_if.hpp>
|
|
||||||
|
|
||||||
#include "clustering_bounds_comparator.hh"
|
|
||||||
#include "database.hh"
|
|
||||||
#include "db/system_keyspace.hh"
|
#include "db/system_keyspace.hh"
|
||||||
#include "dht/i_partitioner.hh"
|
|
||||||
#include "mutation_reader.hh"
|
#include "mutation_reader.hh"
|
||||||
#include "partition_range_compat.hh"
|
|
||||||
#include "range.hh"
|
|
||||||
#include "service/storage_service.hh"
|
|
||||||
#include "stdx.hh"
|
|
||||||
#include "mutation_fragment.hh"
|
|
||||||
#include "sstables/sstables.hh"
|
|
||||||
#include "db/timeout_clock.hh"
|
|
||||||
|
|
||||||
namespace db {
|
namespace db {
|
||||||
|
|
||||||
namespace size_estimates {
|
namespace size_estimates {
|
||||||
|
|
||||||
|
struct token_range {
|
||||||
|
bytes start;
|
||||||
|
bytes end;
|
||||||
|
};
|
||||||
|
|
||||||
class size_estimates_mutation_reader final : public flat_mutation_reader::impl {
|
class size_estimates_mutation_reader final : public flat_mutation_reader::impl {
|
||||||
struct token_range {
|
|
||||||
bytes start;
|
|
||||||
bytes end;
|
|
||||||
};
|
|
||||||
schema_ptr _schema;
|
schema_ptr _schema;
|
||||||
const dht::partition_range* _prange;
|
const dht::partition_range* _prange;
|
||||||
const query::partition_slice& _slice;
|
const query::partition_slice& _slice;
|
||||||
@@ -57,267 +43,18 @@ class size_estimates_mutation_reader final : public flat_mutation_reader::impl {
|
|||||||
streamed_mutation::forwarding _fwd;
|
streamed_mutation::forwarding _fwd;
|
||||||
flat_mutation_reader_opt _partition_reader;
|
flat_mutation_reader_opt _partition_reader;
|
||||||
public:
|
public:
|
||||||
size_estimates_mutation_reader(schema_ptr schema, const dht::partition_range& prange, const query::partition_slice& slice, streamed_mutation::forwarding fwd)
|
size_estimates_mutation_reader(schema_ptr, const dht::partition_range&, const query::partition_slice&, streamed_mutation::forwarding);
|
||||||
: impl(schema)
|
|
||||||
, _schema(std::move(schema))
|
|
||||||
, _prange(&prange)
|
|
||||||
, _slice(slice)
|
|
||||||
, _fwd(fwd)
|
|
||||||
{ }
|
|
||||||
|
|
||||||
|
virtual future<> fill_buffer(db::timeout_clock::time_point) override;
|
||||||
|
virtual void next_partition() override;
|
||||||
|
virtual future<> fast_forward_to(const dht::partition_range&, db::timeout_clock::time_point) override;
|
||||||
|
virtual future<> fast_forward_to(position_range, db::timeout_clock::time_point) override;
|
||||||
|
virtual size_t buffer_size() const override;
|
||||||
private:
|
private:
|
||||||
future<> get_next_partition() {
|
future<> get_next_partition();
|
||||||
// For each specified range, estimate (crudely) mean partition size and partitions count.
|
|
||||||
auto& db = service::get_local_storage_proxy().get_db().local();
|
|
||||||
if (!_keyspaces) {
|
|
||||||
_keyspaces = get_keyspaces(*_schema, db, *_prange);
|
|
||||||
_current_partition = _keyspaces->begin();
|
|
||||||
}
|
|
||||||
if (_current_partition == _keyspaces->end()) {
|
|
||||||
_end_of_stream = true;
|
|
||||||
return make_ready_future<>();
|
|
||||||
}
|
|
||||||
return get_local_ranges().then([&db, this] (auto&& ranges) {
|
|
||||||
auto estimates = this->estimates_for_current_keyspace(db, std::move(ranges));
|
|
||||||
auto mutations = db::system_keyspace::make_size_estimates_mutation(*_current_partition, std::move(estimates));
|
|
||||||
++_current_partition;
|
|
||||||
std::vector<mutation> ms;
|
|
||||||
ms.emplace_back(std::move(mutations));
|
|
||||||
_partition_reader = flat_mutation_reader_from_mutations(std::move(ms), _fwd);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
public:
|
|
||||||
virtual future<> fill_buffer(db::timeout_clock::time_point timeout) override {
|
|
||||||
return do_until([this, timeout] { return is_end_of_stream() || is_buffer_full(); }, [this, timeout] {
|
|
||||||
if (!_partition_reader) {
|
|
||||||
return get_next_partition();
|
|
||||||
}
|
|
||||||
return _partition_reader->consume_pausable([this] (mutation_fragment mf) {
|
|
||||||
push_mutation_fragment(std::move(mf));
|
|
||||||
return stop_iteration(is_buffer_full());
|
|
||||||
}, timeout).then([this] {
|
|
||||||
if (_partition_reader->is_end_of_stream() && _partition_reader->is_buffer_empty()) {
|
|
||||||
_partition_reader = stdx::nullopt;
|
|
||||||
}
|
|
||||||
});
|
|
||||||
});
|
|
||||||
}
|
|
||||||
virtual void next_partition() override {
|
|
||||||
clear_buffer_to_next_partition();
|
|
||||||
if (is_buffer_empty()) {
|
|
||||||
_partition_reader = stdx::nullopt;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
virtual future<> fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) override {
|
|
||||||
clear_buffer();
|
|
||||||
_prange = &pr;
|
|
||||||
_keyspaces = stdx::nullopt;
|
|
||||||
_partition_reader = stdx::nullopt;
|
|
||||||
_end_of_stream = false;
|
|
||||||
return make_ready_future<>();
|
|
||||||
}
|
|
||||||
virtual future<> fast_forward_to(position_range pr, db::timeout_clock::time_point timeout) override {
|
|
||||||
forward_buffer_to(pr.start());
|
|
||||||
_end_of_stream = false;
|
|
||||||
if (_partition_reader) {
|
|
||||||
return _partition_reader->fast_forward_to(std::move(pr), timeout);
|
|
||||||
}
|
|
||||||
return make_ready_future<>();
|
|
||||||
}
|
|
||||||
virtual size_t buffer_size() const override {
|
|
||||||
if (_partition_reader) {
|
|
||||||
return flat_mutation_reader::impl::buffer_size() + _partition_reader->buffer_size();
|
|
||||||
}
|
|
||||||
return flat_mutation_reader::impl::buffer_size();
|
|
||||||
}
|
|
||||||
/**
|
|
||||||
* Returns the primary ranges for the local node.
|
|
||||||
* Used for testing as well.
|
|
||||||
*/
|
|
||||||
static future<std::vector<token_range>> get_local_ranges() {
|
|
||||||
auto& ss = service::get_local_storage_service();
|
|
||||||
return ss.get_local_tokens().then([&ss] (auto&& tokens) {
|
|
||||||
auto ranges = ss.get_token_metadata().get_primary_ranges_for(std::move(tokens));
|
|
||||||
std::vector<token_range> local_ranges;
|
|
||||||
auto to_bytes = [](const stdx::optional<dht::token_range::bound>& b) {
|
|
||||||
assert(b);
|
|
||||||
return utf8_type->decompose(dht::global_partitioner().to_sstring(b->value()));
|
|
||||||
};
|
|
||||||
// We merge the ranges to be compatible with how Cassandra shows it's size estimates table.
|
|
||||||
// All queries will be on that table, where all entries are text and there's no notion of
|
|
||||||
// token ranges form the CQL point of view.
|
|
||||||
auto left_inf = boost::find_if(ranges, [] (auto&& r) {
|
|
||||||
return !r.start() || r.start()->value() == dht::minimum_token();
|
|
||||||
});
|
|
||||||
auto right_inf = boost::find_if(ranges, [] (auto&& r) {
|
|
||||||
return !r.end() || r.start()->value() == dht::maximum_token();
|
|
||||||
});
|
|
||||||
if (left_inf != right_inf && left_inf != ranges.end() && right_inf != ranges.end()) {
|
|
||||||
local_ranges.push_back(token_range{to_bytes(right_inf->start()), to_bytes(left_inf->end())});
|
|
||||||
ranges.erase(left_inf);
|
|
||||||
ranges.erase(right_inf);
|
|
||||||
}
|
|
||||||
for (auto&& r : ranges) {
|
|
||||||
local_ranges.push_back(token_range{to_bytes(r.start()), to_bytes(r.end())});
|
|
||||||
}
|
|
||||||
boost::sort(local_ranges, [] (auto&& tr1, auto&& tr2) {
|
|
||||||
return utf8_type->less(tr1.start, tr2.start);
|
|
||||||
});
|
|
||||||
return local_ranges;
|
|
||||||
});
|
|
||||||
}
|
|
||||||
private:
|
|
||||||
struct virtual_row {
|
|
||||||
const bytes& cf_name;
|
|
||||||
const token_range& tokens;
|
|
||||||
clustering_key_prefix as_key() const {
|
|
||||||
return clustering_key_prefix::from_exploded(std::vector<bytes_view>{cf_name, tokens.start, tokens.end});
|
|
||||||
}
|
|
||||||
};
|
|
||||||
struct virtual_row_comparator {
|
|
||||||
schema_ptr _schema;
|
|
||||||
virtual_row_comparator(schema_ptr schema) : _schema(schema) { }
|
|
||||||
bool operator()(const clustering_key_prefix& key1, const clustering_key_prefix& key2) {
|
|
||||||
return clustering_key_prefix::prefix_equality_less_compare(*_schema)(key1, key2);
|
|
||||||
}
|
|
||||||
bool operator()(const virtual_row& row, const clustering_key_prefix& key) {
|
|
||||||
return operator()(row.as_key(), key);
|
|
||||||
}
|
|
||||||
bool operator()(const clustering_key_prefix& key, const virtual_row& row) {
|
|
||||||
return operator()(key, row.as_key());
|
|
||||||
}
|
|
||||||
};
|
|
||||||
class virtual_row_iterator : public std::iterator<std::input_iterator_tag, const virtual_row> {
|
|
||||||
std::reference_wrapper<const std::vector<bytes>> _cf_names;
|
|
||||||
std::reference_wrapper<const std::vector<token_range>> _ranges;
|
|
||||||
size_t _cf_names_idx = 0;
|
|
||||||
size_t _ranges_idx = 0;
|
|
||||||
public:
|
|
||||||
struct end_iterator_tag {};
|
|
||||||
virtual_row_iterator(const std::vector<bytes>& cf_names, const std::vector<token_range>& ranges)
|
|
||||||
: _cf_names(std::ref(cf_names))
|
|
||||||
, _ranges(std::ref(ranges))
|
|
||||||
{ }
|
|
||||||
virtual_row_iterator(const std::vector<bytes>& cf_names, const std::vector<token_range>& ranges, end_iterator_tag)
|
|
||||||
: _cf_names(std::ref(cf_names))
|
|
||||||
, _ranges(std::ref(ranges))
|
|
||||||
, _cf_names_idx(cf_names.size())
|
|
||||||
, _ranges_idx(ranges.size())
|
|
||||||
{ }
|
|
||||||
virtual_row_iterator& operator++() {
|
|
||||||
if (++_ranges_idx == _ranges.get().size() && ++_cf_names_idx < _cf_names.get().size()) {
|
|
||||||
_ranges_idx = 0;
|
|
||||||
}
|
|
||||||
return *this;
|
|
||||||
}
|
|
||||||
virtual_row_iterator operator++(int) {
|
|
||||||
virtual_row_iterator i(*this);
|
|
||||||
++(*this);
|
|
||||||
return i;
|
|
||||||
}
|
|
||||||
const value_type operator*() const {
|
|
||||||
return { _cf_names.get()[_cf_names_idx], _ranges.get()[_ranges_idx] };
|
|
||||||
}
|
|
||||||
bool operator==(const virtual_row_iterator& i) const {
|
|
||||||
return _cf_names_idx == i._cf_names_idx
|
|
||||||
&& _ranges_idx == i._ranges_idx;
|
|
||||||
}
|
|
||||||
bool operator!=(const virtual_row_iterator& i) const {
|
|
||||||
return !(*this == i);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
std::vector<db::system_keyspace::range_estimates>
|
std::vector<db::system_keyspace::range_estimates>
|
||||||
estimates_for_current_keyspace(const database& db, std::vector<token_range> local_ranges) const {
|
estimates_for_current_keyspace(const database&, std::vector<token_range> local_ranges) const;
|
||||||
auto pkey = partition_key::from_single_value(*_schema, utf8_type->decompose(*_current_partition));
|
|
||||||
auto cfs = db.find_keyspace(*_current_partition).metadata()->cf_meta_data();
|
|
||||||
auto cf_names = boost::copy_range<std::vector<bytes>>(cfs | boost::adaptors::transformed([] (auto&& cf) {
|
|
||||||
return utf8_type->decompose(cf.first);
|
|
||||||
}));
|
|
||||||
boost::sort(cf_names, [] (auto&& n1, auto&& n2) {
|
|
||||||
return utf8_type->less(n1, n2);
|
|
||||||
});
|
|
||||||
std::vector<db::system_keyspace::range_estimates> estimates;
|
|
||||||
for (auto& range : _slice.row_ranges(*_schema, pkey)) {
|
|
||||||
auto rows = boost::make_iterator_range(
|
|
||||||
virtual_row_iterator(cf_names, local_ranges),
|
|
||||||
virtual_row_iterator(cf_names, local_ranges, virtual_row_iterator::end_iterator_tag()));
|
|
||||||
auto rows_to_estimate = range.slice(rows, virtual_row_comparator(_schema));
|
|
||||||
for (auto&& r : rows_to_estimate) {
|
|
||||||
auto& cf = db.find_column_family(*_current_partition, utf8_type->to_string(r.cf_name));
|
|
||||||
estimates.push_back(estimate(cf, r.tokens));
|
|
||||||
if (estimates.size() >= _slice.partition_row_limit()) {
|
|
||||||
return estimates;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return estimates;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the keyspaces, ordered by name, as selected by the partition_range.
|
|
||||||
*/
|
|
||||||
static ks_range get_keyspaces(const schema& s, const database& db, dht::partition_range range) {
|
|
||||||
struct keyspace_less_comparator {
|
|
||||||
const schema& _s;
|
|
||||||
keyspace_less_comparator(const schema& s) : _s(s) { }
|
|
||||||
dht::ring_position as_ring_position(const sstring& ks) {
|
|
||||||
auto pkey = partition_key::from_single_value(_s, utf8_type->decompose(ks));
|
|
||||||
return dht::global_partitioner().decorate_key(_s, std::move(pkey));
|
|
||||||
}
|
|
||||||
bool operator()(const sstring& ks1, const sstring& ks2) {
|
|
||||||
return as_ring_position(ks1).less_compare(_s, as_ring_position(ks2));
|
|
||||||
}
|
|
||||||
bool operator()(const sstring& ks, const dht::ring_position& rp) {
|
|
||||||
return as_ring_position(ks).less_compare(_s, rp);
|
|
||||||
}
|
|
||||||
bool operator()(const dht::ring_position& rp, const sstring& ks) {
|
|
||||||
return rp.less_compare(_s, as_ring_position(ks));
|
|
||||||
}
|
|
||||||
};
|
|
||||||
auto keyspaces = db.get_non_system_keyspaces();
|
|
||||||
auto cmp = keyspace_less_comparator(s);
|
|
||||||
boost::sort(keyspaces, cmp);
|
|
||||||
return boost::copy_range<ks_range>(range.slice(keyspaces, std::move(cmp)));
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Makes a wrapping range of ring_position from a nonwrapping range of token, used to select sstables.
|
|
||||||
*/
|
|
||||||
static dht::partition_range as_ring_position_range(dht::token_range& r) {
|
|
||||||
stdx::optional<range<dht::ring_position>::bound> start_bound, end_bound;
|
|
||||||
if (r.start()) {
|
|
||||||
start_bound = {{ dht::ring_position(r.start()->value(), dht::ring_position::token_bound::start), r.start()->is_inclusive() }};
|
|
||||||
}
|
|
||||||
if (r.end()) {
|
|
||||||
end_bound = {{ dht::ring_position(r.end()->value(), dht::ring_position::token_bound::end), r.end()->is_inclusive() }};
|
|
||||||
}
|
|
||||||
return dht::partition_range(std::move(start_bound), std::move(end_bound), r.is_singular());
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Add a new range_estimates for the specified range, considering the sstables associated with `cf`.
|
|
||||||
*/
|
|
||||||
static system_keyspace::range_estimates estimate(const column_family& cf, const token_range& r) {
|
|
||||||
int64_t count{0};
|
|
||||||
utils::estimated_histogram hist{0};
|
|
||||||
auto from_bytes = [] (auto& b) {
|
|
||||||
return dht::global_partitioner().from_sstring(utf8_type->to_string(b));
|
|
||||||
};
|
|
||||||
dht::token_range_vector ranges;
|
|
||||||
::compat::unwrap_into(
|
|
||||||
wrapping_range<dht::token>({{ from_bytes(r.start), false }}, {{ from_bytes(r.end) }}),
|
|
||||||
dht::token_comparator(),
|
|
||||||
[&] (auto&& rng) { ranges.push_back(std::move(rng)); });
|
|
||||||
for (auto&& r : ranges) {
|
|
||||||
auto rp_range = as_ring_position_range(r);
|
|
||||||
for (auto&& sstable : cf.select_sstables(rp_range)) {
|
|
||||||
count += sstable->estimated_keys_for_range(r);
|
|
||||||
hist.merge(sstable->get_stats_metadata().estimated_row_size);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return {cf.schema(), r.start, r.end, count, count > 0 ? hist.mean() : 0};
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct virtual_reader {
|
struct virtual_reader {
|
||||||
@@ -332,6 +69,12 @@ struct virtual_reader {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the primary ranges for the local node.
|
||||||
|
* Used for testing as well.
|
||||||
|
*/
|
||||||
|
future<std::vector<token_range>> get_local_ranges();
|
||||||
|
|
||||||
} // namespace size_estimates
|
} // namespace size_estimates
|
||||||
|
|
||||||
} // namespace db
|
} // namespace db
|
||||||
|
|||||||
@@ -87,7 +87,7 @@ future<> system_distributed_keyspace::start() {
|
|||||||
return do_with(all_tables(), [this] (std::vector<schema_ptr>& tables) {
|
return do_with(all_tables(), [this] (std::vector<schema_ptr>& tables) {
|
||||||
return do_for_each(tables, [this] (schema_ptr table) {
|
return do_for_each(tables, [this] (schema_ptr table) {
|
||||||
return ignore_existing([this, table = std::move(table)] {
|
return ignore_existing([this, table = std::move(table)] {
|
||||||
return _mm.announce_new_column_family(std::move(table), false);
|
return _mm.announce_new_column_family(std::move(table), api::min_timestamp, false);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -28,5 +28,6 @@
|
|||||||
namespace db {
|
namespace db {
|
||||||
using timeout_clock = seastar::lowres_clock;
|
using timeout_clock = seastar::lowres_clock;
|
||||||
using timeout_semaphore = seastar::basic_semaphore<seastar::default_timeout_exception_factory, timeout_clock>;
|
using timeout_semaphore = seastar::basic_semaphore<seastar::default_timeout_exception_factory, timeout_clock>;
|
||||||
|
using timeout_semaphore_units = seastar::semaphore_units<seastar::default_timeout_exception_factory, timeout_clock>;
|
||||||
static constexpr timeout_clock::time_point no_timeout = timeout_clock::time_point::max();
|
static constexpr timeout_clock::time_point no_timeout = timeout_clock::time_point::max();
|
||||||
}
|
}
|
||||||
|
|||||||
70
db/view/node_view_update_backlog.hh
Normal file
70
db/view/node_view_update_backlog.hh
Normal file
@@ -0,0 +1,70 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2018 ScyllaDB
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This file is part of Scylla.
|
||||||
|
*
|
||||||
|
* Scylla is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* Scylla is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "db/view/view_update_backlog.hh"
|
||||||
|
|
||||||
|
#include <seastar/core/cacheline.hh>
|
||||||
|
#include <seastar/core/lowres_clock.hh>
|
||||||
|
|
||||||
|
#include <atomic>
|
||||||
|
#include <chrono>
|
||||||
|
#include <new>
|
||||||
|
|
||||||
|
namespace db::view {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* An atomic view update backlog representation, safe to update from multiple shards.
|
||||||
|
* It is legal for a stale current max value to be returned.
|
||||||
|
*/
|
||||||
|
class node_update_backlog {
|
||||||
|
using clock = seastar::lowres_clock;
|
||||||
|
struct per_shard_backlog {
|
||||||
|
// Multiply by 2 to defeat the prefetcher
|
||||||
|
alignas(seastar::cache_line_size * 2) std::atomic<update_backlog> backlog = update_backlog::no_backlog();
|
||||||
|
|
||||||
|
update_backlog load() const {
|
||||||
|
return backlog.load(std::memory_order_relaxed);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
std::vector<per_shard_backlog> _backlogs;
|
||||||
|
std::chrono::milliseconds _interval;
|
||||||
|
std::atomic<clock::time_point> _last_update;
|
||||||
|
std::atomic<update_backlog> _max;
|
||||||
|
|
||||||
|
public:
|
||||||
|
explicit node_update_backlog(size_t shards, std::chrono::milliseconds interval)
|
||||||
|
: _backlogs(shards)
|
||||||
|
, _interval(interval)
|
||||||
|
, _last_update(clock::now() - _interval)
|
||||||
|
, _max(update_backlog::no_backlog()) {
|
||||||
|
}
|
||||||
|
|
||||||
|
update_backlog add_fetch(unsigned shard, update_backlog backlog);
|
||||||
|
|
||||||
|
// Exposed for testing only.
|
||||||
|
update_backlog load() const {
|
||||||
|
return _max.load(std::memory_order_relaxed);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
||||||
153
db/view/view.cc
153
db/view/view.cc
@@ -58,6 +58,7 @@
|
|||||||
#include "cql3/util.hh"
|
#include "cql3/util.hh"
|
||||||
#include "db/view/view.hh"
|
#include "db/view/view.hh"
|
||||||
#include "db/view/view_builder.hh"
|
#include "db/view/view_builder.hh"
|
||||||
|
#include "frozen_mutation.hh"
|
||||||
#include "gms/inet_address.hh"
|
#include "gms/inet_address.hh"
|
||||||
#include "keys.hh"
|
#include "keys.hh"
|
||||||
#include "locator/network_topology_strategy.hh"
|
#include "locator/network_topology_strategy.hh"
|
||||||
@@ -226,10 +227,11 @@ public:
|
|||||||
, _updates(8, partition_key::hashing(*_view), partition_key::equality(*_view)) {
|
, _updates(8, partition_key::hashing(*_view), partition_key::equality(*_view)) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void move_to(std::vector<mutation>& mutations) && {
|
void move_to(std::vector<frozen_mutation_and_schema>& mutations) && {
|
||||||
auto& partitioner = dht::global_partitioner();
|
auto& partitioner = dht::global_partitioner();
|
||||||
std::transform(_updates.begin(), _updates.end(), std::back_inserter(mutations), [&, this] (auto&& m) {
|
std::transform(_updates.begin(), _updates.end(), std::back_inserter(mutations), [&, this] (auto&& m) {
|
||||||
return mutation(_view, partitioner.decorate_key(*_view, std::move(m.first)), std::move(m.second));
|
auto mut = mutation(_view, partitioner.decorate_key(*_view, std::move(m.first)), std::move(m.second));
|
||||||
|
return frozen_mutation_and_schema{freeze(mut), std::move(_view)};
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -443,7 +445,7 @@ void create_virtual_column(schema_builder& builder, const bytes& name, const dat
|
|||||||
// A map has keys and values. We don't need these values,
|
// A map has keys and values. We don't need these values,
|
||||||
// and can use empty values instead.
|
// and can use empty values instead.
|
||||||
auto mtype = dynamic_pointer_cast<const map_type_impl>(type);
|
auto mtype = dynamic_pointer_cast<const map_type_impl>(type);
|
||||||
builder.with_column(name, map_type_impl::get_instance(mtype->get_values_type(), empty_type, true), column_kind::regular_column, column_view_virtual::yes);
|
builder.with_column(name, map_type_impl::get_instance(mtype->get_keys_type(), empty_type, true), column_kind::regular_column, column_view_virtual::yes);
|
||||||
} else if (ctype->is_set()) {
|
} else if (ctype->is_set()) {
|
||||||
// A set's cell has nothing beyond the keys, so the
|
// A set's cell has nothing beyond the keys, so the
|
||||||
// virtual version of a set is, unfortunately, a complete
|
// virtual version of a set is, unfortunately, a complete
|
||||||
@@ -627,7 +629,7 @@ public:
|
|||||||
, _now(gc_clock::now()) {
|
, _now(gc_clock::now()) {
|
||||||
}
|
}
|
||||||
|
|
||||||
future<std::vector<mutation>> build();
|
future<std::vector<frozen_mutation_and_schema>> build();
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void generate_update(clustering_row&& update, stdx::optional<clustering_row>&& existing);
|
void generate_update(clustering_row&& update, stdx::optional<clustering_row>&& existing);
|
||||||
@@ -664,7 +666,7 @@ private:
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
future<std::vector<mutation>> view_update_builder::build() {
|
future<std::vector<frozen_mutation_and_schema>> view_update_builder::build() {
|
||||||
return advance_all().then([this] (auto&& ignored) {
|
return advance_all().then([this] (auto&& ignored) {
|
||||||
assert(_update && _update->is_partition_start());
|
assert(_update && _update->is_partition_start());
|
||||||
_key = std::move(std::move(_update)->as_partition_start().key().key());
|
_key = std::move(std::move(_update)->as_partition_start().key().key());
|
||||||
@@ -679,7 +681,7 @@ future<std::vector<mutation>> view_update_builder::build() {
|
|||||||
});
|
});
|
||||||
});
|
});
|
||||||
}).then([this] {
|
}).then([this] {
|
||||||
std::vector<mutation> mutations;
|
std::vector<frozen_mutation_and_schema> mutations;
|
||||||
for (auto&& update : _view_updates) {
|
for (auto&& update : _view_updates) {
|
||||||
std::move(update).move_to(mutations);
|
std::move(update).move_to(mutations);
|
||||||
}
|
}
|
||||||
@@ -779,6 +781,7 @@ future<stop_iteration> view_update_builder::on_results() {
|
|||||||
// If we have updates and it's a range tombstone, it removes nothing pre-existing, so we can ignore it
|
// If we have updates and it's a range tombstone, it removes nothing pre-existing, so we can ignore it
|
||||||
if (_update && !_update->is_end_of_partition()) {
|
if (_update && !_update->is_end_of_partition()) {
|
||||||
if (_update->is_clustering_row()) {
|
if (_update->is_clustering_row()) {
|
||||||
|
apply_tracked_tombstones(_update_tombstone_tracker, _update->as_mutable_clustering_row());
|
||||||
generate_update(std::move(*_update).as_clustering_row(), { });
|
generate_update(std::move(*_update).as_clustering_row(), { });
|
||||||
}
|
}
|
||||||
return advance_updates();
|
return advance_updates();
|
||||||
@@ -787,7 +790,7 @@ future<stop_iteration> view_update_builder::on_results() {
|
|||||||
return stop();
|
return stop();
|
||||||
}
|
}
|
||||||
|
|
||||||
future<std::vector<mutation>> generate_view_updates(
|
future<std::vector<frozen_mutation_and_schema>> generate_view_updates(
|
||||||
const schema_ptr& base,
|
const schema_ptr& base,
|
||||||
std::vector<view_ptr>&& views_to_update,
|
std::vector<view_ptr>&& views_to_update,
|
||||||
flat_mutation_reader&& updates,
|
flat_mutation_reader&& updates,
|
||||||
@@ -924,16 +927,35 @@ get_view_natural_endpoint(const sstring& keyspace_name,
|
|||||||
// to a modification of a single base partition, and apply them to the
|
// to a modification of a single base partition, and apply them to the
|
||||||
// appropriate paired replicas. This is done asynchronously - we do not wait
|
// appropriate paired replicas. This is done asynchronously - we do not wait
|
||||||
// for the writes to complete.
|
// for the writes to complete.
|
||||||
// FIXME: I dropped a lot of parameters the Cassandra version had,
|
future<> mutate_MV(
|
||||||
// we may need them back: writeCommitLog, baseComplete, queryStartNanoTime.
|
const dht::token& base_token,
|
||||||
future<> mutate_MV(const dht::token& base_token, std::vector<mutation> mutations, db::view::stats& stats)
|
std::vector<frozen_mutation_and_schema> view_updates,
|
||||||
|
db::view::stats& stats,
|
||||||
|
db::timeout_semaphore_units pending_view_updates)
|
||||||
{
|
{
|
||||||
auto fs = std::make_unique<std::vector<future<>>>();
|
auto fs = std::make_unique<std::vector<future<>>>();
|
||||||
for (auto& mut : mutations) {
|
fs->reserve(view_updates.size());
|
||||||
auto view_token = mut.token();
|
auto& partitioner = dht::global_partitioner();
|
||||||
auto keyspace_name = mut.schema()->ks_name();
|
for (frozen_mutation_and_schema& mut : view_updates) {
|
||||||
|
auto view_token = partitioner.get_token(*mut.s, mut.fm.key(*mut.s));
|
||||||
|
auto& keyspace_name = mut.s->ks_name();
|
||||||
auto paired_endpoint = get_view_natural_endpoint(keyspace_name, base_token, view_token);
|
auto paired_endpoint = get_view_natural_endpoint(keyspace_name, base_token, view_token);
|
||||||
auto pending_endpoints = service::get_local_storage_service().get_token_metadata().pending_endpoints_for(view_token, keyspace_name);
|
auto pending_endpoints = service::get_local_storage_service().get_token_metadata().pending_endpoints_for(view_token, keyspace_name);
|
||||||
|
auto maybe_account_failure = [&stats, units = pending_view_updates.split(mut.fm.representation().size())] (
|
||||||
|
future<>&& f,
|
||||||
|
gms::inet_address target,
|
||||||
|
bool is_local,
|
||||||
|
size_t remotes) {
|
||||||
|
if (f.failed()) {
|
||||||
|
stats.view_updates_failed_local += is_local;
|
||||||
|
stats.view_updates_failed_remote += remotes;
|
||||||
|
auto ep = f.get_exception();
|
||||||
|
vlogger.error("Error applying view update to {}: {}", target, ep);
|
||||||
|
return make_exception_future<>(std::move(ep));
|
||||||
|
} else {
|
||||||
|
return make_ready_future<>();
|
||||||
|
}
|
||||||
|
};
|
||||||
if (paired_endpoint) {
|
if (paired_endpoint) {
|
||||||
// When paired endpoint is the local node, we can just apply
|
// When paired endpoint is the local node, we can just apply
|
||||||
// the mutation locally, unless there are pending endpoints, in
|
// the mutation locally, unless there are pending endpoints, in
|
||||||
@@ -951,10 +973,16 @@ future<> mutate_MV(const dht::token& base_token, std::vector<mutation> mutations
|
|||||||
// do not wait for it to complete.
|
// do not wait for it to complete.
|
||||||
// Note also that mutate_locally(mut) copies mut (in
|
// Note also that mutate_locally(mut) copies mut (in
|
||||||
// frozen form) so don't need to increase its lifetime.
|
// frozen form) so don't need to increase its lifetime.
|
||||||
fs->push_back(service::get_local_storage_proxy().mutate_locally(mut).handle_exception([&stats] (auto ep) {
|
// send_to_endpoint() below updates statistics on pending
|
||||||
vlogger.error("Error applying local view update: {}", ep);
|
// writes but mutate_locally() doesn't, so we need to do that here.
|
||||||
stats.view_updates_failed_local++;
|
++stats.writes;
|
||||||
return make_exception_future<>(std::move(ep));
|
auto mut_ptr = std::make_unique<frozen_mutation>(std::move(mut.fm));
|
||||||
|
fs->push_back(service::get_local_storage_proxy().mutate_locally(mut.s, *mut_ptr).then_wrapped(
|
||||||
|
[&stats,
|
||||||
|
maybe_account_failure = std::move(maybe_account_failure),
|
||||||
|
mut_ptr = std::move(mut_ptr)] (future<>&& f) {
|
||||||
|
--stats.writes;
|
||||||
|
return maybe_account_failure(std::move(f), utils::fb_utilities::get_broadcast_address(), true, 0);
|
||||||
}));
|
}));
|
||||||
} else {
|
} else {
|
||||||
vlogger.debug("Sending view update to endpoint {}, with pending endpoints = {}", *paired_endpoint, pending_endpoints);
|
vlogger.debug("Sending view update to endpoint {}, with pending endpoints = {}", *paired_endpoint, pending_endpoints);
|
||||||
@@ -965,14 +993,17 @@ future<> mutate_MV(const dht::token& base_token, std::vector<mutation> mutations
|
|||||||
// to send the update there. Currently, we do this from *each* of
|
// to send the update there. Currently, we do this from *each* of
|
||||||
// the base replicas, but this is probably excessive - see
|
// the base replicas, but this is probably excessive - see
|
||||||
// See https://issues.apache.org/jira/browse/CASSANDRA-14262/
|
// See https://issues.apache.org/jira/browse/CASSANDRA-14262/
|
||||||
fs->push_back(service::get_local_storage_proxy().send_to_endpoint(std::move(mut), *paired_endpoint, std::move(pending_endpoints), db::write_type::VIEW, stats)
|
fs->push_back(service::get_local_storage_proxy().send_to_endpoint(
|
||||||
.handle_exception([paired_endpoint, is_endpoint_local, updates_pushed_remote, &stats] (auto ep) {
|
std::move(mut),
|
||||||
stats.view_updates_failed_local += is_endpoint_local;
|
*paired_endpoint,
|
||||||
stats.view_updates_failed_remote += updates_pushed_remote;
|
std::move(pending_endpoints),
|
||||||
vlogger.error("Error applying view update to {}: {}", *paired_endpoint, ep);
|
db::write_type::VIEW, stats).then_wrapped(
|
||||||
return make_exception_future<>(std::move(ep));
|
[paired_endpoint,
|
||||||
})
|
is_endpoint_local,
|
||||||
);
|
updates_pushed_remote,
|
||||||
|
maybe_account_failure = std::move(maybe_account_failure)] (future<>&& f) mutable {
|
||||||
|
return maybe_account_failure(std::move(f), std::move(*paired_endpoint), is_endpoint_local, updates_pushed_remote);
|
||||||
|
}));
|
||||||
}
|
}
|
||||||
} else if (!pending_endpoints.empty()) {
|
} else if (!pending_endpoints.empty()) {
|
||||||
// If there is no paired endpoint, it means there's a range movement going on (decommission or move),
|
// If there is no paired endpoint, it means there's a range movement going on (decommission or move),
|
||||||
@@ -992,10 +1023,11 @@ future<> mutate_MV(const dht::token& base_token, std::vector<mutation> mutations
|
|||||||
std::move(mut),
|
std::move(mut),
|
||||||
target,
|
target,
|
||||||
std::move(pending_endpoints),
|
std::move(pending_endpoints),
|
||||||
db::write_type::VIEW).handle_exception([target, updates_pushed_remote, &stats] (auto ep) {
|
db::write_type::VIEW).then_wrapped(
|
||||||
stats.view_updates_failed_remote += updates_pushed_remote;
|
[target,
|
||||||
vlogger.error("Error applying view update to {}: {}", target, ep);
|
updates_pushed_remote,
|
||||||
return make_exception_future<>(std::move(ep));
|
maybe_account_failure = std::move(maybe_account_failure)] (future<>&& f) {
|
||||||
|
return maybe_account_failure(std::move(f), std::move(target), false, updates_pushed_remote);
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1226,6 +1258,20 @@ future<> view_builder::calculate_shard_build_step(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// All shards need to arrive at the same decisions on whether or not to
|
||||||
|
// restart a view build at some common token (reshard), and which token
|
||||||
|
// to restart at. So we need to wait until all shards have read the view
|
||||||
|
// build statuses before they can all proceed to make the (same) decision.
|
||||||
|
// If we don't synchronize here, a fast shard may make a decision, start
|
||||||
|
// building and finish a build step - before the slowest shard even read
|
||||||
|
// the view build information.
|
||||||
|
container().invoke_on(0, [] (view_builder& builder) {
|
||||||
|
if (++builder._shards_finished_read == smp::count) {
|
||||||
|
builder._shards_finished_read_promise.set_value();
|
||||||
|
}
|
||||||
|
return builder._shards_finished_read_promise.get_shared_future();
|
||||||
|
}).get();
|
||||||
|
|
||||||
std::unordered_set<utils::UUID> loaded_views;
|
std::unordered_set<utils::UUID> loaded_views;
|
||||||
if (view_build_status_per_shard.size() != smp::count) {
|
if (view_build_status_per_shard.size() != smp::count) {
|
||||||
reshard(std::move(view_build_status_per_shard), loaded_views);
|
reshard(std::move(view_build_status_per_shard), loaded_views);
|
||||||
@@ -1419,7 +1465,16 @@ private:
|
|||||||
built_views _built_views;
|
built_views _built_views;
|
||||||
std::vector<view_ptr> _views_to_build;
|
std::vector<view_ptr> _views_to_build;
|
||||||
std::deque<mutation_fragment> _fragments;
|
std::deque<mutation_fragment> _fragments;
|
||||||
|
// The compact_for_query<> that feeds this consumer is already configured
|
||||||
|
// to feed us up to view_builder::batchsize (128) rows and not an entire
|
||||||
|
// partition. Still, if rows contain large blobs, saving 128 of them in
|
||||||
|
// _fragments may be too much. So we want to track _fragments' memory
|
||||||
|
// usage, and flush the _fragments if it has grown too large.
|
||||||
|
// Additionally, limiting _fragments' size also solves issue #4213:
|
||||||
|
// A single view mutation can be as large as the size of the base rows
|
||||||
|
// used to build it, and we cannot allow its serialized size to grow
|
||||||
|
// beyond our limit on mutation size (by default 32 MB).
|
||||||
|
size_t _fragments_memory_usage = 0;
|
||||||
public:
|
public:
|
||||||
consumer(view_builder& builder, build_step& step)
|
consumer(view_builder& builder, build_step& step)
|
||||||
: _builder(builder)
|
: _builder(builder)
|
||||||
@@ -1482,7 +1537,15 @@ public:
|
|||||||
return stop_iteration::yes;
|
return stop_iteration::yes;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
_fragments_memory_usage += cr.memory_usage(*_step.base->schema());
|
||||||
_fragments.push_back(std::move(cr));
|
_fragments.push_back(std::move(cr));
|
||||||
|
if (_fragments_memory_usage > 1024*1024) {
|
||||||
|
// Although we have not yet completed the batch of base rows that
|
||||||
|
// compact_for_query<> planned for us (view_builder::batchsize),
|
||||||
|
// we've still collected enough rows to reach sizeable memory use,
|
||||||
|
// so let's flush these rows now.
|
||||||
|
flush_fragments();
|
||||||
|
}
|
||||||
return stop_iteration::no;
|
return stop_iteration::no;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1490,7 +1553,7 @@ public:
|
|||||||
return stop_iteration::no;
|
return stop_iteration::no;
|
||||||
}
|
}
|
||||||
|
|
||||||
stop_iteration consume_end_of_partition() {
|
void flush_fragments() {
|
||||||
_builder._as.check();
|
_builder._as.check();
|
||||||
if (!_fragments.empty()) {
|
if (!_fragments.empty()) {
|
||||||
_fragments.push_front(partition_start(_step.current_key, tombstone()));
|
_fragments.push_front(partition_start(_step.current_key, tombstone()));
|
||||||
@@ -1499,7 +1562,12 @@ public:
|
|||||||
_step.current_token(),
|
_step.current_token(),
|
||||||
make_flat_mutation_reader_from_fragments(_step.base->schema(), std::move(_fragments))).get();
|
make_flat_mutation_reader_from_fragments(_step.base->schema(), std::move(_fragments))).get();
|
||||||
_fragments.clear();
|
_fragments.clear();
|
||||||
|
_fragments_memory_usage = 0;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
stop_iteration consume_end_of_partition() {
|
||||||
|
flush_fragments();
|
||||||
return stop_iteration(_step.build_status.empty());
|
return stop_iteration(_step.build_status.empty());
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1591,12 +1659,29 @@ future<> view_builder::maybe_mark_view_as_built(view_ptr view, dht::token next_t
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
future<> view_builder::wait_until_built(const sstring& ks_name, const sstring& view_name, lowres_clock::time_point timeout) {
|
future<> view_builder::wait_until_built(const sstring& ks_name, const sstring& view_name) {
|
||||||
return container().invoke_on(0, [ks_name, view_name, timeout] (view_builder& builder) {
|
return container().invoke_on(0, [ks_name, view_name] (view_builder& builder) {
|
||||||
auto v = std::pair(std::move(ks_name), std::move(view_name));
|
auto v = std::pair(std::move(ks_name), std::move(view_name));
|
||||||
return builder._build_notifiers[std::move(v)].get_shared_future(timeout);
|
return builder._build_notifiers[std::move(v)].get_shared_future();
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Stores `backlog` as the latest value for `shard`, then returns a node-wide
// backlog:
//  - if at least _interval has elapsed since _last_update, the maximum over
//    all per-shard slots is recomputed, cached in _max and returned;
//  - otherwise the larger of `backlog` and the cached _max is returned.
// All atomic accesses are relaxed.
update_backlog node_update_backlog::add_fetch(unsigned shard, update_backlog backlog) {
    _backlogs[shard].backlog.store(backlog, std::memory_order_relaxed);

    const auto now = clock::now();
    const bool refresh_due = now >= _last_update.load(std::memory_order_relaxed) + _interval;
    if (!refresh_due) {
        return std::max(backlog, _max.load(std::memory_order_relaxed));
    }

    _last_update.store(now, std::memory_order_relaxed);

    // Fold every shard's slot into a single maximum, starting from "no backlog".
    update_backlog new_max = update_backlog::no_backlog();
    for (const per_shard_backlog& slot : _backlogs) {
        new_max = std::max(new_max, slot.load());
    }

    _max.store(new_max, std::memory_order_relaxed);
    return new_max;
}
|
||||||
|
|
||||||
} // namespace view
|
} // namespace view
|
||||||
} // namespace db
|
} // namespace db
|
||||||
|
|||||||
@@ -30,6 +30,10 @@
|
|||||||
#include "flat_mutation_reader.hh"
|
#include "flat_mutation_reader.hh"
|
||||||
#include "stdx.hh"
|
#include "stdx.hh"
|
||||||
|
|
||||||
|
#include <seastar/core/semaphore.hh>
|
||||||
|
|
||||||
|
class frozen_mutation_and_schema;
|
||||||
|
|
||||||
namespace db {
|
namespace db {
|
||||||
|
|
||||||
namespace view {
|
namespace view {
|
||||||
@@ -90,7 +94,7 @@ bool matches_view_filter(const schema& base, const view_info& view, const partit
|
|||||||
|
|
||||||
bool clustering_prefix_matches(const schema& base, const partition_key& key, const clustering_key_prefix& ck);
|
bool clustering_prefix_matches(const schema& base, const partition_key& key, const clustering_key_prefix& ck);
|
||||||
|
|
||||||
future<std::vector<mutation>> generate_view_updates(
|
future<std::vector<frozen_mutation_and_schema>> generate_view_updates(
|
||||||
const schema_ptr& base,
|
const schema_ptr& base,
|
||||||
std::vector<view_ptr>&& views_to_update,
|
std::vector<view_ptr>&& views_to_update,
|
||||||
flat_mutation_reader&& updates,
|
flat_mutation_reader&& updates,
|
||||||
@@ -102,7 +106,11 @@ query::clustering_row_ranges calculate_affected_clustering_ranges(
|
|||||||
const mutation_partition& mp,
|
const mutation_partition& mp,
|
||||||
const std::vector<view_ptr>& views);
|
const std::vector<view_ptr>& views);
|
||||||
|
|
||||||
future<> mutate_MV(const dht::token& base_token, std::vector<mutation> mutations, db::view::stats& stats);
|
future<> mutate_MV(
|
||||||
|
const dht::token& base_token,
|
||||||
|
std::vector<frozen_mutation_and_schema> view_updates,
|
||||||
|
db::view::stats& stats,
|
||||||
|
db::timeout_semaphore_units pending_view_updates);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* create_virtual_column() adds a "virtual column" to a schema builder.
|
* create_virtual_column() adds a "virtual column" to a schema builder.
|
||||||
|
|||||||
@@ -151,6 +151,10 @@ class view_builder final : public service::migration_listener::only_view_notific
|
|||||||
future<> _started = make_ready_future<>();
|
future<> _started = make_ready_future<>();
|
||||||
// Used to coordinate between shards the conclusion of the build process for a particular view.
|
// Used to coordinate between shards the conclusion of the build process for a particular view.
|
||||||
std::unordered_set<utils::UUID> _built_views;
|
std::unordered_set<utils::UUID> _built_views;
|
||||||
|
// Counter and promise (both on shard 0 only!) allowing to wait for all
|
||||||
|
// shards to have read the view build statuses
|
||||||
|
unsigned _shards_finished_read = 0;
|
||||||
|
seastar::shared_promise<> _shards_finished_read_promise;
|
||||||
// Used for testing.
|
// Used for testing.
|
||||||
std::unordered_map<std::pair<sstring, sstring>, seastar::shared_promise<>, utils::tuple_hash> _build_notifiers;
|
std::unordered_map<std::pair<sstring, sstring>, seastar::shared_promise<>, utils::tuple_hash> _build_notifiers;
|
||||||
|
|
||||||
@@ -178,7 +182,7 @@ public:
|
|||||||
virtual void on_drop_view(const sstring& ks_name, const sstring& view_name) override;
|
virtual void on_drop_view(const sstring& ks_name, const sstring& view_name) override;
|
||||||
|
|
||||||
// For tests
|
// For tests
|
||||||
future<> wait_until_built(const sstring& ks_name, const sstring& view_name, lowres_clock::time_point timeout);
|
future<> wait_until_built(const sstring& ks_name, const sstring& view_name);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
build_step& get_or_create_build_step(utils::UUID);
|
build_step& get_or_create_build_step(utils::UUID);
|
||||||
|
|||||||
73
db/view/view_update_backlog.hh
Normal file
73
db/view/view_update_backlog.hh
Normal file
@@ -0,0 +1,73 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2018 ScyllaDB
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This file is part of Scylla.
|
||||||
|
*
|
||||||
|
* Scylla is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* Scylla is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <cstddef>
|
||||||
|
#include <limits>
|
||||||
|
|
||||||
|
namespace db::view {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The view update backlog represents the pending view data that a base replica
|
||||||
|
* maintains. It is the maximum of the memory backlog - how much memory pending
|
||||||
|
* view updates are consuming out of their allocated quota - and the disk
|
||||||
|
* backlog - how much view hints are consuming. The size of a backlog is relative
|
||||||
|
* to its maximum size.
|
||||||
|
*/
|
||||||
|
struct update_backlog {
    size_t current;
    size_t max;

    // Returns the backlog as a fraction of its maximum size (current / max).
    //
    // A zero `max` is handled explicitly instead of dividing by zero:
    //  - {0, 0} is reported as 0, i.e. the same as no_backlog(), rather than
    //    NaN, which would compare false against everything and break the
    //    ordering defined by the operators below;
    //  - {n > 0, 0} is reported as +infinity, so it sorts above any backlog
    //    with a non-zero maximum.
    float relative_size() const {
        if (max == 0) {
            return current == 0 ? 0.0f : std::numeric_limits<float>::infinity();
        }
        return static_cast<float>(current) / static_cast<float>(max);
    }

    // All comparisons are defined on relative_size(), so two backlogs with
    // different absolute numbers but the same ratio compare equal.
    friend bool operator==(const update_backlog& lhs, const update_backlog& rhs) {
        return lhs.relative_size() == rhs.relative_size();
    }

    friend bool operator<(const update_backlog& lhs, const update_backlog& rhs) {
        return lhs.relative_size() < rhs.relative_size();
    }

    friend bool operator!=(const update_backlog& lhs, const update_backlog& rhs) {
        return !(lhs == rhs);
    }

    friend bool operator<=(const update_backlog& lhs, const update_backlog& rhs) {
        return !(rhs < lhs);
    }

    friend bool operator>(const update_backlog& lhs, const update_backlog& rhs) {
        return rhs < lhs;
    }

    friend bool operator>=(const update_backlog& lhs, const update_backlog& rhs) {
        return !(lhs < rhs);
    }

    // The smallest possible backlog: nothing pending out of the largest
    // representable maximum, giving a relative size of 0.
    static update_backlog no_backlog() {
        return update_backlog{0, std::numeric_limits<size_t>::max()};
    }
};
|
||||||
|
|
||||||
|
}
|
||||||
68
db/view/view_update_from_staging_generator.cc
Normal file
68
db/view/view_update_from_staging_generator.cc
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2018 ScyllaDB
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This file is part of Scylla.
|
||||||
|
*
|
||||||
|
* Scylla is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* Scylla is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "view_update_from_staging_generator.hh"
|
||||||
|
|
||||||
|
namespace db::view {
|
||||||
|
|
||||||
|
// Launches the background seastar thread that drains the queue of registered
// staging sstables, running in the database's streaming scheduling group.
// The thread's future is kept in _started; the returned future is already
// resolved and does not wait for the thread.
future<> view_update_from_staging_generator::start() {
    thread_attributes thread_attrs;
    thread_attrs.sched_group = _db.get_streaming_scheduling_group();
    _started = seastar::async(std::move(thread_attrs), [this]() mutable {
        while (!_as.abort_requested()) {
            // Nothing queued: sleep until register_staging_sstable() or stop() signals.
            if (_sstables_with_tables.empty()) {
                _pending_sstables.wait().get();
            }
            while (!_sstables_with_tables.empty()) {
                auto& pending = _sstables_with_tables.front();
                schema_ptr base_schema = pending.t->schema();
                flat_mutation_reader reader = pending.sst->read_rows_flat(base_schema);
                const auto outcome = reader.consume_in_thread(
                        view_updating_consumer(base_schema, _proxy, pending.sst, _as),
                        db::no_timeout);
                if (outcome == stop_iteration::yes) {
                    // The consumer asked to stop (presumably because an abort
                    // was requested - TODO confirm); leave the sstable queued
                    // and let the outer loop re-check the abort source.
                    break;
                }
                pending.t->move_sstable_from_staging_in_thread(pending.sst);
                // Hand back the unit taken by register_staging_sstable().
                _registration_sem.signal();
                _sstables_with_tables.pop_front();
            }
        }
    });
    return make_ready_future<>();
}
|
||||||
|
|
||||||
|
// Requests an abort, wakes the background thread in case it is waiting for
// work, and waits for it to finish.
//
// The registration semaphore is marked broken once the background thread has
// completed, whether it finished normally or with an exception: finally() is
// used rather than then() so that callers blocked in
// register_staging_sstable() are released even if the thread failed. Any
// failure of the background thread is still propagated through the returned
// future.
future<> view_update_from_staging_generator::stop() {
    _as.request_abort();
    _pending_sstables.signal();
    return std::move(_started).finally([this] {
        _registration_sem.broken();
    });
}
|
||||||
|
|
||||||
|
// Queues a staging sstable, together with its table, for view update
// generation and wakes the background thread.
//
// The returned future resolves once one unit of the registration semaphore
// has been acquired. If an abort has already been requested, nothing is
// queued and a ready future is returned.
future<> view_update_from_staging_generator::register_staging_sstable(sstables::shared_sstable sst, lw_shared_ptr<table> table) {
    if (!_as.abort_requested()) {
        _sstables_with_tables.emplace_back(std::move(sst), std::move(table));
        _pending_sstables.signal();
        return _registration_sem.wait(1);
    }
    return make_ready_future<>();
}
|
||||||
|
|
||||||
|
}
|
||||||
56
db/view/view_update_from_staging_generator.hh
Normal file
56
db/view/view_update_from_staging_generator.hh
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2018 ScyllaDB
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This file is part of Scylla.
|
||||||
|
*
|
||||||
|
* Scylla is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* Scylla is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "database.hh"
|
||||||
|
#include "sstables/sstables.hh"
|
||||||
|
#include "db/view/view_updating_consumer.hh"
|
||||||
|
|
||||||
|
#include <seastar/core/abort_source.hh>
|
||||||
|
#include <seastar/core/condition-variable.hh>
|
||||||
|
#include <seastar/core/semaphore.hh>
|
||||||
|
|
||||||
|
namespace db::view {
|
||||||
|
|
||||||
|
class view_update_from_staging_generator {
|
||||||
|
static constexpr size_t registration_queue_size = 5;
|
||||||
|
database& _db;
|
||||||
|
service::storage_proxy& _proxy;
|
||||||
|
seastar::abort_source _as;
|
||||||
|
future<> _started = make_ready_future<>();
|
||||||
|
seastar::condition_variable _pending_sstables;
|
||||||
|
semaphore _registration_sem{registration_queue_size};
|
||||||
|
struct sstable_with_table {
|
||||||
|
sstables::shared_sstable sst;
|
||||||
|
lw_shared_ptr<table> t;
|
||||||
|
sstable_with_table(sstables::shared_sstable sst, lw_shared_ptr<table> t) : sst(sst), t(t) { }
|
||||||
|
};
|
||||||
|
std::deque<sstable_with_table> _sstables_with_tables;
|
||||||
|
public:
|
||||||
|
view_update_from_staging_generator(database& db, service::storage_proxy& proxy) : _db(db), _proxy(proxy) { }
|
||||||
|
|
||||||
|
future<> start();
|
||||||
|
future<> stop();
|
||||||
|
future<> register_staging_sstable(sstables::shared_sstable sst, lw_shared_ptr<table> table);
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
||||||
92
db/view/view_updating_consumer.hh
Normal file
92
db/view/view_updating_consumer.hh
Normal file
@@ -0,0 +1,92 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2018 ScyllaDB
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This file is part of Scylla.
|
||||||
|
*
|
||||||
|
* Scylla is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* Scylla is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "service/storage_proxy.hh"
|
||||||
|
#include "dht/i_partitioner.hh"
|
||||||
|
#include "schema.hh"
|
||||||
|
#include "mutation_fragment.hh"
|
||||||
|
#include "sstables/shared_sstable.hh"
|
||||||
|
|
||||||
|
namespace db::view {
|
||||||
|
|
||||||
|
/*
|
||||||
|
* A consumer that pushes materialized view updates for each consumed mutation.
|
||||||
|
* It is expected to be run in seastar::async threaded context through consume_in_thread()
|
||||||
|
*/
|
||||||
|
class view_updating_consumer {
|
||||||
|
schema_ptr _schema;
|
||||||
|
lw_shared_ptr<table> _table;
|
||||||
|
sstables::shared_sstable _excluded_sstable;
|
||||||
|
const seastar::abort_source& _as;
|
||||||
|
std::optional<mutation> _m;
|
||||||
|
public:
|
||||||
|
view_updating_consumer(schema_ptr schema, service::storage_proxy& proxy, sstables::shared_sstable excluded_sstable, const seastar::abort_source& as)
|
||||||
|
: _schema(std::move(schema))
|
||||||
|
, _table(proxy.get_db().local().find_column_family(_schema->id()).shared_from_this())
|
||||||
|
, _excluded_sstable(excluded_sstable)
|
||||||
|
, _as(as)
|
||||||
|
, _m()
|
||||||
|
{ }
|
||||||
|
|
||||||
|
void consume_new_partition(const dht::decorated_key& dk) {
|
||||||
|
_m = mutation(_schema, dk, mutation_partition(_schema));
|
||||||
|
}
|
||||||
|
|
||||||
|
void consume(tombstone t) {
|
||||||
|
_m->partition().apply(std::move(t));
|
||||||
|
}
|
||||||
|
|
||||||
|
stop_iteration consume(static_row&& sr) {
|
||||||
|
if (_as.abort_requested()) {
|
||||||
|
return stop_iteration::yes;
|
||||||
|
}
|
||||||
|
_m->partition().apply(*_schema, std::move(sr));
|
||||||
|
return stop_iteration::no;
|
||||||
|
}
|
||||||
|
|
||||||
|
stop_iteration consume(clustering_row&& cr) {
|
||||||
|
if (_as.abort_requested()) {
|
||||||
|
return stop_iteration::yes;
|
||||||
|
}
|
||||||
|
_m->partition().apply(*_schema, std::move(cr));
|
||||||
|
return stop_iteration::no;
|
||||||
|
}
|
||||||
|
|
||||||
|
stop_iteration consume(range_tombstone&& rt) {
|
||||||
|
if (_as.abort_requested()) {
|
||||||
|
return stop_iteration::yes;
|
||||||
|
}
|
||||||
|
_m->partition().apply(*_schema, std::move(rt));
|
||||||
|
return stop_iteration::no;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Expected to be run in seastar::async threaded context (consume_in_thread())
|
||||||
|
stop_iteration consume_end_of_partition();
|
||||||
|
|
||||||
|
stop_iteration consume_end_of_stream() {
|
||||||
|
return stop_iteration(_as.abort_requested());
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
@@ -49,22 +49,24 @@ namespace dht {
|
|||||||
future<> boot_strapper::bootstrap() {
|
future<> boot_strapper::bootstrap() {
|
||||||
blogger.debug("Beginning bootstrap process: sorted_tokens={}", _token_metadata.sorted_tokens());
|
blogger.debug("Beginning bootstrap process: sorted_tokens={}", _token_metadata.sorted_tokens());
|
||||||
|
|
||||||
auto streamer = make_lw_shared<range_streamer>(_db, _token_metadata, _tokens, _address, "Bootstrap");
|
auto streamer = make_lw_shared<range_streamer>(_db, _token_metadata, _tokens, _address, "Bootstrap", streaming::stream_reason::bootstrap);
|
||||||
streamer->add_source_filter(std::make_unique<range_streamer::failure_detector_source_filter>(gms::get_local_failure_detector()));
|
streamer->add_source_filter(std::make_unique<range_streamer::failure_detector_source_filter>(gms::get_local_failure_detector()));
|
||||||
for (const auto& keyspace_name : _db.local().get_non_system_keyspaces()) {
|
auto keyspaces = make_lw_shared<std::vector<sstring>>(_db.local().get_non_system_keyspaces());
|
||||||
|
return do_for_each(*keyspaces, [this, keyspaces, streamer] (sstring& keyspace_name) {
|
||||||
auto& ks = _db.local().find_keyspace(keyspace_name);
|
auto& ks = _db.local().find_keyspace(keyspace_name);
|
||||||
auto& strategy = ks.get_replication_strategy();
|
auto& strategy = ks.get_replication_strategy();
|
||||||
dht::token_range_vector ranges = strategy.get_pending_address_ranges(_token_metadata, _tokens, _address);
|
dht::token_range_vector ranges = strategy.get_pending_address_ranges(_token_metadata, _tokens, _address);
|
||||||
blogger.debug("Will stream keyspace={}, ranges={}", keyspace_name, ranges);
|
blogger.debug("Will stream keyspace={}, ranges={}", keyspace_name, ranges);
|
||||||
streamer->add_ranges(keyspace_name, ranges);
|
return streamer->add_ranges(keyspace_name, ranges);
|
||||||
}
|
}).then([this, streamer] {
|
||||||
|
return streamer->stream_async().then([streamer] () {
|
||||||
return streamer->stream_async().then([streamer] () {
|
service::get_local_storage_service().finish_bootstrapping();
|
||||||
service::get_local_storage_service().finish_bootstrapping();
|
}).handle_exception([streamer] (std::exception_ptr eptr) {
|
||||||
}).handle_exception([streamer] (std::exception_ptr eptr) {
|
blogger.warn("Error during bootstrap: {}", eptr);
|
||||||
blogger.warn("Error during bootstrap: {}", eptr);
|
return make_exception_future<>(std::move(eptr));
|
||||||
return make_exception_future<>(std::move(eptr));
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
std::unordered_set<token> boot_strapper::get_bootstrap_tokens(token_metadata metadata, database& db) {
|
std::unordered_set<token> boot_strapper::get_bootstrap_tokens(token_metadata metadata, database& db) {
|
||||||
|
|||||||
@@ -114,6 +114,9 @@ range_streamer::get_all_ranges_with_sources_for(const sstring& keyspace_name, dh
|
|||||||
for (auto& desired_range : desired_ranges) {
|
for (auto& desired_range : desired_ranges) {
|
||||||
auto found = false;
|
auto found = false;
|
||||||
for (auto& x : range_addresses) {
|
for (auto& x : range_addresses) {
|
||||||
|
if (need_preempt()) {
|
||||||
|
seastar::thread::yield();
|
||||||
|
}
|
||||||
const range<token>& src_range = x.first;
|
const range<token>& src_range = x.first;
|
||||||
if (src_range.contains(desired_range, dht::tri_compare)) {
|
if (src_range.contains(desired_range, dht::tri_compare)) {
|
||||||
std::vector<inet_address>& addresses = x.second;
|
std::vector<inet_address>& addresses = x.second;
|
||||||
@@ -157,6 +160,9 @@ range_streamer::get_all_ranges_with_strict_sources_for(const sstring& keyspace_n
|
|||||||
for (auto& desired_range : desired_ranges) {
|
for (auto& desired_range : desired_ranges) {
|
||||||
for (auto& x : range_addresses) {
|
for (auto& x : range_addresses) {
|
||||||
const range<token>& src_range = x.first;
|
const range<token>& src_range = x.first;
|
||||||
|
if (need_preempt()) {
|
||||||
|
seastar::thread::yield();
|
||||||
|
}
|
||||||
if (src_range.contains(desired_range, dht::tri_compare)) {
|
if (src_range.contains(desired_range, dht::tri_compare)) {
|
||||||
std::vector<inet_address> old_endpoints(x.second.begin(), x.second.end());
|
std::vector<inet_address> old_endpoints(x.second.begin(), x.second.end());
|
||||||
auto it = pending_range_addresses.find(desired_range);
|
auto it = pending_range_addresses.find(desired_range);
|
||||||
@@ -226,7 +232,8 @@ void range_streamer::add_rx_ranges(const sstring& keyspace_name, std::unordered_
|
|||||||
}
|
}
|
||||||
|
|
||||||
// TODO: This is the legacy range_streamer interface, it is add_rx_ranges which adds rx ranges.
|
// TODO: This is the legacy range_streamer interface, it is add_rx_ranges which adds rx ranges.
|
||||||
void range_streamer::add_ranges(const sstring& keyspace_name, dht::token_range_vector ranges) {
|
future<> range_streamer::add_ranges(const sstring& keyspace_name, dht::token_range_vector ranges) {
|
||||||
|
return seastar::async([this, keyspace_name, ranges= std::move(ranges)] () mutable {
|
||||||
if (_nr_tx_added) {
|
if (_nr_tx_added) {
|
||||||
throw std::runtime_error("Mixed sending and receiving is not supported");
|
throw std::runtime_error("Mixed sending and receiving is not supported");
|
||||||
}
|
}
|
||||||
@@ -249,6 +256,7 @@ void range_streamer::add_ranges(const sstring& keyspace_name, dht::token_range_v
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
_to_stream.emplace(keyspace_name, std::move(range_fetch_map));
|
_to_stream.emplace(keyspace_name, std::move(range_fetch_map));
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
future<> range_streamer::stream_async() {
|
future<> range_streamer::stream_async() {
|
||||||
@@ -294,7 +302,7 @@ future<> range_streamer::do_stream_async() {
|
|||||||
size_t nr_ranges_per_stream_plan = nr_ranges_total / 10;
|
size_t nr_ranges_per_stream_plan = nr_ranges_total / 10;
|
||||||
dht::token_range_vector ranges_to_stream;
|
dht::token_range_vector ranges_to_stream;
|
||||||
auto do_streaming = [&] {
|
auto do_streaming = [&] {
|
||||||
auto sp = stream_plan(sprint("%s-%s-index-%d", description, keyspace, sp_index++));
|
auto sp = stream_plan(sprint("%s-%s-index-%d", description, keyspace, sp_index++), _reason);
|
||||||
logger.info("{} with {} for keyspace={}, {} out of {} ranges: ranges = {}",
|
logger.info("{} with {} for keyspace={}, {} out of {} ranges: ranges = {}",
|
||||||
description, source, keyspace, nr_ranges_streamed, nr_ranges_total, ranges_to_stream.size());
|
description, source, keyspace, nr_ranges_streamed, nr_ranges_total, ranges_to_stream.size());
|
||||||
if (_nr_rx_added) {
|
if (_nr_rx_added) {
|
||||||
|
|||||||
@@ -42,6 +42,7 @@
|
|||||||
#include "locator/snitch_base.hh"
|
#include "locator/snitch_base.hh"
|
||||||
#include "streaming/stream_plan.hh"
|
#include "streaming/stream_plan.hh"
|
||||||
#include "streaming/stream_state.hh"
|
#include "streaming/stream_state.hh"
|
||||||
|
#include "streaming/stream_reason.hh"
|
||||||
#include "gms/inet_address.hh"
|
#include "gms/inet_address.hh"
|
||||||
#include "gms/i_failure_detector.hh"
|
#include "gms/i_failure_detector.hh"
|
||||||
#include "range.hh"
|
#include "range.hh"
|
||||||
@@ -101,24 +102,25 @@ public:
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
range_streamer(distributed<database>& db, token_metadata& tm, std::unordered_set<token> tokens, inet_address address, sstring description)
|
range_streamer(distributed<database>& db, token_metadata& tm, std::unordered_set<token> tokens, inet_address address, sstring description, streaming::stream_reason reason)
|
||||||
: _db(db)
|
: _db(db)
|
||||||
, _metadata(tm)
|
, _metadata(tm)
|
||||||
, _tokens(std::move(tokens))
|
, _tokens(std::move(tokens))
|
||||||
, _address(address)
|
, _address(address)
|
||||||
, _description(std::move(description))
|
, _description(std::move(description))
|
||||||
|
, _reason(reason)
|
||||||
, _stream_plan(_description) {
|
, _stream_plan(_description) {
|
||||||
}
|
}
|
||||||
|
|
||||||
range_streamer(distributed<database>& db, token_metadata& tm, inet_address address, sstring description)
|
range_streamer(distributed<database>& db, token_metadata& tm, inet_address address, sstring description, streaming::stream_reason reason)
|
||||||
: range_streamer(db, tm, std::unordered_set<token>(), address, description) {
|
: range_streamer(db, tm, std::unordered_set<token>(), address, description, reason) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void add_source_filter(std::unique_ptr<i_source_filter> filter) {
|
void add_source_filter(std::unique_ptr<i_source_filter> filter) {
|
||||||
_source_filters.emplace(std::move(filter));
|
_source_filters.emplace(std::move(filter));
|
||||||
}
|
}
|
||||||
|
|
||||||
void add_ranges(const sstring& keyspace_name, dht::token_range_vector ranges);
|
future<> add_ranges(const sstring& keyspace_name, dht::token_range_vector ranges);
|
||||||
void add_tx_ranges(const sstring& keyspace_name, std::unordered_map<inet_address, dht::token_range_vector> ranges_per_endpoint);
|
void add_tx_ranges(const sstring& keyspace_name, std::unordered_map<inet_address, dht::token_range_vector> ranges_per_endpoint);
|
||||||
void add_rx_ranges(const sstring& keyspace_name, std::unordered_map<inet_address, dht::token_range_vector> ranges_per_endpoint);
|
void add_rx_ranges(const sstring& keyspace_name, std::unordered_map<inet_address, dht::token_range_vector> ranges_per_endpoint);
|
||||||
private:
|
private:
|
||||||
@@ -166,6 +168,7 @@ private:
|
|||||||
std::unordered_set<token> _tokens;
|
std::unordered_set<token> _tokens;
|
||||||
inet_address _address;
|
inet_address _address;
|
||||||
sstring _description;
|
sstring _description;
|
||||||
|
streaming::stream_reason _reason;
|
||||||
std::unordered_multimap<sstring, std::unordered_map<inet_address, dht::token_range_vector>> _to_stream;
|
std::unordered_multimap<sstring, std::unordered_map<inet_address, dht::token_range_vector>> _to_stream;
|
||||||
std::unordered_set<std::unique_ptr<i_source_filter>> _source_filters;
|
std::unordered_set<std::unique_ptr<i_source_filter>> _source_filters;
|
||||||
stream_plan _stream_plan;
|
stream_plan _stream_plan;
|
||||||
|
|||||||
2
dist/ami/build_ami.sh
vendored
2
dist/ami/build_ami.sh
vendored
@@ -78,7 +78,7 @@ if [ $LOCALRPM -eq 1 ]; then
|
|||||||
fi
|
fi
|
||||||
if [ ! -f dist/ami/files/scylla-jmx.noarch.rpm ]; then
|
if [ ! -f dist/ami/files/scylla-jmx.noarch.rpm ]; then
|
||||||
cd build
|
cd build
|
||||||
git clone --depth 1 https://github.com/scylladb/scylla-jmx.git
|
git clone -b branch-3.0 --depth 1 https://github.com/scylladb/scylla-jmx.git
|
||||||
cd scylla-jmx
|
cd scylla-jmx
|
||||||
dist/redhat/build_rpm.sh --target epel-7-x86_64
|
dist/redhat/build_rpm.sh --target epel-7-x86_64
|
||||||
cd ../..
|
cd ../..
|
||||||
|
|||||||
2
dist/ami/scylla.json
vendored
2
dist/ami/scylla.json
vendored
@@ -68,7 +68,7 @@
|
|||||||
"type": "shell",
|
"type": "shell",
|
||||||
"inline": [
|
"inline": [
|
||||||
"sudo yum install -y epel-release",
|
"sudo yum install -y epel-release",
|
||||||
"sudo yum install -y python34",
|
"sudo yum install -y python36",
|
||||||
"sudo /home/{{user `ssh_username`}}/scylla_install_ami {{ user `install_args` }}"
|
"sudo /home/{{user `ssh_username`}}/scylla_install_ami {{ user `install_args` }}"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|||||||
2
dist/common/scripts/node_exporter_install
vendored
2
dist/common/scripts/node_exporter_install
vendored
@@ -25,7 +25,7 @@ import tempfile
|
|||||||
import tarfile
|
import tarfile
|
||||||
from scylla_util import *
|
from scylla_util import *
|
||||||
|
|
||||||
VERSION='0.14.0'
|
VERSION='0.17.0'
|
||||||
INSTALL_DIR='/usr/lib/scylla/Prometheus/node_exporter'
|
INSTALL_DIR='/usr/lib/scylla/Prometheus/node_exporter'
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|||||||
7
dist/common/scripts/scylla_prepare
vendored
7
dist/common/scripts/scylla_prepare
vendored
@@ -62,10 +62,9 @@ if __name__ == '__main__':
|
|||||||
run('hugeadm --create-mounts')
|
run('hugeadm --create-mounts')
|
||||||
fi
|
fi
|
||||||
else:
|
else:
|
||||||
set_nic = cfg.get('SET_NIC')
|
set_nic_and_disks = get_set_nic_and_disks_config_value(cfg)
|
||||||
ifname = cfg.get('IFNAME')
|
ifname = cfg.get('IFNAME')
|
||||||
if set_nic == 'yes':
|
if set_nic_and_disks == 'yes':
|
||||||
create_perftune_conf(ifname)
|
create_perftune_conf(ifname)
|
||||||
run('/usr/lib/scylla/posix_net_conf.sh {IFNAME} --options-file /etc/scylla.d/perftune.yaml'.format(IFNAME=ifname))
|
run("{} --options-file /etc/scylla.d/perftune.yaml".format(perftune_base_command()))
|
||||||
|
|
||||||
run('/usr/lib/scylla/scylla-blocktune')
|
|
||||||
|
|||||||
12
dist/common/scripts/scylla_setup
vendored
12
dist/common/scripts/scylla_setup
vendored
@@ -122,8 +122,8 @@ if __name__ == '__main__':
|
|||||||
help='specify NTP domain')
|
help='specify NTP domain')
|
||||||
parser.add_argument('--ami', action='store_true', default=False,
|
parser.add_argument('--ami', action='store_true', default=False,
|
||||||
help='setup AMI instance')
|
help='setup AMI instance')
|
||||||
parser.add_argument('--setup-nic', action='store_true', default=False,
|
parser.add_argument('--setup-nic-and-disks', action='store_true', default=False,
|
||||||
help='optimize NIC queue')
|
help='optimize NIC and disks')
|
||||||
parser.add_argument('--developer-mode', action='store_true', default=False,
|
parser.add_argument('--developer-mode', action='store_true', default=False,
|
||||||
help='enable developer mode')
|
help='enable developer mode')
|
||||||
parser.add_argument('--no-ec2-check', action='store_true', default=False,
|
parser.add_argument('--no-ec2-check', action='store_true', default=False,
|
||||||
@@ -173,7 +173,7 @@ if __name__ == '__main__':
|
|||||||
|
|
||||||
disks = args.disks
|
disks = args.disks
|
||||||
nic = args.nic
|
nic = args.nic
|
||||||
set_nic = args.setup_nic
|
set_nic_and_disks = args.setup_nic_and_disks
|
||||||
ec2_check = not args.no_ec2_check
|
ec2_check = not args.no_ec2_check
|
||||||
kernel_check = not args.no_kernel_check
|
kernel_check = not args.no_kernel_check
|
||||||
verify_package = not args.no_verify_package
|
verify_package = not args.no_verify_package
|
||||||
@@ -336,11 +336,11 @@ if __name__ == '__main__':
|
|||||||
if interactive:
|
if interactive:
|
||||||
sysconfig_setup = interactive_ask_service('Do you want to setup a system-wide customized configuration for Scylla?', 'Yes - setup the sysconfig file. No - skips this step.', 'yes')
|
sysconfig_setup = interactive_ask_service('Do you want to setup a system-wide customized configuration for Scylla?', 'Yes - setup the sysconfig file. No - skips this step.', 'yes')
|
||||||
if sysconfig_setup:
|
if sysconfig_setup:
|
||||||
nic = interactive_choose_nic()
|
|
||||||
if interactive:
|
if interactive:
|
||||||
set_nic = interactive_ask_service('Do you want to enable Network Interface Card (NIC) optimization?', 'Yes - optimize the NIC queue settings. Selecting Yes greatly improves performance. No - skip this step.', 'yes')
|
nic = interactive_choose_nic()
|
||||||
|
set_nic_and_disks = interactive_ask_service('Do you want to enable Network Interface Card (NIC) and disk(s) optimization?', 'Yes - optimize the NIC queue and disks settings. Selecting Yes greatly improves performance. No - skip this step.', 'yes')
|
||||||
if sysconfig_setup:
|
if sysconfig_setup:
|
||||||
setup_args = '--setup-nic' if set_nic else ''
|
setup_args = '--setup-nic-and-disks' if set_nic_and_disks else ''
|
||||||
run_setup_script('NIC queue', '/usr/lib/scylla/scylla_sysconfig_setup --nic {nic} {setup_args}'.format(nic=nic, setup_args=setup_args))
|
run_setup_script('NIC queue', '/usr/lib/scylla/scylla_sysconfig_setup --nic {nic} {setup_args}'.format(nic=nic, setup_args=setup_args))
|
||||||
|
|
||||||
if interactive:
|
if interactive:
|
||||||
|
|||||||
19
dist/common/scripts/scylla_sysconfig_setup
vendored
19
dist/common/scripts/scylla_sysconfig_setup
vendored
@@ -40,7 +40,7 @@ if __name__ == '__main__':
|
|||||||
cfg = sysconfig_parser('/etc/sysconfig/scylla-server')
|
cfg = sysconfig_parser('/etc/sysconfig/scylla-server')
|
||||||
else:
|
else:
|
||||||
cfg = sysconfig_parser('/etc/default/scylla-server')
|
cfg = sysconfig_parser('/etc/default/scylla-server')
|
||||||
set_nic = str2bool(cfg.get('SET_NIC'))
|
set_nic_and_disks = str2bool(get_set_nic_and_disks_config_value(cfg))
|
||||||
ami = str2bool(cfg.get('AMI'))
|
ami = str2bool(cfg.get('AMI'))
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(description='Setting parameters on Scylla sysconfig file.')
|
parser = argparse.ArgumentParser(description='Setting parameters on Scylla sysconfig file.')
|
||||||
@@ -58,8 +58,8 @@ if __name__ == '__main__':
|
|||||||
help='scylla home directory')
|
help='scylla home directory')
|
||||||
parser.add_argument('--confdir',
|
parser.add_argument('--confdir',
|
||||||
help='scylla config directory')
|
help='scylla config directory')
|
||||||
parser.add_argument('--setup-nic', action='store_true', default=set_nic,
|
parser.add_argument('--setup-nic-and-disks', action='store_true', default=set_nic_and_disks,
|
||||||
help='setup NIC\'s interrupts, RPS, XPS')
|
help='setup NIC\'s and disks\' interrupts, RPS, XPS, nomerges and I/O scheduler')
|
||||||
parser.add_argument('--ami', action='store_true', default=ami,
|
parser.add_argument('--ami', action='store_true', default=ami,
|
||||||
help='AMI instance mode')
|
help='AMI instance mode')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
@@ -71,8 +71,8 @@ if __name__ == '__main__':
|
|||||||
ifname = args.nic if args.nic else cfg.get('IFNAME')
|
ifname = args.nic if args.nic else cfg.get('IFNAME')
|
||||||
network_mode = args.mode if args.mode else cfg.get('NETWORK_MODE')
|
network_mode = args.mode if args.mode else cfg.get('NETWORK_MODE')
|
||||||
|
|
||||||
if args.setup_nic:
|
if args.setup_nic_and_disks:
|
||||||
rps_cpus = out('/usr/lib/scylla/posix_net_conf.sh --cpu-mask {}'.format(ifname))
|
rps_cpus = out('{} --tune net --nic {} --get-cpu-mask'.format(perftune_base_command(), ifname))
|
||||||
if len(rps_cpus) > 0:
|
if len(rps_cpus) > 0:
|
||||||
cpuset = hex2list(rps_cpus)
|
cpuset = hex2list(rps_cpus)
|
||||||
run('/usr/lib/scylla/scylla_cpuset_setup --cpuset {}'.format(cpuset))
|
run('/usr/lib/scylla/scylla_cpuset_setup --cpuset {}'.format(cpuset))
|
||||||
@@ -104,8 +104,13 @@ if __name__ == '__main__':
|
|||||||
cfg.set('SCYLLA_HOME', args.homedir)
|
cfg.set('SCYLLA_HOME', args.homedir)
|
||||||
if args.confdir:
|
if args.confdir:
|
||||||
cfg.set('SCYLLA_CONF', args.confdir)
|
cfg.set('SCYLLA_CONF', args.confdir)
|
||||||
if str2bool(cfg.get('SET_NIC')) != args.setup_nic:
|
|
||||||
cfg.set('SET_NIC', bool2str(args.setup_nic))
|
if str2bool(get_set_nic_and_disks_config_value(cfg)) != args.setup_nic_and_disks:
|
||||||
|
if cfg.has_option('SET_NIC'):
|
||||||
|
cfg.set('SET_NIC', bool2str(args.setup_nic_and_disks))
|
||||||
|
else:
|
||||||
|
cfg.set('SET_NIC_AND_DISKS', bool2str(args.setup_nic_and_disks))
|
||||||
|
|
||||||
if str2bool(cfg.get('AMI')) != args.ami:
|
if str2bool(cfg.get('AMI')) != args.ami:
|
||||||
cfg.set('AMI', bool2str(args.ami))
|
cfg.set('AMI', bool2str(args.ami))
|
||||||
cfg.commit()
|
cfg.commit()
|
||||||
|
|||||||
58
dist/common/scripts/scylla_util.py
vendored
58
dist/common/scripts/scylla_util.py
vendored
@@ -28,6 +28,7 @@ import time
|
|||||||
import urllib.error
|
import urllib.error
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
import urllib.request
|
import urllib.request
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
|
||||||
def curl(url, byte=False):
|
def curl(url, byte=False):
|
||||||
@@ -384,6 +385,37 @@ def get_mode_cpuset(nic, mode):
|
|||||||
except subprocess.CalledProcessError:
|
except subprocess.CalledProcessError:
|
||||||
return '-1'
|
return '-1'
|
||||||
|
|
||||||
|
def get_scylla_dirs():
    """
    Returns a list of scylla directories configured in /etc/scylla/scylla.yaml.
    Verifies that mandatory parameters are set.

    The result holds every 'data_file_directories' entry, followed by
    'commitlog_directory' and then, when they are set, 'hints_directory'
    and 'view_hints_directory'. None entries are dropped.

    :return list of configured directory paths
    :except Exception if 'data_file_directories' or 'commitlog_directory' is missing or empty
    """
    scylla_yaml_name = '/etc/scylla/scylla.yaml'
    # Close the file deterministically instead of leaking the handle.
    # safe_load is used because the file is plain configuration: only standard
    # YAML types are needed, and yaml.load() without an explicit Loader is
    # deprecated in newer PyYAML releases.
    with open(scylla_yaml_name) as f:
        # An empty file parses to None; treat it as "nothing configured" so the
        # checks below report the missing option instead of raising TypeError.
        y = yaml.safe_load(f) or {}

    # Check that mandatory fields are set
    data_dirs = y.get('data_file_directories')
    if not data_dirs or not " ".join(d for d in data_dirs if d).strip():
        raise Exception("{}: at least one directory has to be set in 'data_file_directories'".format(scylla_yaml_name))
    if not y.get('commitlog_directory'):
        raise Exception("{}: 'commitlog_directory' has to be set".format(scylla_yaml_name))

    dirs = []
    dirs.extend(data_dirs)
    dirs.append(y['commitlog_directory'])

    # Optional directories are only reported when configured
    for optional_key in ('hints_directory', 'view_hints_directory'):
        if y.get(optional_key):
            dirs.append(y[optional_key])

    return [d for d in dirs if d is not None]
|
||||||
|
|
||||||
|
def perftune_base_command():
    """
    Build the common prefix of a perftune.py command line that tunes the disks
    backing every directory returned by get_scylla_dirs().

    :return command string of the form
            '/usr/lib/scylla/perftune.py --tune disks --dir <d1> --dir <d2> ...'
    """
    import shlex

    # The result is a single command string that callers extend and execute, so
    # each path is quoted to keep a directory containing whitespace or shell
    # metacharacters as one argument. Ordinary paths are returned unchanged by
    # shlex.quote().
    # NOTE(review): assumes the callers tokenize the string with shell-like
    # rules (shell or shlex.split) - confirm against run()/out().
    dir_args = " ".join("--dir {}".format(shlex.quote(str(d))) for d in get_scylla_dirs())
    return '/usr/lib/scylla/perftune.py --tune disks {}'.format(dir_args)
|
||||||
|
|
||||||
def get_cur_cpuset():
|
def get_cur_cpuset():
|
||||||
cfg = sysconfig_parser('/etc/scylla.d/cpuset.conf')
|
cfg = sysconfig_parser('/etc/scylla.d/cpuset.conf')
|
||||||
@@ -417,8 +449,29 @@ def create_perftune_conf(nic='eth0'):
|
|||||||
|
|
||||||
|
|
||||||
def is_valid_nic(nic):
|
def is_valid_nic(nic):
|
||||||
|
if len(nic) == 0:
|
||||||
|
return False
|
||||||
return os.path.exists('/sys/class/net/{}'.format(nic))
|
return os.path.exists('/sys/class/net/{}'.format(nic))
|
||||||
|
|
||||||
|
# Remove this when we do not support SET_NIC configuration value anymore
|
||||||
|
def get_set_nic_and_disks_config_value(cfg):
    """
    Get the SET_NIC_AND_DISKS configuration value.
    Return the SET_NIC configuration value if SET_NIC_AND_DISKS is not found (old releases case).

    :param cfg: sysconfig_parser object
    :return configuration value
    :except If both options are present, or if the configuration value is not found
    """
    has_new_option = cfg.has_option('SET_NIC_AND_DISKS')

    # Sanity check
    if has_new_option and cfg.has_option('SET_NIC'):
        raise Exception("Only one of 'SET_NIC_AND_DISKS' and 'SET_NIC' is allowed to be present")

    # Decide by explicit presence rather than a bare 'except:', which would
    # also treat unrelated failures (including KeyboardInterrupt/SystemExit)
    # as "option missing".
    if has_new_option:
        return cfg.get('SET_NIC_AND_DISKS')

    # For backwards compatibility
    return cfg.get('SET_NIC')
|
||||||
|
|
||||||
class SystemdException(Exception):
|
class SystemdException(Exception):
|
||||||
pass
|
pass
|
||||||
@@ -483,8 +536,11 @@ class sysconfig_parser:
|
|||||||
def get(self, key):
|
def get(self, key):
|
||||||
return self._cfg.get('global', key).strip('"')
|
return self._cfg.get('global', key).strip('"')
|
||||||
|
|
||||||
|
def has_option(self, key):
|
||||||
|
return self._cfg.has_option('global', key)
|
||||||
|
|
||||||
def set(self, key, val):
|
def set(self, key, val):
|
||||||
if not self._cfg.has_option('global', key):
|
if not self.has_option(key):
|
||||||
return self.__add(key, val)
|
return self.__add(key, val)
|
||||||
self._data = re.sub('^{}=[^\n]*$'.format(key), '{}="{}"'.format(key, self.__escape(val)), self._data, flags=re.MULTILINE)
|
self._data = re.sub('^{}=[^\n]*$'.format(key), '{}="{}"'.format(key, self.__escape(val)), self._data, flags=re.MULTILINE)
|
||||||
self.__load()
|
self.__load()
|
||||||
|
|||||||
4
dist/common/sysconfig/scylla-server
vendored
4
dist/common/sysconfig/scylla-server
vendored
@@ -10,8 +10,8 @@ BRIDGE=virbr0
|
|||||||
# ethernet device name
|
# ethernet device name
|
||||||
IFNAME=eth0
|
IFNAME=eth0
|
||||||
|
|
||||||
# setup NIC's interrupts, RPS, XPS (posix)
|
# setup NIC's and disks' interrupts, RPS, XPS, nomerges and I/O scheduler (posix)
|
||||||
SET_NIC=no
|
SET_NIC_AND_DISKS=no
|
||||||
|
|
||||||
# ethernet device driver (dpdk)
|
# ethernet device driver (dpdk)
|
||||||
ETHDRV=
|
ETHDRV=
|
||||||
|
|||||||
2
dist/common/sysctl.d/99-scylla-aio.conf
vendored
Normal file
2
dist/common/sysctl.d/99-scylla-aio.conf
vendored
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
# Raise max AIO events
|
||||||
|
fs.aio-max-nr = 1048576
|
||||||
2
dist/common/systemd/node-exporter.service
vendored
2
dist/common/systemd/node-exporter.service
vendored
@@ -5,7 +5,7 @@ Description=Node Exporter
|
|||||||
Type=simple
|
Type=simple
|
||||||
User=scylla
|
User=scylla
|
||||||
Group=scylla
|
Group=scylla
|
||||||
ExecStart=/usr/bin/node_exporter -collectors.enabled interrupts,conntrack,diskstats,entropy,filefd,filesystem,loadavg,mdadm,meminfo,netdev,netstat,sockstat,stat,textfile,time,uname,vmstat
|
ExecStart=/usr/bin/node_exporter --collector.interrupts
|
||||||
|
|
||||||
[Install]
|
[Install]
|
||||||
WantedBy=multi-user.target
|
WantedBy=multi-user.target
|
||||||
|
|||||||
@@ -6,7 +6,12 @@ After=network.target
|
|||||||
Type=simple
|
Type=simple
|
||||||
User=scylla
|
User=scylla
|
||||||
Group=scylla
|
Group=scylla
|
||||||
|
{{#debian}}
|
||||||
|
ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q -c /etc/scylla.d/housekeeping.cfg --repo-files '/etc/apt/sources.list.d/scylla*.list' version --mode r
|
||||||
|
{{/debian}}
|
||||||
|
{{#redhat}}
|
||||||
ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q -c /etc/scylla.d/housekeeping.cfg --repo-files '/etc/yum.repos.d/scylla*.repo' version --mode r
|
ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q -c /etc/scylla.d/housekeeping.cfg --repo-files '/etc/yum.repos.d/scylla*.repo' version --mode r
|
||||||
|
{{/redhat}}
|
||||||
|
|
||||||
[Install]
|
[Install]
|
||||||
WantedBy=multi-user.target
|
WantedBy=multi-user.target
|
||||||
|
|||||||
@@ -1 +1,2 @@
|
|||||||
dist/common/sysctl.d/99-scylla-sched.conf /etc/sysctl.d
|
dist/common/sysctl.d/99-scylla-sched.conf /etc/sysctl.d
|
||||||
|
dist/common/sysctl.d/99-scylla-aio.conf /etc/sysctl.d
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ if [[ $KVER =~ 3\.13\.0\-([0-9]+)-generic ]]; then
|
|||||||
else
|
else
|
||||||
# expect failures in virtualized environments
|
# expect failures in virtualized environments
|
||||||
sysctl -p/etc/sysctl.d/99-scylla-sched.conf || :
|
sysctl -p/etc/sysctl.d/99-scylla-sched.conf || :
|
||||||
|
sysctl -p/etc/sysctl.d/99-scylla-aio.conf || :
|
||||||
fi
|
fi
|
||||||
|
|
||||||
#DEBHELPER#
|
#DEBHELPER#
|
||||||
|
|||||||
1
dist/debian/debian/scylla-server.dirs
vendored
1
dist/debian/debian/scylla-server.dirs
vendored
@@ -4,5 +4,6 @@ var/lib/scylla
|
|||||||
var/lib/scylla/data
|
var/lib/scylla/data
|
||||||
var/lib/scylla/commitlog
|
var/lib/scylla/commitlog
|
||||||
var/lib/scylla/hints
|
var/lib/scylla/hints
|
||||||
|
var/lib/scylla/view_hints
|
||||||
var/lib/scylla/coredump
|
var/lib/scylla/coredump
|
||||||
var/lib/scylla-housekeeping
|
var/lib/scylla-housekeeping
|
||||||
|
|||||||
2
dist/debian/rules.mustache
vendored
2
dist/debian/rules.mustache
vendored
@@ -4,7 +4,7 @@ export PYBUILD_DISABLE=1
|
|||||||
jobs := $(shell echo $$DEB_BUILD_OPTIONS | sed -r "s/.*parallel=([0-9]+).*/-j\1/")
|
jobs := $(shell echo $$DEB_BUILD_OPTIONS | sed -r "s/.*parallel=([0-9]+).*/-j\1/")
|
||||||
|
|
||||||
override_dh_auto_configure:
|
override_dh_auto_configure:
|
||||||
./configure.py --with=scylla --with=iotune --enable-dpdk --mode=release --static-thrift --static-boost --static-yaml-cpp --compiler=/opt/scylladb/bin/g++-7 --cflags="-I/opt/scylladb/include -L/opt/scylladb/lib/x86-linux-gnu/" --ldflags="-Wl,-rpath=/opt/scylladb/lib"
|
./configure.py --with=scylla --with=iotune --enable-dpdk --mode=release --static-thrift --static-boost --static-yaml-cpp --compiler=/opt/scylladb/bin/g++-7 --c-compiler=/opt/scylladb/bin/gcc-7 --cflags="-I/opt/scylladb/include -L/opt/scylladb/lib/x86-linux-gnu/" --ldflags="-Wl,-rpath=/opt/scylladb/lib"
|
||||||
|
|
||||||
override_dh_auto_build:
|
override_dh_auto_build:
|
||||||
PATH="/opt/scylladb/bin:$$PATH" ninja $(jobs)
|
PATH="/opt/scylladb/bin:$$PATH" ninja $(jobs)
|
||||||
|
|||||||
1
dist/debian/scylla-server.install.mustache
vendored
1
dist/debian/scylla-server.install.mustache
vendored
@@ -1,7 +1,6 @@
|
|||||||
dist/common/limits.d/scylla.conf etc/security/limits.d
|
dist/common/limits.d/scylla.conf etc/security/limits.d
|
||||||
dist/common/scylla.d/*.conf etc/scylla.d
|
dist/common/scylla.d/*.conf etc/scylla.d
|
||||||
seastar/dpdk/usertools/dpdk-devbind.py usr/lib/scylla
|
seastar/dpdk/usertools/dpdk-devbind.py usr/lib/scylla
|
||||||
seastar/scripts/posix_net_conf.sh usr/lib/scylla
|
|
||||||
seastar/scripts/perftune.py usr/lib/scylla
|
seastar/scripts/perftune.py usr/lib/scylla
|
||||||
dist/common/scripts/* usr/lib/scylla
|
dist/common/scripts/* usr/lib/scylla
|
||||||
scylla-housekeeping usr/lib/scylla
|
scylla-housekeeping usr/lib/scylla
|
||||||
|
|||||||
4
dist/docker/redhat/Dockerfile
vendored
4
dist/docker/redhat/Dockerfile
vendored
@@ -26,14 +26,14 @@ ADD commandlineparser.py /commandlineparser.py
|
|||||||
ADD docker-entrypoint.py /docker-entrypoint.py
|
ADD docker-entrypoint.py /docker-entrypoint.py
|
||||||
|
|
||||||
# Install Scylla:
|
# Install Scylla:
|
||||||
RUN curl http://downloads.scylladb.com/rpm/unstable/centos/master/latest/scylla.repo -o /etc/yum.repos.d/scylla.repo && \
|
RUN curl http://downloads.scylladb.com/rpm/centos/scylla-3.0.repo -o /etc/yum.repos.d/scylla.repo && \
|
||||||
yum -y install epel-release && \
|
yum -y install epel-release && \
|
||||||
yum -y clean expire-cache && \
|
yum -y clean expire-cache && \
|
||||||
yum -y update && \
|
yum -y update && \
|
||||||
yum -y remove boost-thread boost-system && \
|
yum -y remove boost-thread boost-system && \
|
||||||
yum -y install scylla hostname supervisor && \
|
yum -y install scylla hostname supervisor && \
|
||||||
yum clean all && \
|
yum clean all && \
|
||||||
yum -y install python34 python34-PyYAML && \
|
yum -y install python36 python36-PyYAML && \
|
||||||
cat /scylla_bashrc >> /etc/bashrc && \
|
cat /scylla_bashrc >> /etc/bashrc && \
|
||||||
mkdir -p /etc/supervisor.conf.d && \
|
mkdir -p /etc/supervisor.conf.d && \
|
||||||
mkdir -p /var/log/scylla && \
|
mkdir -p /var/log/scylla && \
|
||||||
|
|||||||
@@ -10,8 +10,8 @@ BRIDGE=virbr0
|
|||||||
# ethernet device name
|
# ethernet device name
|
||||||
IFNAME=eth0
|
IFNAME=eth0
|
||||||
|
|
||||||
# setup NIC's interrupts, RPS, XPS (posix)
|
# setup NIC's and disks' interrupts, RPS, XPS, nomerges and I/O scheduler (posix)
|
||||||
SET_NIC=no
|
SET_NIC_AND_DISKS=no
|
||||||
|
|
||||||
# ethernet device driver (dpdk)
|
# ethernet device driver (dpdk)
|
||||||
ETHDRV=
|
ETHDRV=
|
||||||
|
|||||||
@@ -91,7 +91,27 @@ mkdir -p build/offline_installer
|
|||||||
cp dist/offline_installer/redhat/header build/offline_installer
|
cp dist/offline_installer/redhat/header build/offline_installer
|
||||||
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve scylla
|
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve scylla
|
||||||
# XXX: resolve option doesn't fetch some dependencies, need to manually fetch them
|
# XXX: resolve option doesn't fetch some dependencies, need to manually fetch them
|
||||||
|
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve sudo.x86_64
|
||||||
|
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve ntp.x86_64
|
||||||
|
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libedit.x86_64
|
||||||
|
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve ntpdate.x86_64
|
||||||
|
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve net-tools.x86_64
|
||||||
|
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve kernel
|
||||||
|
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve grubby.x86_64
|
||||||
|
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve linux-firmware
|
||||||
|
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve initscripts.x86_64
|
||||||
|
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve iproute.x86_64
|
||||||
|
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve iptables.x86_64
|
||||||
|
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libnfnetlink.x86_64
|
||||||
|
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libnetfilter_conntrack.x86_64
|
||||||
|
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libmnl.x86_64
|
||||||
|
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve sysvinit-tools.x86_64
|
||||||
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve yajl.x86_64
|
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve yajl.x86_64
|
||||||
|
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve mdadm.x86_64
|
||||||
|
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libreport-filesystem.x86_64
|
||||||
|
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve xfsprogs.x86_64
|
||||||
|
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve PyYAML.x86_64
|
||||||
|
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libyaml.x86_64
|
||||||
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libjpeg-turbo.x86_64
|
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libjpeg-turbo.x86_64
|
||||||
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libaio.x86_64
|
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libaio.x86_64
|
||||||
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve snappy.x86_64
|
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve snappy.x86_64
|
||||||
|
|||||||
4
dist/redhat/build_rpm.sh
vendored
4
dist/redhat/build_rpm.sh
vendored
@@ -108,11 +108,11 @@ fix_ownership() {
|
|||||||
if [ $JOBS -gt 0 ]; then
|
if [ $JOBS -gt 0 ]; then
|
||||||
RPM_JOBS_OPTS=(--define="_smp_mflags -j$JOBS")
|
RPM_JOBS_OPTS=(--define="_smp_mflags -j$JOBS")
|
||||||
fi
|
fi
|
||||||
sudo mock --buildsrpm --root=$TARGET --resultdir=`pwd`/build/srpms --spec=build/scylla.spec --sources=build/$PRODUCT-$VERSION.tar $SRPM_OPTS "${RPM_JOBS_OPTS[@]}"
|
sudo mock --rootdir=`pwd`/build/mock --buildsrpm --root=$TARGET --resultdir=`pwd`/build/srpms --spec=build/scylla.spec --sources=build/$PRODUCT-$VERSION.tar $SRPM_OPTS "${RPM_JOBS_OPTS[@]}"
|
||||||
fix_ownership build/srpms
|
fix_ownership build/srpms
|
||||||
if [[ "$TARGET" =~ ^epel-7- ]]; then
|
if [[ "$TARGET" =~ ^epel-7- ]]; then
|
||||||
TARGET=scylla-$TARGET
|
TARGET=scylla-$TARGET
|
||||||
RPM_OPTS="$RPM_OPTS --configdir=dist/redhat/mock"
|
RPM_OPTS="$RPM_OPTS --configdir=dist/redhat/mock"
|
||||||
fi
|
fi
|
||||||
sudo mock --rebuild --root=$TARGET --resultdir=`pwd`/build/rpms $RPM_OPTS "${RPM_JOBS_OPTS[@]}" build/srpms/$PRODUCT-$VERSION*.src.rpm
|
sudo mock --rootdir=`pwd`/build/mock --rebuild --root=$TARGET --resultdir=`pwd`/build/rpms $RPM_OPTS "${RPM_JOBS_OPTS[@]}" build/srpms/$PRODUCT-$VERSION*.src.rpm
|
||||||
fix_ownership build/rpms
|
fix_ownership build/rpms
|
||||||
|
|||||||
9
dist/redhat/scylla.spec.mustache
vendored
9
dist/redhat/scylla.spec.mustache
vendored
@@ -56,9 +56,9 @@ License: AGPLv3
|
|||||||
URL: http://www.scylladb.com/
|
URL: http://www.scylladb.com/
|
||||||
BuildRequires: libaio-devel libstdc++-devel cryptopp-devel hwloc-devel numactl-devel libpciaccess-devel libxml2-devel zlib-devel thrift-devel yaml-cpp-devel lz4-devel snappy-devel jsoncpp-devel systemd-devel xz-devel pcre-devel elfutils-libelf-devel bzip2-devel keyutils-libs-devel xfsprogs-devel make gnutls-devel systemd-devel lksctp-tools-devel protobuf-devel protobuf-compiler systemtap-sdt-devel ninja-build cmake python ragel grep kernel-headers
|
BuildRequires: libaio-devel libstdc++-devel cryptopp-devel hwloc-devel numactl-devel libpciaccess-devel libxml2-devel zlib-devel thrift-devel yaml-cpp-devel lz4-devel snappy-devel jsoncpp-devel systemd-devel xz-devel pcre-devel elfutils-libelf-devel bzip2-devel keyutils-libs-devel xfsprogs-devel make gnutls-devel systemd-devel lksctp-tools-devel protobuf-devel protobuf-compiler systemtap-sdt-devel ninja-build cmake python ragel grep kernel-headers
|
||||||
%{?fedora:BuildRequires: boost-devel antlr3-tool antlr3-C++-devel python3 gcc-c++ libasan libubsan python3-pyparsing dnf-yum python2-pystache}
|
%{?fedora:BuildRequires: boost-devel antlr3-tool antlr3-C++-devel python3 gcc-c++ libasan libubsan python3-pyparsing dnf-yum python2-pystache}
|
||||||
%{?rhel:BuildRequires: scylla-libstdc++73-static scylla-boost163-devel scylla-boost163-static scylla-antlr35-tool scylla-antlr35-C++-devel python34 scylla-gcc73-c++, scylla-python34-pyparsing20 yaml-cpp-static pystache python-setuptools}
|
%{?rhel:BuildRequires: scylla-libstdc++73-static scylla-libatomic73-static scylla-boost163-devel scylla-boost163-static scylla-antlr35-tool scylla-antlr35-C++-devel python36 scylla-gcc73-c++, scylla-python36-pyparsing20 yaml-cpp-static pystache python-setuptools}
|
||||||
Requires: {{product}}-conf systemd-libs hwloc PyYAML python-urwid pciutils pyparsing python-requests curl util-linux python-setuptools pciutils python3-pyudev mdadm xfsprogs
|
Requires: {{product}}-conf systemd-libs hwloc PyYAML python-urwid pciutils pyparsing python-requests curl util-linux python-setuptools pciutils python3-pyudev mdadm xfsprogs
|
||||||
%{?rhel:Requires: python34 python34-PyYAML kernel >= 3.10.0-514}
|
%{?rhel:Requires: python36 python36-PyYAML kernel >= 3.10.0-514}
|
||||||
%{?fedora:Requires: python3 python3-PyYAML}
|
%{?fedora:Requires: python3 python3-PyYAML}
|
||||||
Conflicts: abrt
|
Conflicts: abrt
|
||||||
%ifarch x86_64
|
%ifarch x86_64
|
||||||
@@ -97,7 +97,7 @@ cflags="--cflags=${defines[*]}"
|
|||||||
%endif
|
%endif
|
||||||
%if 0%{?rhel}
|
%if 0%{?rhel}
|
||||||
. /etc/profile.d/scylla.sh
|
. /etc/profile.d/scylla.sh
|
||||||
python3.4 ./configure.py %{?configure_opt} --with=scylla --with=iotune --mode=release "$cflags" --static-boost --static-yaml-cpp --compiler=/opt/scylladb/bin/g++-7.3 --python python3.4 --ldflag=-Wl,-rpath=/opt/scylladb/lib64
|
python3.6 ./configure.py %{?configure_opt} --with=scylla --with=iotune --mode=release "$cflags" --static-boost --static-yaml-cpp --compiler=/opt/scylladb/bin/g++-7.3 --c-compiler=/opt/scylladb/bin/gcc-7.3 --python python3.6 --ldflag=-Wl,-rpath=/opt/scylladb/lib64
|
||||||
%endif
|
%endif
|
||||||
ninja-build %{?_smp_mflags} build/release/scylla build/release/iotune
|
ninja-build %{?_smp_mflags} build/release/scylla build/release/iotune
|
||||||
|
|
||||||
@@ -193,7 +193,6 @@ rm -rf $RPM_BUILD_ROOT
|
|||||||
%{_prefix}/lib/scylla/scylla_cpuscaling_setup
|
%{_prefix}/lib/scylla/scylla_cpuscaling_setup
|
||||||
%{_prefix}/lib/scylla/scylla_fstrim
|
%{_prefix}/lib/scylla/scylla_fstrim
|
||||||
%{_prefix}/lib/scylla/scylla_fstrim_setup
|
%{_prefix}/lib/scylla/scylla_fstrim_setup
|
||||||
%{_prefix}/lib/scylla/posix_net_conf.sh
|
|
||||||
%{_prefix}/lib/scylla/perftune.py
|
%{_prefix}/lib/scylla/perftune.py
|
||||||
%{_prefix}/lib/scylla/dpdk-devbind.py
|
%{_prefix}/lib/scylla/dpdk-devbind.py
|
||||||
%{_prefix}/lib/scylla/hex2list.py
|
%{_prefix}/lib/scylla/hex2list.py
|
||||||
@@ -209,6 +208,7 @@ rm -rf $RPM_BUILD_ROOT
|
|||||||
%attr(0755,scylla,scylla) %dir %{_sharedstatedir}/scylla/data
|
%attr(0755,scylla,scylla) %dir %{_sharedstatedir}/scylla/data
|
||||||
%attr(0755,scylla,scylla) %dir %{_sharedstatedir}/scylla/commitlog
|
%attr(0755,scylla,scylla) %dir %{_sharedstatedir}/scylla/commitlog
|
||||||
%attr(0755,scylla,scylla) %dir %{_sharedstatedir}/scylla/hints
|
%attr(0755,scylla,scylla) %dir %{_sharedstatedir}/scylla/hints
|
||||||
|
%attr(0755,scylla,scylla) %dir %{_sharedstatedir}/scylla/view_hints
|
||||||
%attr(0755,scylla,scylla) %dir %{_sharedstatedir}/scylla/coredump
|
%attr(0755,scylla,scylla) %dir %{_sharedstatedir}/scylla/coredump
|
||||||
%attr(0755,scylla,scylla) %dir %{_sharedstatedir}/scylla-housekeeping
|
%attr(0755,scylla,scylla) %dir %{_sharedstatedir}/scylla-housekeeping
|
||||||
%ghost /etc/systemd/system/scylla-server.service.d/
|
%ghost /etc/systemd/system/scylla-server.service.d/
|
||||||
@@ -283,6 +283,7 @@ if Scylla is the main application on your server and you wish to optimize its la
|
|||||||
# We cannot use the sysctl_apply rpm macro because it is not present in 7.0
|
# We cannot use the sysctl_apply rpm macro because it is not present in 7.0
|
||||||
# following is a "manual" expansion
|
# following is a "manual" expansion
|
||||||
/usr/lib/systemd/systemd-sysctl 99-scylla-sched.conf >/dev/null 2>&1 || :
|
/usr/lib/systemd/systemd-sysctl 99-scylla-sched.conf >/dev/null 2>&1 || :
|
||||||
|
/usr/lib/systemd/systemd-sysctl 99-scylla-aio.conf >/dev/null 2>&1 || :
|
||||||
|
|
||||||
%files kernel-conf
|
%files kernel-conf
|
||||||
%defattr(-,root,root)
|
%defattr(-,root,root)
|
||||||
|
|||||||
@@ -66,7 +66,7 @@ You can use Docker volumes to improve performance of Scylla.
|
|||||||
Create a Scylla data directory ``/var/lib/scylla`` on the host, which is used by Scylla container to store all data:
|
Create a Scylla data directory ``/var/lib/scylla`` on the host, which is used by Scylla container to store all data:
|
||||||
|
|
||||||
```console
|
```console
|
||||||
$ sudo mkdir -p /var/lib/scylla/data /var/lib/scylla/commitlog /var/lib/scylla/hints
|
$ sudo mkdir -p /var/lib/scylla/data /var/lib/scylla/commitlog /var/lib/scylla/hints /var/lib/scylla/view_hints
|
||||||
```
|
```
|
||||||
|
|
||||||
Launch Scylla using Docker's ``--volume`` command line option to mount the created host directory as a data volume in the container and disable Scylla's developer mode to run I/O tuning before starting up the Scylla node.
|
Launch Scylla using Docker's ``--volume`` command line option to mount the created host directory as a data volume in the container and disable Scylla's developer mode to run I/O tuning before starting up the Scylla node.
|
||||||
|
|||||||
@@ -41,12 +41,11 @@ struct encoding_stats {
|
|||||||
// int DELETION_TIME_EPOCH = (int)(c.getTimeInMillis() / 1000); // local deletion times are in seconds
|
// int DELETION_TIME_EPOCH = (int)(c.getTimeInMillis() / 1000); // local deletion times are in seconds
|
||||||
// Encoding stats are used for delta-encoding, so we want some default values
|
// Encoding stats are used for delta-encoding, so we want some default values
|
||||||
// that are just good enough so we take some recent date in the past
|
// that are just good enough so we take some recent date in the past
|
||||||
static constexpr uint32_t deletion_time_epoch = 1442880000;
|
static constexpr int32_t deletion_time_epoch = 1442880000;
|
||||||
static constexpr api::timestamp_type timestamp_epoch = api::timestamp_type(deletion_time_epoch) * 1000 * 1000;
|
static constexpr api::timestamp_type timestamp_epoch = api::timestamp_type(deletion_time_epoch) * 1000 * 1000;
|
||||||
static constexpr uint32_t ttl_epoch = 0;
|
static constexpr int32_t ttl_epoch = 0;
|
||||||
|
|
||||||
api::timestamp_type min_timestamp = timestamp_epoch;
|
api::timestamp_type min_timestamp = timestamp_epoch;
|
||||||
uint32_t min_local_deletion_time = deletion_time_epoch;
|
int32_t min_local_deletion_time = deletion_time_epoch;
|
||||||
uint32_t min_ttl = ttl_epoch;
|
int32_t min_ttl = ttl_epoch;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user