sstable_writer may depend on the sstable throughout its whole lifecycle.
If the sstable is freed before the sstable_writer we might hit use-after-free
as in the follwing case:
```
std::_Deque_iterator<sstables::compression::segmented_offsets::bucket, sstables::compression::segmented_offsets::bucket&, sstables::compression::segmented_offsets::bucket*>::operator+=(long) at /usr/include/c++/10/bits/stl_deque.h:240
(inlined by) std::operator+(std::_Deque_iterator<sstables::compression::segmented_offsets::bucket, sstables::compression::segmented_offsets::bucket&, sstables::compression::segmented_offsets::bucket*> const&, long) at /usr/include/c++/10/bits/stl_deque.h:378
(inlined by) std::_Deque_iterator<sstables::compression::segmented_offsets::bucket, sstables::compression::segmented_offsets::bucket&, sstables::compression::segmented_offsets::bucket*>::operator[](long) const at /usr/include/c++/10/bits/stl_deque.h:252
(inlined by) std::deque<sstables::compression::segmented_offsets::bucket, std::allocator<sstables::compression::segmented_offsets::bucket> >::operator[](unsigned long) at /usr/include/c++/10/bits/stl_deque.h:1327
(inlined by) sstables::compression::segmented_offsets::push_back(unsigned long, sstables::compression::segmented_offsets::state&) at ./sstables/compress.cc:214
sstables::compression::segmented_offsets::writer::push_back(unsigned long) at ./sstables/compress.hh:123
(inlined by) compressed_file_data_sink_impl<crc32_utils, (compressed_checksum_mode)1>::put(seastar::temporary_buffer<char>) at ./sstables/compress.cc:519
seastar::output_stream<char>::put(seastar::temporary_buffer<char>) at table.cc:?
(inlined by) seastar::output_stream<char>::put(seastar::temporary_buffer<char>) at ././seastar/include/seastar/core/iostream-impl.hh:432
seastar::output_stream<char>::flush() at table.cc:?
seastar::output_stream<char>::close() at table.cc:?
sstables::file_writer::close() at sstables.cc:?
sstables::mc::writer::~writer() at writer.cc:?
(inlined by) sstables::mc::writer::~writer() at ./sstables/mx/writer.cc:790
sstables::mc::writer::~writer() at writer.cc:?
flat_mutation_reader::impl::consumer_adapter<stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> > >::~consumer_adapter() at compaction.cc:?
(inlined by) std::_Optional_payload_base<sstables::compaction_writer>::_M_destroy() at /usr/include/c++/10/optional:260
(inlined by) std::_Optional_payload_base<sstables::compaction_writer>::_M_reset() at /usr/include/c++/10/optional:280
(inlined by) std::_Optional_payload<sstables::compaction_writer, false, false, false>::~_Optional_payload() at /usr/include/c++/10/optional:401
(inlined by) std::_Optional_base<sstables::compaction_writer, false, false>::~_Optional_base() at /usr/include/c++/10/optional:474
(inlined by) std::optional<sstables::compaction_writer>::~optional() at /usr/include/c++/10/optional:659
(inlined by) sstables::compacting_sstable_writer::~compacting_sstable_writer() at ./sstables/compaction.cc:229
(inlined by) compact_mutation<(emit_only_live_rows)0, (compact_for_sstables)1, sstables::compacting_sstable_writer, noop_compacted_fragments_consumer>::~compact_mutation() at ././mutation_compactor.hh:468
(inlined by) compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer>::~compact_for_compaction() at ././mutation_compactor.hh:538
(inlined by) std::default_delete<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> >::operator()(compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer>*) const at /usr/include/c++/10/bits/unique_ptr.h:85
(inlined by) std::unique_ptr<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer>, std::default_delete<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> > >::~unique_ptr() at /usr/include/c++/10/bits/unique_ptr.h:361
(inlined by) stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> >::~stable_flattened_mutations_consumer() at ././mutation_reader.hh:342
(inlined by) flat_mutation_reader::impl::consumer_adapter<stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> > >::~consumer_adapter() at ././flat_mutation_reader.hh:201
auto flat_mutation_reader::impl::consume_in_thread<stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> >, flat_mutation_reader::no_filter>(stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> >, flat_mutation_reader::no_filter, std::chrono::time_point<seastar::lowres_clock, std::chrono::duration<long, std::ratio<1l, 1000l> > >) at ././flat_mutation_reader.hh:272
(inlined by) auto flat_mutation_reader::consume_in_thread<stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> >, flat_mutation_reader::no_filter>(stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> >, flat_mutation_reader::no_filter, std::chrono::time_point<seastar::lowres_clock, std::chrono::duration<long, std::ratio<1l, 1000l> > >) at ././flat_mutation_reader.hh:383
(inlined by) auto flat_mutation_reader::consume_in_thread<stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> > >(stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> >, std::chrono::time_point<seastar::lowres_clock, std::chrono::duration<long, std::ratio<1l, 1000l> > >) at ././flat_mutation_reader.hh:389
(inlined by) seastar::future<void> sstables::compaction::setup<noop_compacted_fragments_consumer>(noop_compacted_fragments_consumer)::{lambda(flat_mutation_reader)#1}::operator()(flat_mutation_reader)::{lambda()#1}::operator()() at ./sstables/compaction.cc:612
```
What happens here is that:
compressed_file_data_sink_impl(output_stream<char> out, sstables::compression* cm, sstables::local_compression lc)
: _out(std::move(out))
, _compression_metadata(cm)
, _offsets(_compression_metadata->offsets.get_writer())
, _compression(lc)
, _full_checksum(ChecksumType::init_checksum())
_compression_metadata points to a buffer held by the sstable object.
and _compression_metadata->offsets.get_writer returns a writer that keeps
a reference to the segmented_offsets in the sstables::compression
that is used in the ~writer -> close path.
Fixes#7821
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20201227145726.33319-1-bhalevy@scylladb.com>
(cherry picked from commit 8a745a0ee0)
feed_writer() eats exception and transforms it into an end of stream
instead. Downstream validators hate when this happens.
Fixes#7482
Message-Id: <20201216090038.GB3244976@scylladb.com>
(cherry picked from commit 61520a33d6)
aws_instance.ebs_disks() method should return ebs disk
instead of ephemeral
Signed-off-by: Aleksandr Bykov <alex.bykov@scylladb.com>
Closes#7780
(cherry picked from commit e74dc311e7)
Fixes#6459
When moving or removing endpoints, we should ensure
that the set of available racks reflect the nodes
known, i.e. match what would be the result of a
reboot + create sets initially.
Message-Id: <20200519153300.15391-1-calle@scylladb.com>
(cherry picked from commit 7ce4a8b458)
tuned 2.11.0-9 and later writes to kerned.sched_wakeup_granularity_ns
and other sysctl tunables that we so laboriously tuned, dropping
performance by a factor of 5 (due to increased latency). Fix by
obsoleting tuned during install (in effect, we are a better tuned,
at least for us).
Not needed for .deb, since debian/ubunto do not install tuned by
default.
Fixes#7696Closes#7776
(cherry picked from commit 615b8e8184)
We used to calculate the number of endpoints for quorum and local_quorum
unconditionally as ((rf / 2) + 1). This formula doesn't take into
account the corner case where RF = 0, in this situation quorum should
also be 0.
This commit adds the missing corner case.
Tests: Unit Tests (dev)
Fixes#6905Closes#7296
(cherry picked from commit 925cdc9ae1)
The future of the fiber that writes data into sstables inside
the repair_writer is stored in _writer_done like below:
class repair_writer {
_writer_done[node_idx] =
mutation_writer::distribute_reader_and_consume_on_shards().then([this] {
...
}).handle_exception([this] {
...
});
}
The fiber access repair_writer object in the error handling path. We
wait for the _writer_done to finish before we destroy repair_meta
object which contains the repair_writer object to avoid the fiber
accessing already freed repair_writer object.
To be safer, we can make repair_writer a shared pointer and take a
reference in the distribute_reader_and_consume_on_shards code path.
Fixes#7406Closes#7430
(cherry picked from commit 289a08072a)
Before updating the _last_[cp]key (for subsequent .fetch_page())
the pager checks is 'if the pager is not exhausted OR the result
has data'.
The check seems broken: if the pager is not exhausted, but the
result is empty the call for keys will unconditionally try to
reference the last element from empty vector. The not exhausted
condition for empty result can happen if the short_read is set,
which, in turn, unconditionally happens upon meeting partition
end when visiting the partition with result builder.
The correct check should be 'if the pager is not exhausted AND
the result has data': the _last_[pc]key-s should be taken for
continuation (not exhausted), but can be taken if the result is
not empty (has data).
fixes: #7263
tests: unit(dev), but tests don't trigger this corner case
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20200921124329.21209-1-xemul@scylladb.com>
(cherry picked from commit 550fc734d9)
We currently set PATH for relocatable CLI tools in scylla_util.run() and
scylla_util.out(), but it doesn't work for perftune.py, since it's not part of
Scylla, does not use scylla_util module.
We can set PATH in python thunk instead, it can set PATH for all python scripts.
Fixes#7350
(cherry picked from commit 5867af4edd)
Retry mechanism didn't work when URLError happend. For example:
urllib.error.URLError: <urlopen error [Errno 101] Network is unreachable>
Let's catch URLError instead of HTTP since URLError is a base exception
for all exceptions in the urllib module.
Fixes: #7569Closes#7567
(cherry picked from commit 956b97b2a8)
When we introduced dependencies.conf, we mistakenly added it on rpm as %ghost,
but it should be normal file, should be installed normally on package installation.
Fixes#7703Closes#7704
(cherry picked from commit ba4d54efa3)
We don't apply sysctl.d files on non-packaging installation, apply them
just like rpm/deb taking care of that.
Fixes#7702Closes#7705
(cherry picked from commit 5f81f97773)
Since f3bcd4d205 ("Merge 'Support SSL Certificate Hot
Reloading' from Calle"), we reload certificates as they are
modified on disk. This uses inotify, which is limited by a
sysctl fs.inotify.max_user_instances, with a default of 128.
This is enough for 64 shards only, if both rpc and cql are
encrypted; above that startup fails.
Increase to 1200, which is enough for 6 instances * 200 shards.
Fixes#7700.
Closes#7701
(cherry picked from commit 390e07d591)
If interposer consumer is enabled, partition filtering will be done by the
consumer instead, but that's not possible because only the producer is able
to skip to the next partition if the current one is filtered out, so scylla
crashes when that happens with a bad function call in queue_reader.
This is a regression which started here: 55a8b6e3c9
To fix this problem, let's make sure that partition filtering will only
happen on the producer side.
Fixes#7590.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20201111221513.312283-1-raphaelsc@scylladb.com>
(cherry picked from commit 13fa2bec4c)
When there are hint files to be sent and the target endpoint is DOWN,
end_point_hints_manager works in the following loop:
- It reads the first hint file in the queue,
- For each hint in the file it decides that it won't be sent because the
target endpoint is DOWN,
- After realizing that there are some unsent hints, it decides to retry
this operation after sleeping 1 second.
This causes the first segment to be wholly read over and over again,
with 1 second pauses, until the target endpoint becomes UP or leaves the
cluster. This causes unnecessary I/O load in the streaming scheduling
group.
This patch adds a check which prevents end_point_hints_manager from
reading the first hint file at all when it is not allowed to send hints.
First observed in #6964
Tests:
- unit(dev)
- hinted handoff dtests
Closes#7407
(cherry picked from commit 77a0f1a153)
If the consumer happens to check the EOS flag before it hits the
exception injected by the abort (by calling fill_buffer()), they can
think the stream ended normally and expect it to be valid. However this
is not guaranteed when the reader is aborted. To avoid consumers falsely
thinking the stream ended normally, don't set the EOS flag on abort at
all.
Additionally make sure the producer is aborted too on abort. In theory
this is not needed as they are the one initiating the abort, but better
to be safe then sorry.
Fixes: #7411
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20201102100732.35132-1-bdenes@scylladb.com>
(cherry picked from commit f5323b29d9)
Instead of eagerly linearizing all values as they are passed to
validate(), defer linearization to those validators that actually need
linearized values. Linearizing large values puts pressure on the memory
allocator with large contiguous allocation requests. This is something
we are trying to actively avoid, especially if it is not really neaded.
Turns out the types, whose validators really want linearized values are
a minority, as most validators just look at the size of the value, and
some like bytes don't need validation at all, while usually having large
values.
This is achieved by templating the validator struct on the view and
using the FragmentedRange concept to treat all passed in views
(`bytes_view` and `fragmented_temporary_buffer_view`) uniformly.
This patch makes no attempt at converting existing validators to work
with fragmented buffers, only trivial cases are converted. The major
offenders still left are ascii/utf8 and collections.
Fixes: #7318
Tests: unit(dev)
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20201007054524.909420-1-bdenes@scylladb.com>
(cherry picked from commit db56ae695c)
[avi: squashed ed6775c585 ("types: adjust
validation_visitor construction for clang") as gcc 9 in scylla 4.1
suffers from the same problem as clang 11]
This is a backport of PR #7469 that did not apply cleanly to 4.2 with a trivial conflict, another commit that touched one of the files but in a completely different region.
Closes#7480
* github.com:scylladb/scylla:
materialized views: add a base table reference if missing
view info: support partial match between base and view for only reading from view.
view info: guard against null dereference of the base info
(cherry picked from commit c74ba1bc36)
Hints writes are handled by storage_proxy in the exact same way
regular writes are, which in turn means that the same smp service
group is used for both. The problem is that it can lead to a priority
inversion where writes of the lower priority kind occupies a lot of
the semaphores units making the higher priority writes wait for an
empty slot.
This series adds a separate smp group for hints as well as a field
to pass the correct smp group to mutate_locally functions, and
then uses this field to properly classify the writes.
Fixes#7177
* eliransin-hint_priority_inversion:
Storage proxy: use hints smp group in mutate locally
Storage proxy: add a dedicated smp group for hints
(cherry picked from commit c075539fea)
[avi: replace std::bind_front() which is not available with this
compiler with a lambda that does the same]
Corresponding overload of `storage_proxy::mutate_locally`
was hardcoded to pass `db::commitlog::force_sync::no` to the
`database::apply`. Unhardcode it and substitute `force_sync::no`
to all existing call sites (as it were before).
`force_sync::yes` will be used later for paxos learn writes
when trying to apply mutations upgraded from an obsolete
schema version (similar to the current case when applying
locally a `frozen_mutation` stored in accepted proposal).
Tests: unit(dev)
Signed-off-by: Pavel Solodovnikov <pa.solodovnikov@scylladb.com>
Message-Id: <20200716124915.464789-1-pa.solodovnikov@scylladb.com>
(cherry picked from commit 5ff5df1afd)
Prerequisite for #7177.
This patch change the code that iterates over the metrics to use a copy
of the metrics names to make it safe to remove the metrics from the
metrics object.
Fixes#7488
Signed-off-by: Amnon Heiman <amnon@scylladb.com>
(cherry picked from commit 52db99f25f)
Refs #7364
The number of tombstones can be large. As a stopgap measure to
just returning a source range (with keepalive), we can at least
alleviate the problem by using a chunked vector.
Closes#7433
(cherry picked from commit 4b65d67a1a)
Cleanup compaction is using consume_pausable_in_thread() to skip over
disowned partitions, which uses flat_mutation_reader::next_partition().
The implementation of next_partition() for the sstable reader has a
bug which may cause the following assertion failure:
scylla: sstables/mp_row_consumer.hh:422: row_consumer::proceed sstables::mp_row_consumer_k_l::flush(): Assertion `!_ready' failed.
This happens when the sstable reader's buffer gets full when we reach
the partition end. The last fragment of the partition won't be pushed
into the buffer but will stay in the _ready variable. When
next_partition() is called in this state, _ready will not be cleared
and the fragment will be carried over to the next partition. This will
cause assertion failure when the reader attempts to emit the first
fragment of the next partition.
The fix is to clear _ready when entering a partition, just like we
clear _range_tombstones there.
Fixes#7553.
Message-Id: <1604534702-12777-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit fb9b5cae05)
"
After data segregation feature, anything that cause out-of-order writes,
like read repair, can result in small updates to past time windows.
This causes compaction to be very aggressive because whenever a past time
window is updated like that, that time window is recompacted into a
single SSTable.
Users expect that once a window is closed, it will no longer be written
to, but that has changed since the introduction of the data segregation
future. We didn't anticipate the write amplification issues that the
feature would cause. To fix this problem, let's perform size-tiered
compaction on the windows that are no longer active and were updated
because data was segregated. The current behavior where the last active
window is merged into one file is kept. But thereafter, that same
window will only be compacted using STCS.
Fixes#6928.
"
* 'fix_twcs_agressiveness_after_data_segregation_v2' of github.com:raphaelsc/scylla:
compaction/twcs: improve further debug messages
compaction/twcs: Improve debug log which shows all windows
test: Check that TWCS properly performs size-tiered compaction on past windows
compaction/twcs: Make task estimation take into account the size-tiered behavior
compaction/stcs: Export static function that estimates pending tasks
compaction/stcs: Make get_buckets() static
compact/twcs: Perform size-tiered compaction on past time windows
compaction/twcs: Make strategy easier to extend by removing duplicated knowledge
compaction/twcs: Make newest_bucket() non-static
compaction/twcs: Move TWCS implementation into source file
(cherry picked from commit 6f986df458)
LCS and SCTS already have their own files, reducing the clutter in
compaction_strategy.cc. Do the same for TWCS. I am doing this in
preparation to add more functions.
Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20200611230906.409023-6-glauber@scylladb.com>
(cherry picked from commit b0a0c207c3)
Prerequisite for #6928.
"
Issue https://github.com/scylladb/scylla/issues/7019 describes a problem of an ever-growing map of temporary values stored in query_options. In order to mitigate this kind of problems, the storage for temporary values is moved from an external data structure to the value views itself. This way, the temporary lives only as long as it's accessible and is automatically destroyed once a request finishes. The downside is that each temporary is now allocated separately, while previously they were bundled in a single byte stream.
Tests: unit(dev)
Fixes https://github.com/scylladb/scylla/issues/7019
"
7055297649 ("cql3: remove query_options::linearize and _temporaries")
is reverted from this backport since linearize() is still used in
this branch.
* psarna-move_temporaries_to_value_view:
cql3: remove query_options::linearize and _temporaries
cql3: remove make_temporary helper function
cql3: store temporaries in-place instead of in query_options
cql3: add temporary_value to value view
cql3: allow moving data out of raw_value
cql3: split values.hh into a .cc file
(cherry picked from commit 2b308a973f)
Old secondary index schemas did not have their idx_token column
marked as computed, and there already exists code which updates
them. Unfortunately, the fix itself contains an error and doesn't
fire if computed columns are not yet supported by the whole cluster,
which is a very common situation during upgrades.
Fixes#7515Closes#7516
(cherry picked from commit b66c285f94)
"
This series fixes a bug in `appending_hash<row>` that caused it to ignore any cells after the first NULL. It also adds a cluster feature which starts using the new hashing only after the whole cluster is aware of it. The series comes with tests, which reproduce the issue.
Fixes#4567
Based on #4574
"
* psarna-fix_ignoring_cells_after_null_in_appending_hash:
test: extend mutation_test for NULL values
tests/mutation: add reproducer for #4567
gms: add a cluster feature for fixed hashing
digest: add null values to row digest
mutation_partition: fix formatting
appending_hash<row>: make publicly visible
(cherry picked from commit 0e03c979d2)
Currently in all cases we first deduct the to-be-consumed resources,
then construct the `reader_resources` class to protect it (release it on
destruction). This is error prone as it relies on no exception being
thrown while constructing the `reader_resources`. Albeit the
`reader_resources` constructor is `noexcept` right now this might change
in the future and as the call sites relying on this are disconnected
from the declaration, the one modifying them might not notice.
To make this safe going forward, make the `reader_resources` a true RAII
class, consuming the units in its constructor and releasing them in its
destructor.
Refs: #7256
Tests: unit(dev)
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200922150625.1253798-1-bdenes@scylladb.com>
(cherry picked from commit a0107ba1c6)
Message-Id: <20200924081408.236353-1-bdenes@scylladb.com>
scylla-python3 causes segfault when non-default locale specified.
As workaround for this, we need to set LC_ALL=en_US.UTF_8 on python3 thunk.
Fixes#7408Closes#7414
(cherry picked from commit ff129ee030)
Unavailable exception means that operation was not started and it can be
retried safely. If lwt fails in the learn stage though it most
certainly means that its effect will be observable already. The patch
returns timeout exception instead which means uncertainty.
Fixes#7258
Message-Id: <20201001130724.GA2283830@scylladb.com>
(cherry picked from commit 3e8dbb3c09)
"
The view_info object, which is attached to the schema object of the
view, contains a data structure called
"base_non_pk_columns_in_view_pk". This data structure contains column
ids of the base table so is valid only for a particular version of the
base table schema. This data structure is used by materialized view
code to interpret mutations of the base table, those coming from base
table writes, or reads of the base table done as part of view updates
or view building.
The base table schema version of that data structure must match the
schema version of the mutation fragments, otherwise we hit undefined
behavior. This may include aborts, exceptions, segfaults, or data
corruption (e.g. writes landing in the wrong column in the view).
Before this patch, we could get schema version mismatch here after the
base table was altered. That's because the view schema did not change
when the base table was altered.
Another problem was that view building was using the current table's schema
to interpret the fragments and invoke view building. That's incorrect for two
reasons. First, fragments generated by a reader must be accessed only using
the reader's schema. Second, base_non_pk_columns_in_view_pk of the recorded
view ptrs may not longer match the current base table schema, which is used
to generate the view updates.
Part of the fix is to extract base_non_pk_columns_in_view_pk into a
third entity called base_dependent_view_info, which changes both on
base table schema changes and view schema changes.
It is managed by a shared pointer so that we can take immutable
snapshots of it, just like with schema_ptr. When starting the view
update, the base table schema_ptr and the corresponding
base_dependent_view_info have to match. So we must obtain them
atomically, and base_dependent_view_info cannot change during update.
Also, whenever the base table schema changes, we must update
base_dependent_view_infos of all attached views (atomically) so that
it matches the base table schema.
Fixes#7061.
Tests:
- unit (dev)
- [v1] manual (reproduced using scylla binary and cqlsh)
"
* tag 'mv-schema-mismatch-fix-v2' of github.com:tgrabiec/scylla:
db: view: Refactor view_info::initialize_base_dependent_fields()
tests: mv: Test dropping columns from base table
db: view: Fix incorrect schema access during view building after base table schema changes
schema: Call on_internal_error() when out of range id is passed to column_at()
db: views: Fix undefined behavior on base table schema changes
db: views: Introduce has_base_non_pk_columns_in_view_pk()
(cherry picked from commit 3daa49f098)
`trace_keyspace_helper::make_slow_query_mutation_data` expected a
"query" key in its parameters, which does not appear in case of
e.g. batches of prepared statements. This is example of failing
`record.parameters`:
```
...{"query[0]" : "INSERT INTO ks.tbl (pk, i) values (?, ?);"},
{"query[1]" : "INSERT INTO ks.tbl (pk, i) values (?, ?);"}...
```
In such case Scylla recorded no trace and said:
```
ERROR 2020-09-28 10:09:36,696 [shard 3] trace_keyspace_helper - No
"query" parameter set for a session requesting a slow_query_log record
```
Fix here is to leave query empty if not found. The users can still
retrieve the query contents from existing info.
Fixes#5843Closes#7293
(cherry picked from commit 0afa738a8f)
This series backports the evictable reader validation patchset (merged
as 97c99ea9f to master) to 4.1.
I only had to do changes to the tests.
Tests: unit(dev), some exception safety tests are failing with or
without my patchset
* https://github.com/denesb/scylla.git denesb/evictable-reader-validate-buffer/backport-4.1:
mutation_reader_test: add unit test for evictable reader self-validation
evictable_reader: validate buffer after recreation the underlying
evictable_reader: update_next_position(): only use peek'd position on partition boundary
mutation_reader_test: add unit test for evictable reader range tombstone trimming
evictable_reader: trim range tombstones to the read clustering range
position_in_partition_view: add position_in_partition_view before_key() overload
flat_mutation_reader: add buffer() accessor
Add both positive (where the validation should succeed) and negative
(where the validation should fail) tests, covering all validation cases.
(cherry picked from commit 076c27318b)
The reader recreation mechanism is a very delicate and error-prone one,
as proven by the countless bugs it had. Most of these bugs were related
to the recreated reader not continuing the read from the expected
position, inserting out-of-order fragments into the stream.
This patch adds a defense mechanism against such bugs by validating the
start position of the recreated reader. Several things are checked:
* The partition is the expected one -- the one we were in the middle of
or the next if we stopped at partition boundaries.
* The partition is in the read range.
* The first fragment in the partition is the expected one -- has a
an equal or larger position than the next expected fragment.
* The fragment is in the clustering range as defined by the slice.
As these validations are only done on the slow-path of recreating an
evicted reader, no performance impact is expected.
(cherry picked from commit 0b0ae18a14)
`evictable_reader::update_next_position()` is used to record the position the
reader will continue from, in the next buffer fill. This position is used to
create the partition slice when the underlying reader is evicted and has
to be recreated. There is an optimization in this method -- if the
underlying's buffer is not empty we peek at the first fragment in it and
use it as the next position. This is however problematic for buffer
validation on reader recreation (introduced in the next patch), because
using the next row's position as the next pos will allow for range
tombstones to be emitted with before_key(next_pos.key()), which will
trigger the validation. Instead of working around this, just drop this
optimization for mid-partition positions, it is inconsequential anyway.
We keep it for where it is important, when we detect that we are at a
partition boundary. In this case we can avoid reading the current
partition altogether when recreating the reader.
(cherry picked from commit 91020eef73)
Currently mutation sources are allowed to emit range tombstones that are
out-of the clustering read range if they are relevant to it. For example
a read of a clustering range [ck100, +inf), might start with:
range_tombstone{start={ck1, -1}, end={ck200, 1}},
clustering_row{ck100}
The range tombstone is relevant to the range and the first row of the
range so it is emitted as first, but its position (start) is outside the
read range. This is normally fine, but it poses a problem for evictable
reader. When the underlying reader is evicted and has to be recreated
from a certain clustering position, this results in out-of-order
mutation fragments being inserted into the middle of the stream. This is
not fine anymore as the monotonicity guarantee of the stream is
violated. The real solution would be to require all mutation sources to
trim range tombstones to their read range, but this is a lot of work.
Until that is done, as a workaround we do this trimming in the evictable
reader itself.
(cherry picked from commit 4f2e7a18e2)
Migration manager installs several feature change listeners:
if (this_shard_id() == 0) {
_feature_listeners.push_back(_feat.cluster_supports_view_virtual_columns().when_enabled(update_schema));
_feature_listeners.push_back(_feat.cluster_supports_digest_insensitive_to_expiry().when_enabled(update_schema));
_feature_listeners.push_back(_feat.cluster_supports_cdc().when_enabled(update_schema));
_feature_listeners.push_back(_feat.cluster_supports_per_table_partitioners().when_enabled(update_schema));
}
They will call update_schema_version_and_announce() when features are enabled, which does this:
return update_schema_version(proxy, features).then([] (utils::UUID uuid) {
return announce_schema_version(uuid);
});
So it first updates the schema version and then publishes it via
gossip in announce_schema_version(). It is possible that the
announce_schema_version() part of the first schema change will be
deferred and will execute after the other four calls to
update_schema_version_and_announce(). It will install the old schema
version in gossip instead of the more recent one.
The fix is to serialize schema digest calculation and publishing.
Fixes#7200
(cherry picked from commit 1a57d641d1)
This patch fixes a bug noted in issue #7218 - where PutItem operations
sometimes lose part of the item's data - some attributes were lost,
and the name of other attributes replaced by empty strings. The problem
happened when the write-isolation policy was LWT and there was contention
of writes to the same partition (not necessarily the same item).
To use CAS (a.k.a. LWT), Alternator builds an alternator::rmw_operation
object with an apply() function which takes the old contents of the item
(if needed) and a timestamp, and builds a mutation that the CAS should
apply. In the case of the PutItem operation, we wrongly assumed that apply()
will be called only once - so as an optimization the strings saved in the
put_item_operation were moved into the returned mutation. But this
optimization is wrong - when there is contention, apply() may be called
again when the changed proposed by the previous one was not accepted by
the Paxos protocol.
The fix is to change the one place where put_item_operation *moved* strings
out of the saved operations into the mutations, to be a copy. But to prevent
this sort of bug from reoccuring in future code, this patch enlists the
compiler to help us verify that it can't happen: The apply() function is
marked "const" - it can use the information in the operation to build the
mutation, but it can never modify this information or move things out of it,
so it will be fine to call this function twice.
The single output field that apply() does write (_return_attributes) is
marked "mutable" to allow the const apply() to write to it anyway. Because
apply() might be called twice, it is important that if some apply()
implementation sometimes sets _return_attributes, then it must always
set it (even if to the default, empty, value) on every call to apply().
The const apply() means that the compiler verfies for us that I didn't
forget to fix additional wrong std::move()s. Additionally, a test I wrote
to easily reproduce issue #7218 (which I will submit as a dtest later)
passes after this fix.
Fixes#7218.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200916064906.333420-1-nyh@scylladb.com>
(cherry picked from commit 5e8bdf6877)
test is currently flaky since system reads can happen
in the background and disturb the global row cache stats.
Use the table's row_cache stats instead.
Fixes#6773
Test: cql_query_test.test_cache_bypass(dev, debug)
Credit-to: Botond Dénes <bdenes@scylladb.com>
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20200811140521.421813-1-bhalevy@scylladb.com>
(cherry picked from commit 6deba1d0b4)
There was a typo in get_column_defs_for_filtering(): it checked the
wrong pointer before dereferencing. Add a test exposing the NULL
dereference and fix the typo.
Tests: unit (dev)
Fixes#7198.
Signed-off-by: Dejan Mircevski <dejan@scylladb.com>
(cherry picked from commit 9d02f10c71)
Consider an unpaged query that consumes all of available memory, despite
fea5067dfa which limits them (perhaps the
user raised the limit, or this is a system query). Eventually we will see a
bad_alloc which will abort the query and destroy this reconcilable_result_builder.
During destruction, we first destroy _memory_accounter, and then _result.
Destroying _memory_accounter resumes some continuations which can then
allocate memory synchronously when increasing the task queue to accomodate
them. We will then crash. Had we not crashed, we would immediately afterwards
release _result, freeing all the memory that we would ever need.
Fix by making _result the last member, so it is freed first.
Fixes#7240.
(cherry picked from commit 9421cfded4)
In commit 7d86a3b208 (storage_service:
Make replacing node take writes), application state of TOKENS of the
replacing node is added into gossip and propagated to the cluster after
the initial start of gossip service. This can cause a race below
1. The replacing node replaces the old dead node with the same ip address
2. The replacing node starts gossip without application state of the TOKENS
3. Other nodes in the cluster replace the application states of old dead node's
version with the new replacing node's version
4. replacing node dies
5. replace operation is performed again, the TOKENS application state is
not preset and replace operation fails.
To fix, we can always add TOKENS application state when the
gossip service starts.
Fixes: #7166
Backports: 4.1 and 4.2
(cherry picked from commit 3ba6e3d264)
"
This path set fixes stalls in repair that are caused by std::list merge and clear operations during test_latency_read_with_nemesis test.
Fixes#6940Fixes#6975Fixes#6976
"
* 'fix_repair_list_stall_merge_clear_v2' of github.com:asias/scylla:
repair: Fix stall in apply_rows_on_master_in_thread and apply_rows_on_follower
repair: Use clear_gently in get_sync_boundary to avoid stall
utils: Add clear_gently
repair: Use merge_to_gently to merge two lists
utils: Add merge_to_gently
(cherry picked from commit 4547949420)
We copy a list, which was reported to generate a 15ms stall.
This is easily fixed by moving it instead, which is safe since this is
the last use of the variable.
Fixes#7115.
(cherry picked from commit 6ff12b7f79)
After 8014c7124, cleanup can potentially pick a compacting SSTable.
Upgrade and scrub can also pick a compacting SSTable.
The problem is that table::candidates_for_compaction() was badly named.
It misleads the user into thinking that the SSTables returned are perfect
candidates for compaction, but manager still need to filter out the
compacting SSTables from the returned set. So it's being renamed.
When the same SSTable is compacted in parallel, the strategy invariant
can be broken like overlapping being introduced in LCS, and also
some deletion failures as more than one compaction process would try
to delete the same files.
Let's fix scrub, cleanup and ugprade by calling the manager function
which gets the correct candidates for compaction.
Fixes#6938.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20200811200135.25421-1-raphaelsc@scylladb.com>
(cherry picked from commit 11df96718a)
Never trust Occam's Razor - it turns out that the use-after-free bug in the
"exists" command was caused by two separate bugs. We fixed one in commit
9636a33993, but there is a second one fixed in
this patch.
The problem fixed here was that a "service_permit" object, which is designed to
be copied around from place to place (it contains a shared pointer, so is cheap
to copy), was saved by reference, and the reference was to a function argument
and was destroyed prematurely.
This time I tested *many times* that that test_strings.py passes on both dev and
debug builds.
Note that test/run/redis still fails in a debug build, but due to a different
problem.
Fixes#6469
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Reviewed-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20200825183313.120331-1-nyh@scylladb.com>
(cherry picked from commit 868194cd17)
A missing "&" caused the key stored in a long-living command to be copied
and the copy quickly freed - and then used after freed.
This caused the test test_strings.py::test_exists_multiple_existent_key for
this feature to frequently crash.
Fixes#6469
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200823190141.88816-1-nyh@scylladb.com>
(cherry picked from commit 9636a33993)
The following stall was seen during a cleanup operation:
scylla: Reactor stalled for 16262 ms on shard 4.
| std::_MakeUniq<locator::tokens_iterator_impl>::__single_object std::make_unique<locator::tokens_iterator_impl, locator::tokens_iterator_impl&>(locator::tokens_iterator_impl&) at /usr/include/fmt/format.h:1158
| (inlined by) locator::token_metadata::tokens_iterator::tokens_iterator(locator::token_metadata::tokens_iterator const&) at ./locator/token_metadata.cc:1602
| locator::simple_strategy::calculate_natural_endpoints(dht::token const&, locator::token_metadata&) const at simple_strategy.cc:?
| (inlined by) locator::simple_strategy::calculate_natural_endpoints(dht::token const&, locator::token_metadata&) const at ./locator/simple_strategy.cc:56
| locator::abstract_replication_strategy::get_ranges(gms::inet_address, locator::token_metadata&) const at /usr/include/fmt/format.h:1158
| locator::abstract_replication_strategy::get_ranges(gms::inet_address) const at /usr/include/fmt/format.h:1158
| service::storage_service::get_ranges_for_endpoint(seastar::basic_sstring<char, unsigned int, 15u, true> const&, gms::inet_address const&) const at /usr/include/fmt/format.h:1158
| service::storage_service::get_local_ranges(seastar::basic_sstring<char, unsigned int, 15u, true> const&) const at /usr/include/fmt/format.h:1158
| (inlined by) operator() at ./sstables/compaction_manager.cc:691
| (inlined by) _M_invoke at /usr/include/c++/9/bits/std_function.h:286
| std::function<std::vector<seastar::lw_shared_ptr<sstables::sstable>, std::allocator<seastar::lw_shared_ptr<sstables::sstable> > > (table const&)>::operator()(table const&) const at /usr/include/fmt/format.h:1158
| (inlined by) compaction_manager::rewrite_sstables(table*, sstables::compaction_options, std::function<std::vector<seastar::lw_shared_ptr<sstables::sstable>, std::allocator<seastar::lw_shared_ptr<sstables::sstable> > > (table const&)>) at ./sstables/compaction_manager.cc:604
| compaction_manager::perform_cleanup(table*) at /usr/include/fmt/format.h:1158
To fix, we furturize the function to get local ranges and sstables.
In addition, this patch removes the dependency to global storage_service object.
Fixes#6662
(cherry picked from commit 07e253542d)
needs_cleanup() returns true if a sstable needs cleanup.
Turns out it's very slow because it iterates through all the local
ranges for all sstables in the set, making its complexity:
O(num_sstables * local_ranges)
We can optimize it by taking into account that abstract_replication_strategy
documents that get_ranges() will return a list of ranges that is sorted
and non-overlapping. Compaction for cleanup already takes advantage of that
when checking if a given partition can be actually purged.
So needs_cleanup() can be optimized into O(num_sstables * log(local_ranges)).
With num_sstables=1000, RF=3, then local_ranges=256(num_tokens)*3, it means
the max # of checks performed will go from 768000 to ~9584.
Fixes#6730.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20200629171355.45118-2-raphaelsc@scylladb.com>
(cherry picked from commit cf352e7c14)
Add a version that runs inside a seastar thread. The benefit is that
get_ranges can yield to avoid stalls.
Refs #6662
(cherry picked from commit 94995acedb)
1. The node1 is shutdown
2. The node1 sends shutdown message to node2
3. The node2 receives gossip shutdown message but the handler yields
4. The node1 is restarted
5. The node1 sends new gossip endpoint_state to node2, node2 applies the state
in apply_state_locally and calls gossiper::handle_major_state_change
and then calls gossiper::mark_alive
6. The shutdown message handler in step 3 resumes and sets status of node1 to SHUTDOWN
7. The gossiper::mark_alive fiber in step 5 resumes and calls gossiper::real_mark_alive,
node2 will skip to mark node1 as alive because the status of node1 is
SHUTDOWN. As a result, node1 is alive but it is not marked as UP by node2.
To fix, we serialize the two operations.
Fixes#7032
(cherry picked from commit e6ceec1685)
The test/alternator/run script creates a temporary directory for the Scylla
database in /tmp. The assumption was that this is the fastest disk (usually
even a ramdisk) on the test machine, and we didn't need anything else from
it.
But it turns out that on some systems, /tmp is actually a slow disk, so
this patch adds a way to configure the temporary directory - if the TMPDIR
environment variable exists, it is used instead of /tmp. As before this
patch, a temporary subdirectry is created in $TMPDIR, and this subdirectory
is automatically deleted when the test ends.
The test.py script already passes an appropriate TMPDIR (testlog/$mode),
which after this patch the Alternator test will use instead of /tmp.
Fixes#6750
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200713193023.788634-1-nyh@scylladb.com>
(cherry picked from commit 8e3be5e7d6)
We implemented the order operators (LT, GT, LE, GE, BETWEEN) incorrectly
for binary attributes: DynamoDB requires that the bytes be treated as
unsigned for the purpose of order (so byte 128 is higher than 127), but
our implementation uses Scylla's "bytes" type which has signed bytes.
The solution is simple - we can continue to use the "bytes" type, but
we need to use its compare_unsigned() function, not its "<" operator.
This bug affected conditional operations ("Expected" and
"ConditionExpression") and also filters ("QueryFilter", "ScanFilter",
"FilterExpression"). The bug did *not* affect Query's key conditions
("KeyConditions", "KeyConditionExpression") because those already
used Scylla's key comparison functions - which correctly compare binary
blobs as unsigned bytes (in fact, this is why we have the
compare_unsigned() function).
The patch also adds tests that reproduce the bugs in conditional
operations, and show that the bug did not exist in key conditions.
Fixes#6573
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200603084257.394136-1-nyh@scylladb.com>
(cherry picked from commit f6b1f45d69)
Manually removed tests in test_key_conditions.py that did not exist in this branch
"
There are 5 services, that register their RPC handlers in messaging
service, but quite a few of them unregister them on stop.
Unregistering is somewhat critical, not just because it makes the
code look clean, but also because unregistration does wait for the
message processing to complete, thus avoiding use-after-free's in
the handlers.
In particular, several handlers call service::get_schema_for_write()
which, in turn, may end up in service::maybe_sync() calling for
the local migration manager instance. All those handlers' processing
must be waited for before stopping the migration manager.
The set brings the RPC handlers unregistration in sync with the
registration part.
tests: unit (dev)
dtest (dev: simple_boot_shutdown, repair)
start-stop by hands (dev)
fixes: #6904
"
* 'br-rpc-unregister-verbs' of https://github.com/xemul/scylla:
main: Add missing calls to unregister RPC hanlers
messaging: Add missing per-service unregistering methods
messaging: Add missing handlers unregistration helpers
streaming: Do not use db->invoke_on_all in vain
storage_proxy: Detach rpc unregistration from stop
main: Shorten call to storage_proxy::init_messaging_service
(cherry picked from commit 01b838e291)
A check, to validate that counter column cannot be added into non-counter table,
is missing for alter table statement. Validation is performed when building new
schema, but it's limited to checking that a schema will not contain both counter
and non-counter columns.
Due to lack of validation, the added counter column could be incorrectly
persisted to the schema, but this results in a crash when setting the new
schema to its table. On restart, it can be confirmed that the schema change
was indeed persisted when describing the table.
This problem is fixed by doing proper validation for the alter table statement,
which consists of making sure a new counter column cannot be added to a
non-counter table.
The test cdc_disallow_cdc_for_counters_test is adjusted because one of its tests
was built on the assumption that counter column can be added into a non-counter
table.
Fixes#7065.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20200824155709.34743-1-raphaelsc@scylladb.com>
(cherry picked from commit 1c29f0a43d)
Since older binutils on some distribution does not able to handle
compressed debuginfo generated on Fedora, we need to disable it.
However, debian packager force debuginfo compression since debian/compat = 9,
we have to uncompress them after compressed automatically.
Fixes#6982
(cherry picked from commit 75c2362c95)
The `shard` parameter of `find_db()` is optional and is defaulted to
`None`. When missing, the current shard's database instance is returned.
The problem is that the if condition checking this uses `not shard`,
which also evaluates to `True` if `shard == 0`, resulting in returning
the current shard's database instance for shard 0. Change the condition
to `shard is None` to avoid this.
Fixes: #7016
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200812091546.1704016-1-bdenes@scylladb.com>
(cherry picked from commit 4cfab59eb1)
"
This series backports the series "repair: row_level: prevent deadlocks
when repairing homogenous nodes" (merged as a9c7a1a86) to branch-4.1.
"
Fixes#6272
* 'repair-row-level-evictable-local-reader/branch-4.1' of https://github.com/denesb/scylla:
repair: row_level: destroy reader on EOS or error
repair: row_level: use evictable_reader for local reads
mutation_reader: expose evictable_reader
mutation_reader: evictable_reader: add auto_pause flag
mutation_reader: make evictable_reader a flat_mutation_reader
mutation_reader: s/inactive_shard_read/inactive_evictable_reader/
mutation_reader: move inactive_shard_reader code up
mutation_reader: fix indentation
mutation_reader: shard_reader: extract remote_reader as evictable_reader
mutation_reader: reader_lifecycle_policy: make semaphore() available early
fea83f6 introduced a race between processing (and hence removing)
sstables from `_sstables_with_tables` and registering new ones. This
manifested in sstables that were added concurrently with processing a
batch for the same sstables being dropped and the semaphore units
associated with them not returned. This resulted in repairs being
blocked indefinitely as the units of the semaphore were effectively
leaked.
This patch fixes this by moving the contents of `_sstables_with_tables`
to a local variable before starting the processing. A unit test
reproducing the problem is also added.
Fixes: #6892
Tests: unit(dev)
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200817160913.2296444-1-bdenes@scylladb.com>
(cherry picked from commit 22a6493716)
To avoid having to make it an optional with all the additional checks,
we just replace it with an empty reader instead, this also also achieves
the desired effect of releasing the read permit and all the associated
resources early.
(cherry picked from commit fbbc86e18c)
Row level repair, when using a local reader, is prone to deadlocking on
the streaming reader concurrency semaphore. This has been observed to
happen with at least two participating nodes, running more concurrent
repairs than the maximum allowed amount of reads by the concurrency
semaphore. In this situation, it is possible that two repair instances,
competing for the last available permits on both nodes, get a permit on
one of the nodes and get queued on the other one respectively. As
neither will let go of the permit it already acquired, nor give up
waiting on the failed-to-acquired permit, a deadlock happens.
To prevent this, we make the local repair reader evictable. For this we
reuse the newly exposed evictable reader.
The repair reader is paused after the repair buffer is filled, which is
currently 32MB, so the cost of a possible reader recreation is amortized
over 32MB read.
The repair reader is said to be local, when it can use the shard-local
partitioner. This is the case if the participating nodes are homogenous
(their shard configuration is identical), that is the repair instance
has to read just from one shard. A non-local reader uses the multishard
reader, which already makes its shard readers evictable and hence is not
prone to the deadlock described here.
(cherry picked from commit 080f00b99a)
Expose functions for the outside world to create evictable readers. We
expose two functions, which create an evictable reader with
`auto_pause::yes` and `auto_pause::no` respectively. The function
creating the latter also returns a handle in addition to the reader,
which can be used to pause the reader.
(cherry picked from commit 542d9c3711)
Currently the evictable reader unconditionally pauses the underlying
reader after each use (`fill_buffer()` or `fast_forward_to()` call).
This is fine for current users (the multishard reader), but the future
user we are doing all this refactoring for -- repair -- will want to
control when the underlying reader is paused "manually". Both these
behaviours can easily be supported in a single implementation, so we
add an `auto_pause` flag to allow the creator of the evictable reader
to control this.
(cherry picked from commit 1cc31deff9)
The `evictable_reader` class is almost a proper flat mutation reader
already, it roughly offers the same interface. This patch makes this
formal: changing the class to inherit from `flat_mutation_reader::impl`,
and implement all virtual methods. This also entails a departure from
using the lifecycle policy to pause/resume and create readers, instead
using more general building blocks like the reader concurrency semaphore
and a mutation source.
(cherry picked from commit af9e1c23e1)
Rename `inactive_shard_read` to `inactive_evictable_reader` to reflect
that the fact that the evictable reader is going to be of general use,
not specific to the multishard reader.
(cherry picked from commit 4485864ada)
We want to make the evictable reader mechanism used in the multishard
reader pipeline available for general (re)use, as a standalone
flat mutation reader implementation. The first step is extracting
`shard_reader::remote_reader` the class implementing this logic into a
top-level class, also renamed to `evictable_reader`.
(cherry picked from commit f9d1916499)
Currently all reader lifecycle policy implementations assume that
`semaphore()` will only be called after at least one call to
`make_reader()`. This assumption will soon not hold, so make sure
`semaphore()` can be called at any time, including before any calls are
made to `make_reader()`.
(cherry picked from commit 63309f925c)
Currently we assign the reference to the vector of selected sstables to
`auto sst`. This makes a copy and we pass this local variable to
`do_for_each()`, which will result in a use-after-free if the latter
defers.
Fix by not making a copy and instead just keep the reference.
Fixes: #7060
Tests: unit(dev)
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200818091241.2341332-1-bdenes@scylladb.com>
(cherry picked from commit 78f94ba36a)
Fixes#6561
Pre-image generation in row deletion case only checked if we had a pre-image
result set row. But that can be from post-image. Also check actual existance
of the pre-image CK.
Message-Id: <20200608132804.23541-1-calle@scylladb.com>
(cherry picked from commit 5105e9f5e1)
In one of the longevity tests, we observed 1.3s reactor stall which came from
repair_meta::get_full_row_hashes_source_op. It traced back to a call to
std::unordered_set::insert() which triggered big memory allocation and
reclaim.
I measured std::unordered_set, absl::flat_hash_set, absl::node_hash_set
and absl::btree_set. The absl::btree_set was the only one that seastar
oversized allocation checker did not warn in my tests where around 300K
repair hashes were inserted into the container.
- unordered_set:
hash_sets=295634, time=333029199 ns
- flat_hash_set:
hash_sets=295634, time=312484711 ns
- node_hash_set:
hash_sets=295634, time=346195835 ns
- btree_set:
hash_sets=295634, time=341379801 ns
The btree_set is a bit slower than unordered_set but it does not have
huge memory allocation. I do not measure real difference of total time
to finish repair of the same dataset with unordered_set and btree_set.
To fix, switch to absl btree_set container.
Fixes#6190
(cherry picked from commit 67f6da6466)
(cherry picked from commit a27188886a)
It is a pity we have to list so many libraries, but abseil doesn't
provide a .pc file.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
(cherry picked from commit 7d1f6725dd)
Ref #6190.
This adds the https://abseil.io library as a submodule. The patch
series that follows needs a hash table that supports heterogeneous
lookup, and abseil has a really good hash table that supports that
(https://abseil.io/blog/20180927-swisstables).
The library is still not available in Fedora, but it is fairly easy to
use it directly from a submodule.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
(cherry picked from commit 383a9c6da9)
Ref #6190
The variable seastar_cflags was being used for flags passed to seastar
and for flags extracted from the seastar.pc file.
This introduces a new variable for the flags extracted from the
seastar.pc file.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
(cherry picked from commit 2ad09aefb6)
Ref #6190.
Fixes#6995
In c2c6c71 the assert on replay positions in flushed sstables discarded by
truncate was broken, by the fact that we no longer flush all sstables
unless auto snapshot is enabled.
This means the low_mark assertion does not hold, because we maybe/probably
never got around to creating the sstables that would hold said mark.
Note that the (old) change to not create sstables and then just delete
them is in itself good. But in that case we should not try to verify
the rp mark.
(cherry picked from commit 9620755c7f)
"
When commitlog is recreated in hints manager, only shutdown() method is
called, but not release(). Because of that, some internal commitlog
objects (`segment_manager` and `segment`s) may be left pointing to each
other through shared_ptr reference cycles, which may result in memory
leak when the parent commitlog object is destroyed.
This PR prevents memory leaks that may happen this way by calling
release() after shutdown() from the hints manager.
Fixes: #6409, Fixes#6776
"
* piodul-fix-commitlog-memory-leak-in-hinted-handoff:
hinted handoff: disable warnings about segments left on disk
hinted handoff: release memory on commitlog termination
(cherry picked from commit 4c221855a1)
The column names in SlicePredicate can be passed in arbitrary order.
We converted them to clustering ranges in read_command preserving the
original order. As a result, the clustering ranges in read command may
appear out of order. This violates storage engine's assumptions and
lead to undefined behavior.
It was seen manifesting as a SIGSEGV or an abort in sstable reader
when executing a get_slice() thrift verb:
scylla: sstables/consumer.hh:476: seastar::future<> data_consumer::continuous_data_consumer<StateProcessor>::fast_forward_to(size_t, size_t) [with StateProcessor = sstables::data_consume_rows_context_m; size_t = long unsigned int]: Assertion `end >= _stream_position.position' failed.
Fixes#6486.
Tests:
- added a new dtest to thrift_tests.py which reproduces the problem
Message-Id: <1596725657-15802-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit bfd129cffe)
The "NULL" operator in Expected (old-style conditional operations) doesn't
have any parameters, so we insisted that the AttributeValueList be empty.
However, we forgot to allow it to also be missing - a possibility which
DynamoDB allows.
This patch adds a test to reproduce this case (the test passes on DyanmoDB,
fails on Alternator before this patch, and succeeds after this patch), and
a fix.
Fixes#6816.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200709161254.618755-1-nyh@scylladb.com>
(cherry picked from commit f549d147ea)
On some CLI tools, command options may different between latest version
vs older version.
To maximize compatibility of setup scripts, we should always use
relocatable CLI tools instead of distribution version of the tool.
Related #6954
(cherry picked from commit a19a62e6f6)
For collections and UDTs the `MIN()` and `MAX()` functions are
generated on the fly. Until now they worked by comparing just the
byte representations of arguments.
This patch uses specific per-type comparators to provide semantically
sensible, dynamically created aggregates.
Fixes#6768
(cherry picked from commit 5b438e79be)
Fixes#6828
When using the scylla list index from UUID extension,
null values were not handled properly causing throws
from underlying layer.
(cherry picked from commit 3b74b9585f)
On CentOS7, systemd does not support percentage-based parameter.
To apply memory parameter on CentOS7, we need to override the parameter
in bytes, instead of percentage.
Fixes#6783
(cherry picked from commit 3a25e7285b)
The mutation object may be freed prematurely during commitlog replay
in the schema upgrading path. We will hit the problem if the memtable
is full and apply_in_memory() needs to defer.
This will typically manifest as a segfault.
Fixes#6953
Introduced in 79935df
Tests:
- manual using scylla binary. Reproduced the problem then verified the fix makes it go away
Message-Id: <1596044010-27296-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 3486eba1ce)
Debian package builds provide a root environment for the installation
scripts, since that's what typical installation scripts expect. To
avoid providing actual root, a "fakeroot" system is used where syscalls
are intercepted and any effect that requires root (like chown) is emulated.
However, fakeroot sporadically fails for us, aborting the package build.
Since our install scripts don't really require root (when operating in
the --packaging mode), we can just tell dpkg-buildpackage that we don't
need fakeroot. This ought to fix the sporadic failures.
As a side effect, package builds are faster.
Fixes#6655.
(cherry picked from commit b608af870b)
On GCE, /dev/sda14 reported as unused disk but it's BIOS boot partition,
should not use for scylla data partition, also cannot use for it since it's
too small.
It's better to exclude such partiotion from unsed disk list.
Fixes#6636
(cherry picked from commit d7de9518fe)
We saw scylla hit user after free in repair with the following procedure during tests:
- n1 and n2 in the cluster
- n2 ran decommission
- n2 sent data to n1 using repair
- n2 was killed forcely
- n1 tried to remove repair_meta for n1
- n1 hit use after free on repair_meta object
This was what happened on n1:
1) data was received -> do_apply_rows was called -> yield before create_writer() was called
2) repair_meta::stop() was called -> wait_for_writer_done() / do_wait_for_writer_done was called
with _writer_done[node_idx] not engaged
3) step 1 resumed, create_writer() was called and _repair_writer object was referenced
4) repair_meta::stop() finished, repair_meta object and its member _repair_writer was destroyed
5) The fiber created by create_writer() at step 3 hit use after free on _repair_writer object
To fix, we should call wait_for_writer_done() after any pending
operations were done which were protected by repair_meta::_gate. This
prevents wait for writer done finishes before the writer is in the
process of being created.
Fixes: #6853Fixes: #6868
Backports: 4.0, 4.1, 4.2
(cherry picked from commit e6f640441a)
Turns out the fix f591c9c710 wasn't enough to make sure all input streams
are properly closed on failure.
It only closes the main input stream that belongs to context, but it misses
all the input streams that can be opened in the consumer for promote index
reading. Consumer stores a list of indexes, where each of them has its own
input stream. On failure, we need to make sure that every single one of
them is properly closed before destroying the indexes as that could cause
memory corruption due to read ahead.
Fixes#6924.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20200727182214.377140-1-raphaelsc@scylladb.com>
(cherry picked from commit 0d70efa58e)
In some cases estimated number of partitions can be 0, which is albeit a
legit estimation result, breaks many low-level sstable writer code, so
some of these have assertions to ensure estimated partitions is > 0.
To avoid hitting this assert all users of the sstable writers do the
clamping, to ensure estimated partitions is at least 1. However leaving
this to the callers is error prone as #6913 has shown it. As this
clamping is standard practice, it is better to do it in the writers
themselves, avoiding this problem altogether. This is exactly what this
patch does. It also adds two unit tests, one that reproduces the crash
in #6913, and another one that ensures all sstable writers are fine with
estimated partitions being 0 now. Call sites previously doing the
clamping are changed to not do it, it is unnecessary now as the writer
does it itself.
Fixes#6913
Tests: unit(dev)
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200724120227.267184-1-bdenes@scylladb.com>
[avi: adjust sstable_datafile_test's use of compaction_descriptor and make_permit]
(cherry picked from commit fe127a2155)
from Botond.
Recently it was observed (#6603) that since 4e6400293ea, the staging
reader is reading from a lot of sstables (200+). This consumes a lot of
memory, and after this reaches a certain threshold -- the entire memory
amount of the streaming reader concurrency semaphore -- it can cause a
deadlock within the view update generation. To reduce this memory usage,
we exploit the fact that the staging sstables are usually disjoint, and
use the partitioned sstable set to create the staging reader. This
should ensure that only the minimum number of sstable readers will be
opened at any time.
Refs: #6603Fixes: #6707
Tests: unit(dev)
* 'view-update-generator-use-partitioned-set/v1' of https://github.com/denesb/scylla:
db/view: view_update_generator: use partitioned sstable set
sstables: make_partitioned_sstable_set(): return an sstable_set
(cherry picked from commit e4b74356bb)
Staging SSTables can be incorrectly added or removed from the backlog tracker,
after an ALTER TABLE or TRUNCATE, because the add and removal don't take
into account if the SSTable requires view building, so a Staging SSTable can
be added to the tracker after a ALTER table, or removed after a TRUNCATE,
even though not added previously, potentially causing the backlog to
become negative.
Fixes#6798.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20200716180737.944269-1-raphaelsc@scylladb.com>
(cherry picked from commit b67066cae2)
In case a row hash conflict, a hash in set_diff will get more than one
row from get_row_diff.
For example,
Node1 (Repair master):
row1 -> hash1
row2 -> hash2
row3 -> hash3
row3' -> hash3
Node2 (Repair follower):
row1 -> hash1
row2 -> hash2
We will have set_diff = {hash3} between node1 and node2, while
get_row_diff({hash3}) will return two rows: row3 and row3'. And the
error below was observed:
repair - Got error in row level repair: std::runtime_error
(row_diff.size() != set_diff.size())
In this case, node1 should send both row3 and row3' to peer node
instead of fail the whole repair. Because node2 does not have row3 or
row3', otherwise node1 won't send row with hash3 to node1 in the first
place.
Refs: #6252
(cherry picked from commit a00ab8688f)
* seastar 78f626af6c...c9c1dc5fa7 (2):
> futures: Add a test for a broken promise in a parallel_for_each
> future: Call set_to_broken_promise earlier
Fixes#6749 (probably).
We could hit "cannot serialize '_io.BufferedReader' object" when request get 404 error from the server
Now you will get legit error message in the case.
Fixes#6690
(cherry picked from commit de82b3efae)
WHERE clauses with start point above the end point were handled
incorrectly. When the slice bounds are transformed to interval
bounds, the resulting interval is interpreted as wrap-around (because
start > end), so it contains all values above 0 and all values below
0. This is clearly incorrect, as the user's intent was to filter out
all possible values of a.
Fix it by explicitly short-circuiting to false when start > end. Add
a test case.
Fixes#5799.
Tests: unit (dev)
Signed-off-by: Dejan Mircevski <dejan@scylladb.com>
(cherry picked from commit 921dbd0978)
Counter update is a RMW operation. Until now the "Read" part was
not guarded by a timeout, which is changed in this patch.
Fixes#5069
(cherry picked from commit e04fd9f774)
We already have a docker image option to enable alternator on an unencrypted
port, "--alternator-port", but we forgot to also allow the similar option
for enabling alternator on an encrypted (HTTPS) port: "--alternator-https-port"
so this patch adds the missing option, and documents how to use it.
Note that using this option is not enough. When this option is used,
Alternator also requires two files, /etc/scylla/scylla.crt and
/etc/scylla/scylla.key, to be inserted into the image. These files should
contain the SSL certificate, and key, respectively. If these files are
missing, you will get an error in the log about the missing file.
Fixes#6583.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200621125219.12274-1-nyh@scylladb.com>
(cherry picked from commit e4eca5211a)
When a token is calculated for stream_id, we check that the key is
exactly 16 bytes long. If it's not - `minimum_token` is returned
and client receives empty result.
This used to be the expected behavior for empty keys; now it's
extended to keys of any incorrect length.
Fixes#6570
(cherry picked from commit 8628ede009)
After commit 7d86a3b208 (storage_service:
Make replacing node take writes), during replace operation, tokens in
_token_metadata for node being replaced are updated only after the replace
operation is finished. As a result, in range_streamer::add_ranges, the
node being replaced will be considered as a source to stream data from.
Before commit 7d86a3b208, the node being
replaced will not be considered as a source node because it is already
replaced by the replacing node before the replace operation is finished.
This is the reason why it works in the past.
To fix, filter out the node being replaced as a source node explicitly.
Tests: replace_first_boot_test and replace_stopped_node_test
Backports: 4.1
Fixes: #6728
(cherry picked from commit e338028b7e22b0a80be7f80c337c52f958bfe1d7)
SSTable upgrade is requiring 2x the space of input SSTables because
we aren't releasing references of the SSTables that were already
upgraded. So if we're upgrading 1TB, it means that up to 2TB may be
required for the upgrade operation to succeed.
That can be fixed by moving all input SSTables when rewrite_sstables()
asks for the set of SSTables to be compacted, so allowing their space
to be released as soon as there is no longer any ref to them.
Spotted while auditting code.
Fixes#6682.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20200619205701.92891-1-raphaelsc@scylladb.com>
(cherry picked from commit 52180f91d4)
"
Before this series scylla would effectively infinite loop when, for
example, casting a decimal with a negative scale to float.
Fixes#6720
"
* 'espindola/fix-decimal-issue' of https://github.com/espindola/scylla:
big_decimal: Add a test for a corner case
big_decimal: Correctly handle negative scales
big_decimal: Add a as_rational member function
big_decimal: Move constructors out of line
(cherry picked from commit 3e2eeec83a)
Updating tags was erroneously done locally, which means that
the schema change was not propagated to other nodes.
The new code announces new schema globally.
Fixes#6513
Branches: 4.0,4.1
Tests: unit(dev)
dtest(alternator_tests.AlternatorTest.test_update_condition_expression_and_write_isolation)
Message-Id: <3a816c4ecc33c03af4f36e51b11f195c231e7ce1.1592935039.git.sarna@scylladb.com>
(cherry picked from commit f4e8cfe03b)
"
After "Make replacing node take writes" series, with repair based node
operations disabled, we saw the replace operation fail like:
```
[shard 0] init - Startup failed: std::runtime_error (unable to find
sufficient sources for streaming range (9203926935651910749, +inf) in
keyspace system_auth)
```
The reason is the system_auth keyspace has default RF of 1. It is
impossible to find a source node to stream from for the ranges owned by
the replaced node.
In the past, the replace operation with keyspace of RF 1 passes, because
the replacing node calls token_metadata.update_normal_tokens(tokens,
ip_of_replacing_node) before streaming. We saw:
```
[shard 0] range_streamer - Bootstrap : keyspace system_auth range
(-9021954492552185543, -9016289150131785593] exists on {127.0.0.6}
```
Node 127.0.0.6 is the replacing node 127.0.0.5. The source node check in
range_streamer::get_range_fetch_map will pass if the source is the node
itself. However, it will not stream from the node itself. As a result,
the system_auth keyspace will not get any data.
After the "Make replacing node take writes" series, the replacing node
calls token_metadata.update_normal_tokens(tokens, ip_of_replacing_node)
after the streaming finishes. We saw:
```
[shard 0] range_streamer - Bootstrap : keyspace system_auth range
(-9049647518073030406, -9048297455405660225] exists on {127.0.0.5}
```
Since 127.0.0.5 was dead, the source node check failed, so the bootstrap
operation.
Ta fix, we ignore the table of RF 1 when it is unable to find a source
node to stream.
Fixes#6351
"
* asias-fix_bootstrap_with_rf_one_in_range_streamer:
range_streamer: Handle table of RF 1 in get_range_fetch_map
streaming: Use separate streaming reason for replace operation
(cherry picked from commit 9afd599d7c)
Current sender sends stream_mutation_fragments_cmd::end_of_stream to
receiver when an error is received from a peer node. To be safe, send
stream_mutation_fragments_cmd::error instead of
stream_mutation_fragments_cmd::end_of_stream to prevent end_of_stream to
be written into the sstable when a partition is not closed yet.
In addition, use mutation_fragment_stream_validator to valid the
mutation fragments emitted from the reader, e.g., check if
partition_start and partition_end are paired when the reader is done. If
not, fail the stream session and send
stream_mutation_fragments_cmd::error instead of
stream_mutation_fragments_cmd::end_of_stream to isolate the problematic
sstables on the sender node.
Refs: #6478
(cherry picked from commit a521c429e1)
LWT batches conditions can't span multiple tables.
This was detected in batch_statement::validate() called in ::prepare().
But ::cas_result_set_metadata() was built in the constructor,
causing a bitset assert/crash in a reported scenario.
This patch moves validate() to the constructor before building metadata.
Closes#6332
Tested with https://github.com/scylladb/scylla-dtest/pull/1465
[avi: adjust spelling of exception message to 4.1 spelling]
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
(cherry picked from commit d1521e6721)
The get token range API can become big which can cause large allocation
and stalls.
This patch replace the implementation so it would stream the results
using the http stream capabilities instead of serialization and sending
one big buffer.
Fixes#6297
Signed-off-by: Amnon Heiman <amnon@scylladb.com>
(cherry picked from commit 7c4562d532)
Even if there are no attributes to return from PutItem requests,
we should return a valid JSON object, not an empty string.
Fixes#6568
Tests: unit(dev)
(cherry picked from commit 8fc3ca855e)
Client libraries (e.g. PynamoDB) expect the UnprocessedKeys
and UnprocessedItems attributes to appear in the response
unconditionally - it's hereby added, along with a simple test case.
Fixes#6569
Tests: unit(dev)
(cherry picked from commit 3aff52f56e)
This is relevant only when using partition or clustering keys which
have a representation in memory which is larger than 12.8 KB (10% of
LSA segment size).
There are several places in code (cache, background garbage
collection) which may need to linearize keys because of performing key
comparison, but it's not done safely:
1) the code does not run with the LSA region locked, so pointers may
get invalidated on linearization if it needs to reclaim memory. This
is fixed by running the code inside an allocating section.
2) LSA region is locked, but the scope of
with_linearized_managed_bytes() encloses the allocating section. If
allocating section needs to reclaim, linearization context will
contain invalidated pointers. The fix is to reorder the scopes so
that linearization context lives within an allocating section.
Example of 1 can be found in
range_populating_reader::handle_end_of_stream() where it performs a
lookup:
auto prev = std::prev(it);
if (prev->key().equal(*_cache._schema, *_last_key->_key)) {
it->set_continuous(true);
but handle_end_of_stream() is not invoked under allocating section.
Example of 2 can be found in mutation_cleaner_impl::merge_some() where
it does:
return with_linearized_managed_bytes([&] {
...
return _worker_state->alloc_section(region, [&] {
Fixes#6637.
Refs #6108.
Tests:
- unit (all)
Message-Id: <1592218544-9435-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit e81fc1f095)
When a replacing node is in early boot up and is not in HIBERNATE sate
yet, if the node is killed by a user, the node will wrongly send a
shutdown message to other nodes. This is because UNKNOWN is not in
SILENT_SHUTDOWN_STATES, so in gossiper::do_stop_gossiping, the node will
send shutdown message. Other nodes in the cluster will call
storage_service::handle_state_normal for this node, since NORMAL and
SHUTDOWN status share the same status handler. As a result, other nodes
will incorrectly think the node is part of the cluster and the replace
operation is finished.
Such problem was seen in replace_node_no_hibernate_state_test dtest:
n1, n2 are in the cluster
n2 is dead
n3 is started to replace n2, but n3 is killed in the middle
n3 announces SHUTDOWN status wrongly
n1 runs storage_service::handle_state_normal for n3
n1 get tokens for n3 which is empty, because n3 hasn't gossip tokens yet
n1 skips update normal tokens for n3, but think n3 has replaced n2
n4 starts to replace n2
n4 checks the tokens for n2 in storage_service::join_token_ring (Cannot
replace token {} which does not exist!) or
storage_service::prepare_replacement_info (Cannot replace_address {}
because it doesn't exist in gossip)
To fix, we add UNKNOWN into SILENT_SHUTDOWN_STATES and avoid sending
shutdown message.
Tests: replace_address_test.py:TestReplaceAddress.replace_node_no_hibernate_state_test
Fixes: #6436
(cherry picked from commit dddde33512)
Commit 968177da04 has changed the schema
of cdc_topology_description and cdc_description tables in the
system_distributed keyspace.
Unfortunately this was a backwards-incompatible change: these tables
would always be created, irrespective of whether or not "experimental"
was enabled. They just wouldn't be populated with experimental=off.
If the user now tries to upgrade Scylla from a version before this change
to a version after this change, it will work as long as CDC is protected
b the experimental flag and the flag is off.
However, if we drop the flag, or if the user turns experimental on,
weird things will happen, such as nodes refusing to start because they
try to populate cdc_topology_description while assuming a different schema
for this table.
The simplest fix for this problem is to rename the tables. This fix must
get merged in before CDC goes out of experimental.
If the user upgrades his cluster from a pre-rename version, he will simply
have two garbage tables that he is free to delete after upgrading.
sstables and digests need to be regenerated for schema_digest_test since
this commit effectively adds new tables to the system_distributed keyspace.
This doesn't result in schema disagreement because the table is
announced to all nodes through the migration manager.
(cherry picked from commit d89b7a0548)
Fixes#6537.
GC writer, used for incremental compaction, cannot be currently used if interposer
consumer is used. That's because compaction assumes that GC writer will be operated
only by a single compaction writer at a given point in time.
With interposer consumer, multiple writers will concurrently operate on the same
GC writer, leading to race condition which potentially result in use-after-free.
Let's disable GC writer if interposer consumer is enabled. We're not losing anything
because GC writer is currently only needed on strategies which don't implement an
interposer consumer. Resharding will always disable GC writer, which is the expected
behavior because it doesn't support incremental compaction yet.
The proper fix, which allows GC writer and interposer consumer to work together,
will require more time to implement and test, and for that reason, I am postponing
it as #6472 is a showstopper for the current release.
Fixes#6472.
tests: mode(dev).
[Raphael: Fixed compilation failure in unit test test_bug_6472 for backport]
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Reviewed-by: Glauber Costa <glauber@scylladb.com>
(cherry picked from commit 097a5e9e07)
Message-Id: <20200610203928.86717-1-raphaelsc@scylladb.com>
In 28c3d4 `out()` was used without `shell=True` and was the spliting of arguments
failed cause of the complex commands in the cmd (pipe and such)
Fixes#6159
(cherry picked from commit a2bb48f44b)
We generate a coredump as part of "scylla_coredump_setup" to verify that
coredumps are working. However, we need to *remove* that test coredump
to avoid people and test infrastructure reporting those coredumps.
Fixes#6159
(cherry picked from commit 28c3d4f8e8)
Currently we use a systemd mount (var-lib-systemd-coredump.mount) to mount
default coredump directory (/var/lib/systemd/coredump) to
(/var/lib/scylla/coredump). The /var/lib/scylla had been mounted to a big
storage, so we will have enough space for coredump after the mount.
Currently in coredump_setup, we only enabled var-lib-systemd-coredump.mount,
but not start it. The directory won't be mounted after coredump_setup, so the
coredump will still be saved to default coredump directory.
The mount will only effect after reboot.
Fixes#6566
(cherry picked from commit abf246f6e5)
This reverts commit e77dad3adf because its
incorrect.
Amos explains:
"Quote from https://www.freedesktop.org/software/systemd/man/systemd.mount.html
What=
Takes an absolute path of a device node, file or other resource to
mount. See mount(8) for details. If this refers to a device node, a
dependency on the respective device unit is automatically created.
Where=
Takes an absolute path of a file or directory for the mount point; in
particular, the destination cannot be a symbolic link. If the mount
point does not exist at the time of mounting, it is created as
directory.
So the mount point is '/var/lib/systemd/coredump' and
'/var/lib/scylla/coredump' is the file to mount, because /var/lib/scylla
had mounted a second big storage, which has enough space for Huge
coredumps.
Bentsi or other touched problem with old scylla-master AMI, a coredump
occurred but not successfully saved to disk for enospc. The directory
/var/lib/systemd/coredump wasn't mounted to /var/lib/scylla/coredump.
They WRONGLY thought the wrong mount was caused by the config problem,
so he posted a fix.
Actually scylla-ami-setup / coredump wasn't executed on that AMI, err:
unit scylla-ami-setup.service not found Because
'scylla-ami-setup.service' config file doesn't exist or is invalid.
Details of my testing: https://github.com/scylladb/scylla/issues/6300#issuecomment-637324507
So we need to revert Bentsi's patch, it changed the right config to wrong."
(cherry picked from commit 9d9d54c804)
Our parsing of values in a KeyConditions paramter of Query was done naively.
As a result, we got bizarre error messages "condition not met: false" when
these values had incorrect type (this is issue #6490). Worse - the naive
conversion did not decode base64-encoded bytes value as needed, so
KeyConditions on bytes-typed keys did not work at all.
This patch fixes these bugs by using our existing utility function
get_key_from_typed_value(), which takes care of throwing sensible errors
when types don't match, and decoding base64 as needed.
Unfortunately, we didn't have test coverage for many of the KeyConditions
features including bytes keys, which is why this issue escaped detection.
A patch will follow with much more comprehensive tests for KeyConditions,
which also reproduce this issue and verify that it is fixed.
Refs #6490Fixes#6495
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200524141800.104950-1-nyh@scylladb.com>
(cherry picked from commit 6b38126a8f)
Alternator supports four ways in which write operations can use quorum
writes or LWT or both, which we called "write isolation policies".
Until this patch, Alternator defaulted to the most generally safe policy,
"always_use_lwt". This default could have been overriden for each table
separately, but there was no way to change this default for all tables.
This patch adds a "--alternator-write-isolation" configuration option which
allows changing the default.
Moreover, @dorlaor asked that users must *explicitly* choose this default
mode, and not get "always_use_lwt" without noticing. The previous default,
"always_use_lwt" supports any workload correctly but because it uses LWT
for all writes it may be disappointingly slow for users who run write-only
workloads (including most benchmarks) - such users might find the slow
writes so disappointing that they will drop Scylla. Conversely, a default
of "forbid_rmw" will be faster and still correct, but will fail on workloads
which need read-modify-write operations - and suprise users that need these
operations. So Dor asked that that *none* of the write modes be made the
default, and users must make an informed choice between the different write
modes, rather than being disappointed by a default choice they weren't
aware of.
So after this patch, Scylla refuses to boot if Alternator is enabled but
a "--alternator-write-isolation" option is missing.
The patch also modifies the relevant documentation, adds the same option to
our docker image, and the modifies the test-running script
test/alternator/run to run Scylla with the old default mode (always_use_lwt),
which we need because we want to test RMW operations as well.
Fixes#6452
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200524160338.108417-1-nyh@scylladb.com>
(cherry picked from commit c3da9f2bd4)
In order to be sure that all nodes acknowledged that a table was
created, the CreateTable request will now only return after
seeing that schema agreement was reached.
Rationale: alternator users check if the table was created by issuing
a DescribeTable request, and assume that the table was correctly
created if it returns nonempty results. However, our current
implementation of DescribeTable returns local results, which is
not enough to judge if all the other nodes acknowledge the new table.
CQL drivers are reported to always wait for schema agreement after
issuing DDL-changing requests, so there should be no harm in waiting
a little longer for alternator's CreateTable as well.
Fixes#6361
Tests: alternator(local)
(cherry picked from commit 5f2eadce09)
The existing text did not explain what happens if additional DCs are added
to the cluster, so this patch improves the explanation of the status of
our support for global tables, including that issue.
Fixes#6353
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200513175908.21642-1-nyh@scylladb.com>
(cherry picked from commit f3fd976120)
In write_end_of_stream, it does:
1) Write write_partition_end
2) Write empty mutation_fragment_opt
If 1) fails, 2) will be skipped, the consumer of the queue will wait for
the empty mutation_fragment_opt forever.
Found this issue when injecting random exceptions between 1) and 2).
Refs #6272
Refs #6248
(cherry picked from commit b744dba75a)
The implementation of get_range_to_address_map has a default behaviour,
when getting an empty keypsace, it uses the first non-system keyspace
(first here is basically, just a keyspace).
The current implementation has two issues, first, it uses a reference to
a string that is held on a stack of another function. In other word,
there's a use after free that is not clear why we never hit.
The second, it calls get_non_system_keyspaces twice. Though this is not
a bug, it's redundant (get_non_system_keyspaces uses a loop, so calling
that function does have a cost).
This patch solves both issues, by chaning the implementation to hold a
string instead of a reference to a string.
Second, it stores the results from get_non_system_keyspaces and reuse
them it's more efficient and holds the returned values on the local
stack.
Fixes#6465
Signed-off-by: Amnon Heiman <amnon@scylladb.com>
(cherry picked from commit 69a46d4179)
When the 'forbid_rmw' write isolation policy is selected, read-modify-write
are intentionally forbidden. The error message in this case used to say:
"Read-modify-write operations not supported"
Which can lead users to believe that this operation isn't supported by this
version of Alternator - instead of realizing that this is in fact a
configurable choice.
So in this patch we just change the error message to say:
"Read-modify-write operations are disabled by 'forbid_rmw' write isolation policy. Refer to https://github.com/scylladb/scylla/blob/master/docs/alternator/alternator.md#write-isolation-policies for more information."
Fixes#6421.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200518125538.8347-1-nyh@scylladb.com>
(cherry picked from commit 5ef9854e86)
When index file is larger than 4GB, offset calculation will overflow
uint32_t and _promoted_index_end will be too small.
As a result, promoted_index_size calculation will underflow and the
rest of the page will be interpretd as a promoted index.
The partitions which are in the remainder of the index page will not
be found by single-partition queries.
Data is not lost.
Introduced in 6c5f8e0eda.
Fixes#6040
Message-Id: <20200521174822.8350-1-tgrabiec@scylladb.com>
(cherry picked from commit a6c87a7b9e)
In a recent next failure I got the following backtrace
function=function@entry=0x270360 "seastar::rpc::sink_impl<Serializer, Out>::~sink_impl() [with Serializer = netw::serializer; Out = {repair_row_on_wire_with_cmd}]") at assert.c:101
at ./seastar/include/seastar/core/shared_ptr.hh:463
at repair/row_level.cc:2059
This patch changes a few functions to use finally to make sure the sink
is always closed.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200515202803.60020-1-espindola@scylladb.com>
(cherry picked from commit 311fbe2f0a)
Ref #6414
Consider: n1, n2, n1 is the repair master, n2 is the repair follower.
=== Case 1 ===
1) n1 sends missing rows {r1, r2} to n2
2) n2 runs apply_rows_on_follower to apply rows, e.g., {r1, r2}, r1
is written to sstable, r2 is not written yet, r1 belongs to
partition 1, r2 belongs to partition 2. It yields after row r1 is
written.
data: partition_start, r1
3) n1 sends repair_row_level_stop to n2 because error has happened on n1
4) n2 calls wait_for_writer_done() which in turn calls write_end_of_stream()
data: partition_start, r1, partition_end
5) Step 2 resumes to apply the rows.
data: partition_start, r1, partition_end, partition_end, partition_start, r2
=== Case 2 ===
1) n1 sends missing rows {r1, r2} to n2
2) n2 runs apply_rows_on_follower to apply rows, e.g., {r1, r2}, r1
is written to sstable, r2 is not written yet, r1 belongs to partition
1, r2 belongs to partition 2. It yields after partition_start for r2
is written but before _partition_opened is set to true.
data: partition_start, r1, partition_end, partition_start
3) n1 sends repair_row_level_stop to n2 because error has happened on n1
4) n2 calls wait_for_writer_done() which in turn calls write_end_of_stream().
Since _partition_opened[node_idx] is false, partition_end is skipped,
end_of_stream is written.
data: partition_start, r1, partition_end, partition_start, end_of_stream
This causes unbalanced partition_start and partition_end in the stream
written to sstables.
To fix, serialize the write_end_of_stream and apply_rows with a semaphore.
Fixes: #6394Fixes: #6296Fixes: #6414
(cherry picked from commit b2c4d9fdbc)
When sending hints from one file, rps_set field in send_one_file_ctx
keeps track of commitlog positions of hints that are being currently
sent, or have failed to be sent. At the end of the operation, if sending
of some hints failed, we will choose position of the earliest hint that
failed to be sent, and will retry sending that file later, starting from
that position. This position is stored in _last_not_complete_rp.
Usually, this set has a bounded size, because we impose a limit of at
most 128 hints being sent concurrently. Because we do not attempt to
send any more hints after a failure is detected, rps_set should not have
more than 128 elements at a time.
Due to a bug, commitlog positions of old hints (older than
gc_grace_seconds of the destination table) were inserted into rps_set
but not removed after checking their age. This could cause rps_set to
grow very large when replaying a file with old hints.
Moreover, if the file mixed expired and non-expired hints (which could
happen if it had hints to two tables with different gc_grace_seconds),
and sending of some non-expired hints failed, then positions of expired
hints could influence calculation _last_not_complete_rp, and more hints
than necessary would be resent on the next retry.
This simple patch removes commitlog position of a hint from rps_set when
it is detected to be too old.
Fixes#6422
(cherry picked from commit 85d5c3d5ee)
Related commit: 85d5c3d
When attempting to send a hint, an exception might occur that results in
that hint being discarded (e.g. keyspace or table of the hint was
removed).
When such an exception is thrown, position of the hint will already be
stored in rps_set. We are only allowed to retain positions of hints that
failed to be sent and needed to be retried later. Dropping a hint is not
an error, therefore its position should be removed from rps_set - but
current logic does not do that.
Because of that bug, hint files with many discardable hints might cause
rps_set to grow large when the file is replayed. Furthermore, leaving
positions of such hints in rps_set might cause more hints than necessary
to be re-sent if some non-discarded hints fail to be sent.
This commit fixes the problem by removing positions of discarded hints
from rps_set.
Fixes#6433
(cherry picked from commit 0c5ac0da98)
The "current compatibility with DynamoDB" section in alternator.md is where
we should list very briefly our state of compatibility - it's not the right
place to explain implementation details or track obscure bugs. I've
significantly shortened the "Tags" section because, in brief, we do
fully support tags and should say that we do.
I moved the two bugs mentioned in the text into the bug tracker:
Refs #6389
Refs #6391
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200507125022.22608-1-nyh@scylladb.com>
The patch implements:
- /storage_service/auto_compaction API endpoint
- /column_family/autocompaction/{name} API endpoint
Those APIs allow to control and request the status of background
compaction jobs for the existing tables.
The implementation introduces the table::_compaction_disabled_by_user.
Then the CompactionManager checks if it can push the background
compaction job for the corresponding table.
New members
===
table::enable_auto_compaction();
table::disable_auto_compaction();
bool table::is_auto_compaction_disabled_by_user() const
Test
===
Tests: unit(sstable_datafile_test autocompaction_control_test), manual
$ ninja build/dev/test/boost/sstable_datafile_test
$ ./build/dev/test/boost/sstable_datafile_test --run_test=autocompaction_control_test -- -c1 -m2G --overprovisioned --unsafe-bypass-fsync 1 --blocked-reactor-notify-ms 2000000
The test tries to submit a compaction job after playing
with autocompaction control table switch. However, there is
no reliable way to hook pending compaction task. The code
assumed that with_scheduling_group() closure will never
preempt execution of the stats check.
Revert
===
Reverts commit c8247ac. In previous version the execution
sometimes resulted into the following error:
test/boost/sstable_datafile_test.cc(1076): fatal error: in "autocompaction_control_test":
critical check cm->get_stats().pending_tasks == 1 || cm->get_stats().active_tasks == 1 has failed
This version adds a few sstables to the cf, starts
the compaction and awaits until it is finished.
API change
===
- `/column_family/autocompaction/` always returned `true` while answering to the question: if the autocompaction disabled (see https://github.com/scylladb/scylla-jmx/blob/master/src/main/java/org/apache/cassandra/db/ColumnFamilyStore.java#L321). now it answers to the question: if the autocompaction for specific table is enabled. The question logic is inverted. The patch to the JMX is required. However, the change is decent because all old values were invalid (it always reported all compactions are disabled).
- `/column_family/autocompaction/` got support for POST/DELETE per table
Fixes
===
Fixes#1488Fixes#1808Fixes#440
Signed-off-by: Ivan Prisyazhnyy <ivan@scylladb.com>
Reviewed-by: Glauber Costa <glauber@scylladb.com>
Alternator supports four different write isolation policies, the default
being to do all the writes with LWT, but these policies were only briefly
explained in alternator.md.
This patch significantly expands on this explanation, better explaining
the tradeoffs involved in these four options, and when each might make
sense (if at all).
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200506235152.18190-1-nyh@scylladb.com>
The single-key sstable reader uses the clustering ranges from the slice
to determine the upper bound of the disk read-range using the index.
For this is simply uses the end bound of the last clustering ranges. For
reverse reads however the clustering ranges in the slice are in reverse
order, so this will in fact be the upper bound of the smallest range.
Depending on whether the distance between the clustering range is big
enough for the sstable reader to use the index to skip between them,
this will lead to either reading too little data or an assert failure.
This patch fixes the problematic function `get_slice_upper_bound()` to
consider reverse reads as well.
Initially I thought there will be more mishandling of reverse slices,
but actually `mutation_fragment_filter`, the component doing the actual
slicing of rows, is already reverse-slice aware.
A unit test which reproduces the assert failure is also added.
Fixes: #6171
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200507114956.271799-1-bdenes@scylladb.com>
"
Garbage collected SSTables, created by incremental compaction process,
are being added to the SSTable set using a function that invalidates
row cache using the range of the SSTable itself. That's incorrect
because data in GC SSTables come from preexisting SSTables in set,
meaning the state of data isn't changed and so no need for
invalidation at all. Incorrect invalidation like this is a source of
read performance issues. This problem is fixed by including GC
SSTables to the descriptor which is used to specify changes to the
SSTable set, which is the correct thing to do given that a midway
failure could leave the set in an incorrect state.
Fixes#5956.
Fixes#6275.
tests: unit(dev)
"
* 'fix_issue_5956_v4' of github.com:raphaelsc/scylla:
sstables/compaction: Don't invalidate row cache when adding GC SSTable to SSTable set
sstables/compaction: Change meaning of compaction_completion_desc input and output fields
sstables/compaction: Clean up code around garbage_collected_sstable_writer
The shutdown process of compaction manager starts with an explicit call
from the database object. However that can only happen everything is
already initialized. This works well today, but I am soon to change
the resharding process to operate before the node is fully ready.
One can still stop the database in this case, but reshardings will
have to finish before the abort signal is processed.
This patch passes the existing abort source to the construction of the
compaction_manager and subscribes to it. If the abort source is
triggered, the compaction manager will react to it firing and all
compactions it manages will be stopped.
We still want the database object to be able to wait for the compaction
manager, since the database is the object that owns the lifetime of
the compaction manager. To make that possible we'll use a future
that is return from stop(): no matter what triggered the abort, either
an early abort during initial resharding or a database-level event like
drain, everything will shut down in the right order.
The abort source is passed to the database, who is responsible from
constructing the compaction manager.
Tests: unit (dev), manual start+stop, manual drain + stop
Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20200506184749.98288-1-glauber@scylladb.com>
This is unrelated to counters, but happens to fix#4209
`tuple::delayed_value::contains_bind_marker` used to check that
ALL terms are bound (not that ANY of them is bound). As a result,
scylla would crash in prepare codepath for collections of tuples.
After this fix `invalid_request_exception` is thrown instead.
* jul-stas-4209-crash-on-counter-shards-set:
boost/tests: test for bound variable in a list of tuple literals
cql3: fix detection of bound variables in tuples
So that nested exceptions are not lost. Also, marshal exceptions, the
ones we have in these places, already have a backtrace, so might as well
use that, instead of creating a new one, loosing unwound frames.
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200507091405.244544-1-bdenes@scylladb.com>
Add a couple of cql tests regarding conditional batches:
1. Verify that "delete" takes priority over "insert"
when applied to the same row within the same batch.
2. Test that a workaround for the issue works as expected (i.e.
delete only individual cells instead of the full record).
Tests: unit(dev)
Fixes: #6273
Signed-off-by: Pavel Solodovnikov <pa.solodovnikov@scylladb.com>
Message-Id: <20200506201200.176590-1-pa.solodovnikov@scylladb.com>
`tuple::delayed_value::contains_bind_marker` used to check that
ALL terms are bound (not that ANY of them is bound). As a result,
scylla would crash in prepare codepath for collections. After this
fix `invalid_request_exception` is thrown instead.
Fixes#4209
We must unregister the monitor upon destruction to prevent use-after-free
from `compaction_backlog_tracker::backlog` path.
This is similar to ~compaction_read_monitor as implemented
in commit ca284174d0Fixes#6385
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20200506214419.569655-1-bhalevy@scylladb.com>
In commit da3bf20e71 we supposedly enabled
support for Cassandra's "start_native_transport" option which can be set to
0 to run Scylla without listening on the CQL port. This can be useful, for
example, if a user only want the DynamoDB or Redis APIs but not CQL.
Unfortunately, the option was still marked "Unused", so it wasn't really
enabled as a valid command line option. This patch fixes that, and
documents the start_native_transport option in docs/protocols.md, where
we document the different protocols, ports, and options to configure them.
Fixes#6387.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200506174850.13616-1-nyh@scylladb.com>
fmt, the formatting library we use, detects types with conversion
to std::string_view (and formats them as strings) and types that
support operator<<(std::ostream, const T&) (and performs custom
formatting on them). However, if <fmt/ostream.h>, the latter is
not done.
The problem happens with seastar::sstring, which implements both,
and debug mode, which disables inlining. Some translation units
do include <fmt/ostream.h>, and so generate code to do custom
formatting. exception_utils.cc doesn't, and so generates code
to format via string_view conversion. At link time, the
compiler picks one of the generated functions and includes it
in the final binary; it happened to pick one generated outside
exception_utils.cc, using custom formatting.
However, there is also code in fmt to encode which path fmt
chose - string_view or custom. This code is constexpr and so
is evaluated in exception_utils.cc. The result is that the
function to perform formatting of seastar::sstring uses custom
formatting, while the descriptor containing the method used
says it is formatting via string_view. This is enough to cause
a crash.
The problem is limited to debug mode, since in other modes
all this code is inlined, and so is consistent within the
translation unit.
We need a more general fix (hopefully in fmt), but for now a
simple fix is to add the missing include.
Ref https://github.com/fmtlib/fmt/issues/1662
"
CDC has to create CDC streams that are co-located with corresponding BaseTable data. This is not always easy. Especially for small vnodes. This PR introduces new partitioner which allows us to easily find such stream ids that the stream belongs to a given vnode and shard.
The idea is that a partitioner accepts only keys that are a blob composed of two int64 numbers. The first number is the token of the key.
Tests: unit(dev), dtests(CDC)
"
* haaawk-cdc_partitioner:
cdc:use CDCPartitioner for CDC Log
dht: Add find_first_token_for_shard
dht: use long_token in token::to_int64
cdc: add CDCPartitioner
stream_id: add token_from_bytes static function
i_partitioner: Stop distinguishing whether keys order is preserved
Using shared_ptr's in `unrecognized_entity_exception` can lead
to cross-cpu deletion of a pointer which will trigger an assert
`_cpu == std::this_thread::get_id()' when shared_ptr is disposed.
Copy `column_identifier` to the exception object and avoid using
an instance of `cql3::relation`: just get a string representation
from it since nothing more is used in associated exception
handling code.
Fixes: #6287
Tests: unit(dev, debug), dtest(lwt_destructive_ddl_test.py:LwtDestructiveDDLTest.test_rename_column)
Signed-off-by: Pavel Solodovnikov <pa.solodovnikov@scylladb.com>
Message-Id: <20200506155714.150497-1-pa.solodovnikov@scylladb.com>
When generating view updates, an endpoint can appear both
as a primary paired endpoint for the view update, and as a pending
endpoint (due to range movements). In order not to generate
the same update twice for the same endpoint, the paired endpoint
is removed from the list of pending endpoints if present.
Fixes#5459
Tests: unit(dev),
dtest(TestMaterializedViews.add_dc_during_mv_insert_test)
Following up on 91b71a0b1a
We also need to serialize storage_service::true_snapshots_size
with snapshot-modifying operations.
It seems like it was assumed that get_snapshot_details
is done under run_snapshot_list_operation, but the one called
here is the table method, not the api::storage_service::get_snapshot_details.
Fixes#5603
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20200506115732.483966-1-bhalevy@scylladb.com>
Both `cql3::column_condition` and `cql3::column_condition::raw`
classes are marked as `final`: it's safe to use lw_shared_ptr
instead of generic `seastar::shared_ptr`.
Tests: unit(dev, debug)
Signed-off-by: Pavel Solodovnikov <pa.solodovnikov@scylladb.com>
Message-Id: <20200428202249.82785-1-pa.solodovnikov@scylladb.com>
This patch adds a comprehensive, hopefully complete, test for the
yet-unimplemented FilterExpression feature. FilterExpression is the
modern syntax which allows filtering the results of Query and Scan requests.
The patch includes 50 tests spanning more than 700 lines of code,
testing (hopefully) all the various FilterExpression features,
sub-cases, syntax peculiarities, and so on.
As usual, all included tests pass when run against DynamoDB
("pytest --aws") and xfail when run against Scylla.
This test should be helpful to understand how to implement
FilterExpression correctly, as well as test the future implementation.
Refs #5038.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200503165639.15320-1-nyh@scylladb.com>
We often have to examine raw values, obtained from various sources, like
sstables, logs and coredumps. For some types it is quite simple to
convert raw hex values to human readable ones manually (integers), for
others it is very hard or simply not practical. This command-line tool
aims to ease working with raw values, by providing facilities to print
them in human readable form and compare them. We can extend it with more
functions as needed.
Examples:
$ scylla_types -a print -t Int32Type b34b62d4
-1286905132
$ scylla_types -a compare -t 'ReversedType(TimeUUIDType)' b34b62d46a8d11ea0000005000237906 d00819896f6b11ea00000000001c571b
b34b62d4-6a8d-11ea-0000-005000237906 > d0081989-6f6b-11ea-0000-0000001c571b
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200505124914.104827-1-bdenes@scylladb.com>
* seastar 3c2e27811...e708d1df3 (10):
> Merge "Fix a few issues found by clang's asan" from Rafael
> seastar: app_template: allow a description to be provided for the app
> membarrier: fix madvise(MADV_DONTNEED) failure and crash with --lock-memory
Fixes#6346
> rpc::compressor: Fix static init fiasco with names
> fair_queue: express all internal fair_queue quantities as fair_queue_tickets
> net: remove API v1 compatibility layer (variadic future in networking)
> testing: Move parts of the exchanger out of line
> on_internal_error: add overload taking an std::exception_ptr
> tuple_utils: Add a missing include
> Merge "Fix use of uninitialized found by valgrind" from Rafael
Garbage collected SSTable is incorrectly added to SSTable set with a function
that invalidates row cache. This problem is fixed by adding GC SStable
to set using mechanism which replaces old sstables with new sstables.
Also, adding GC SSTable to set in a separate call is not correct.
We should make sure that GC SSTable reaches the SSTable set at the same time
its respective old (input) SSTable is removed from the set, and that's done
using a single request call to table.
Fixes#5956.
Fixes#6275.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
input_sstables is renamed to old_sstables and is about old SSTables that should be
deleted and removed from the SSTable set.
output_sstables is renamed to new_sstables and is about new SSTable that should be
added to the SSTable set, replacing the old ones.
This will allow us, for example, to add auxiliary SSTables to SSTable set using
the same call which replaces output SSTables by input SSTables in compaction.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
This cleanup allows us to get rid of the ugly compaction::create_new_sstable(),
and reduce complexity by getting rid of observable.
garbage_collected_sstable_writer::data is introduced to allow compaction to
directly communicate with the GC writer, which is stored in mutation_compaction,
making it unreachable after the compaction has started. By making compaction
store GC writer's data and using that same data to create g__c__s__w,
compaction is able to communicate with GC writer without the complexity of
observable utility. This move is important for the subsequent work which
will fix a couple of issues regarding management of GC SSTables.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Alternator server used to print a startup log line for each shard,
which is redundant and creates churn for nodes with many cores.
Instead of all that, a single line is now printed once alternator
server properly boots.
Fixes#6347
Tests: manual(boot), unit(dev)
Currently the following scenario may happen:
Consider 3 nodes A, B and C and a LWT failed write operation that
managed to get V accepted on A. The value is read twice. First read
access B and C and returns nothing. Next one access A and B, notices
failed round and completes it. Returns value V. Since two consequent
reads without any writes in the middle return different value this
breaks linearisability.
This happens because read does not do full paxos round. The patch
makes read code to reuse the same logic as write by writing a dummy
value which ensures that complete paxos round is used.
Currently the following scenario may happen:
Consider 3 nodes A, B and C and a LWT failed write operation that
managed to get V accepted on A. Next operation may be conditioned on a
value been V, but it may access nodes B and C first and fail. Retrying
the same operation without any writes in the middle may now access A
and B and succeed since it will notice V and will complete previous
transaction. Having to different outcome for the same operation without
any writes in the middle breaks linearisability.
This happens because when condition is unmet we abandon the paxos round,
so this patch makes us complete it with empty value. Now if first
conditional write after failure access B and C it will write accepted
ballot there with the value greater than one of V and V will no longer be
replayed ever.
Change the way query result is passed from getting a reference to a
result to getting a foreign_ptr<lw_shared_ptr<query::result>>. This will
allow cas_request to keep it without copying.
Currently, the test seems to use the tmpdir class in a wrong way,
just to get a path to a temporary directory.
It should keep the tmpdir object around for the duration of the test
so the temporary directory will be automatically removed when the test
completes.
Refs #6344
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20200504153810.202218-1-bhalevy@scylladb.com>
Maximum item depth accepted by DynamoDB is 32, and alternator
chose 39 as its arbitrary value in order to provide 7 shining
new levels absolutely free of charge. Unfortunately, our code
which checks the nesting level in rapidjson parsing bumps
the counter by 2 for every object, which is due to rapidjson's
internal implementation. In order to actually support
at least 32 levels, the threshold is simply doubled.
This commit comes with a test case which ensures that
32-nested items are accepted both by alternator and DynamoDB.
The test case failed for alternator before the fix.
Fixes#6366
Tests: unit(dev), alternator(local, remote)
Right now the compaction_manager needs to be started explicitly.
We may change it in the future, but right now that's how it is.
Everything works now even without it, because compaction_manager::stop
happens to work even if it was not started. But it is technically
illegal.
Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20200504143048.17201-1-glauber@scylladb.com>
The issue is that the mount is /var/lib/scylla/coredump ->
/var/lib/systemd/coredump. But we need to do the opposite in order to
save the coredump on the partition that Scylla is using:
/var/lib/systemd/coredump-> /var/lib/scylla/coredump
Fixes#6301
"
Fixes#6067
Makes the scylla endpoint initializations that support TLS use reloadable certificate stores, watching used cert + key files for changes, and reload iff modified.
Tests in separate dtest set.
"
* elcallio-calle/reloadable-tls:
transport: Use reloadable tls certificates
redis: Use reloadable tls certificates
alternator: Use reloadable tls certificates
messaging_service: Use reloadable TLS certificates
Changes messaging service rpc to use reloadable tls
certificates iff tls is enabled-
Note that this means that the service cannot start
listening at construction time if TLS is active,
and user need to call start_listen_ex to initialize
and actually start the service.
Since "normal" messaging service is actually started
from gms, this route too is made a continuation.
std::gmtime() has a sad property of using a global static buffer
for returning its value. This is not thread-safe, so its usage
is replaced with gmtime_r, which can accept a local buffer.
While no regressions where observed in this particular area of code,
a similar bug caused failures in alternator, so it's better to simply
replace all std::gmtime calls with their thread-safe counterpart.
Message-Id: <39e91c74de95f8313e6bb0b12114bf12c0e79519.1588589151.git.sarna@scylladb.com>
Add EX option for SET command, to set TTL for the key.
A behavior of SET EX is same as SETEX command, it just different syntax.
see: https://redis.io/commands/set
Scylla returns the wrong error code (0000 - server internal error)
in response to trying to do authentication/authorization operations
that involves a non-existing role.
This commit changes those cases to return error code 2200 (invalid
query) which is the correct one and also the one that Cassandra
returns.
Tests:
Unit tests (Dev)
All auth and auth_role dtests
The compaction_manager test lives inside a thread and it is not taking
advantage of it, with continuations all over.
One of the side effects of it is that the test is calling stop() twice
on the compaction_manager. While this works today, it is not good
practice. A change I am making is just about to break it.
This patch converts the test to fully use .get() instead of chained
continuations and in doing so also guarantees that the compaction
manager will be RAII-stopped just one, from a defer object.
Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20200503161420.8346-2-glauber@scylladb.com>
Local system tables from `system` namespace use LocalStrategy
replication, so they do not need to be concerned about gc grace
period. Some system tables already set gc grace period to 0,
but other ones, including system.large_partitions, did not.
That may result in millions of tombstones being needlessly
kept for these tables, which can cause read timeouts.
Fixes#6325
Tests: unit(dev), local(running cqlsh and playing with system tables)
"
Intrusive containers often have references between containers elements
that point to some non-first word of the element. This references
currently fly below the radar of `scylla find` and `scylla
generate-object-graph`, as they are looking to references to only the
first word of the objects. So objects that are members of an intrusive
container often appear to have no inbound references at all.
This patch-set improves support for finding such references by looking
for references to non-first words of objects.
It also includes some generic, minor improvements to scylla
generate_object_graph.
"
* 'scylla-gdb.py-scylla-generate-object-graph-linked-lists/v1' of https://github.com/denesb/scylla:
scylla-gdb.py: scylla generate_object_graph: make label of initial vertice bold
scylla-gdb.py: scylla generate_object_graph: remove redundant lookup
scylla-gdb.py: scylla generate_object_graph: print "to" offsets
scylla-gdb.py: scylla generate-object-graph: use value-range to find references
scylla-gdb.py: scylla find: allow finding ranges of values
scylla-gdb.py: find_in_live(): return pointer_metadata instances
Loading SSTables from the main directory is possible, to be compatible with
Cassandra, but extremely dangerous and not recommended.
From the beginning, we recommend using an separate, upload/ directory.
In all this time, perhaps due to how the feature's usefulness is reduced
in Cassandra due to the possible races, I have never seen anyone coming
from Cassandra doing procedures involving refresh at all.
Loading SSTables from the main directory forces us to disable writes to
the table temporarily until the SSTables are sorted out. If we get rid of
this, we can get rid of the disabling of the writes as well.
We can't do it now because if we want to be nice to the odd user that may
be using refresh through the main directory without our knowledge we should
at least error out.
This patch, then, does that: it errors out if SSTables are found in the main
directory. It will not proceed with the refresh, and direct the user to the
upload directory.
The main loop in reshuffle_sstables is left in place structurally for now, but
most of it is gone. The test for is is deleted.
After a period of deprecation we can start ignoring these SSTables and get rid
of the lock.
Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20200429144511.13681-1-glauber@scylladb.com>
When using interposers, cancelling compactions can leave futures
that are not waited for (resharding, twcs)
The reason is when consume_end_of_stream gets called, it tries to
push end_of_stream into the queue_reader_handle. Because cancelling
a compaction is done through an exception, the queue_reader_handle
is terminated already at this time. Trying to push to it generates
another exception and prevents us from returning the future right
below it.
This patch adds a new method is_terminated() and if we detect
that the queue_reader_handle is already terminated by this point,
we don't try to push. We call it is_terminated() because the check
is to see if the queue_reader_handle has a _reader. The reader is
also set to null on successful destruction.
Signed-off-by: Glauber Costa <glauber@scylladb.com>
Reviewed-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200430175839.8292-1-glauber@scylladb.com>
Background:
Replace operation is used to replace a dead node in the cluster.
Currently during replace operation, the replacing node does not take any
writes. As a result, new writes to a range after the sync for that range
is done, e.g., after streaming for that range is finished, will not be
synced to the replacing node. Hinted hand off or repair after the
replacing operation will help. But it is better if we can make the
writes to the replacing node to avoid any post replacing operation
actions.
After this series and repair based node operation series, the replace
operation will guarantee the replacing node has all the latest copy of
data including the new writes during the replace operation. In short, no
more repairs before or after the replacing operation. Just replacing the
node is enough.
Implementation:
Filter the node being replaced out of the natural endpoints in
storage_proxy, so that:
The node being replaced will not be selected as the target for
normal write or normal read.
Do not depend on the gossip liveness to avoid selecting replacing node
for normal write or normal read when the replacing node has the same
ip address as the node being replaced. No more special handling for
hibernate state in gossip which makes it is simpler and more robust.
Replacing node will be marked as UP.
Put the replacing node in the pending list, so that:
Replacing node will take writes but write to replacing will not be
counted as CL.
Replacing node will not take normal read.
Example:
For example, with RF = 3, n1, n2, n3 in the cluster, n3 is dead and
being replaced by node n4. When n4 starts:
writes to nodes {n1, n2, n3} are changed to
normal_replica_writes = {n1, n2} and pending_replica_writes= {n4}.
reads to nodes {n1, n2, n3} are changed to
normal_replica_reads = {n1, n2} only.
This way, the replacing node n4 now takes writes but does not take reads.
Tests:
Measure the number of writes during pending period that is the
replacing starts and finishes the replace operation.
Start 5 nodes, n1 to n5.
Stop n5
Start write in the background
Start n6 to replace n5
Get scylla_database_total_writes metrics when the replacing node announces HIBERNATE (replacing) and NORMAL status.
Before:
2020-02-06 08:35:35.921837 Get metrics when other knows replacing node = HIBERNATE
2020-02-06 08:35:35.939493 scylla_database_total_writes: node1={'scylla_database_total_writes': 15483}
2020-02-06 08:35:35.950614 scylla_database_total_writes: node2={'scylla_database_total_writes': 15857}
2020-02-06 08:35:35.961820 scylla_database_total_writes: node3={'scylla_database_total_writes': 16195}
2020-02-06 08:35:35.978427 scylla_database_total_writes: node4={'scylla_database_total_writes': 15764}
2020-02-06 08:35:35.992580 scylla_database_total_writes: node6={'scylla_database_total_writes': 331}
2020-02-06 08:36:49.794790 Get metrics when other knows replacing node = NORMAL
2020-02-06 08:36:49.809189 scylla_database_total_writes: node1={'scylla_database_total_writes': 267088}
2020-02-06 08:36:49.823302 scylla_database_total_writes: node2={'scylla_database_total_writes': 272352}
2020-02-06 08:36:49.837228 scylla_database_total_writes: node3={'scylla_database_total_writes': 274004}
2020-02-06 08:36:49.851104 scylla_database_total_writes: node4={'scylla_database_total_writes': 262972}
2020-02-06 08:36:49.862504 scylla_database_total_writes: node6={'scylla_database_total_writes': 513}
Writes = 513 - 331
After:
2020-02-06 08:28:56.548047 Get metrics when other knows replacing node = HIBERNATE
2020-02-06 08:28:56.560813 scylla_database_total_writes: node1={'scylla_database_total_writes': 290886}
2020-02-06 08:28:56.573925 scylla_database_total_writes: node2={'scylla_database_total_writes': 310304}
2020-02-06 08:28:56.586305 scylla_database_total_writes: node3={'scylla_database_total_writes': 304049}
2020-02-06 08:28:56.601464 scylla_database_total_writes: node4={'scylla_database_total_writes': 303770}
2020-02-06 08:28:56.615066 scylla_database_total_writes: node6={'scylla_database_total_writes': 604}
2020-02-06 08:29:10.537016 Get metrics when other knows replacing node = NORMAL
2020-02-06 08:29:10.553257 scylla_database_total_writes: node1={'scylla_database_total_writes': 336126}
2020-02-06 08:29:10.567181 scylla_database_total_writes: node2={'scylla_database_total_writes': 358549}
2020-02-06 08:29:10.581939 scylla_database_total_writes: node3={'scylla_database_total_writes': 351416}
2020-02-06 08:29:10.595567 scylla_database_total_writes: node4={'scylla_database_total_writes': 350580}
2020-02-06 08:29:10.610548 scylla_database_total_writes: node6={'scylla_database_total_writes': 45460}
Writes = 45460 - 604
As we can see the replacing node did not take write before and take write after the patch.
Check log of writer handler in storage_proxy
storage_proxy - creating write handler for token: -2642068240672386521,
keyspace_name=ks, original_natrual={127.0.0.1, 127.0.0.5, 127.0.0.2},
natural={127.0.0.1, 127.0.0.2}, pending={127.0.0.6}
The node being replaced, n5=127.0.0.5, is filtered out and the replacing
node, n6=127.0.0.6 is in the pending list.
* asias/replace_take_writes:
storage_service: Make replacing node take writes
repair: Use token_metadata with the replacing node in do_rebuild_replace_with_repair
abstract_replication_strategy: Add get_ranges which takes token_metadata
abstract_replication_strategy: Add get_natural_endpoints_without_node_being_replaced
abstract_replication_strategy: Add allow_remove_node_being_replaced_from_natural_endpoints
token_metadata: Calculate pending ranges for replacing node
storage_service: Unify handling of replaced node removal from gossip
storage_service: Update tokens and replace address for replace operation
The .get_ep_stat(ep) call can throw when registering metrics (we have
issue for it, #5697). This is not expected by it callers, in particular
abstract_write_response_handler::timeout_cb breaks in the middle and
doesn't call the on_timeout() and the _proxy->remove_response_handler(),
which results in not removed and not released responce handler. In turn
not released response handler doesn't set the _ready future on which
response_wait() waits -> stuck.
Although the issue with .get_ep_stat() should be fixed, an exception in
it mustn't lead to deadlocks, so the fix is to make the get_ep_stat()
noexcept by catching the exception and returning a dummy stat object
instead to let caller(s) finish.
Fixes#5985
Tests: unit(dev)
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20200430163639.5242-1-xemul@scylladb.com>
"
This series fix hang in multishard_writer when error happens. It contains
- multishard_writer: Abort the queue attached to consumers when producer fails
- repair: Fix hang when the writer is dead
Fixes#6241
Refs: #6248
"
* asias-stream_fix_multishard_writer_hang:
repair: Fix hang when the writer is dead
mutation_writer_test: Add test_multishard_writer_producer_aborts
multishard_writer: Abort the queue attached to consumers when producer fails
"
The backlog_controller has a timer that periodically accesses the
sstable writers of ongoing writes.
This patch series makes sure we remove entries from the list of ongoing
writes before the corresponding sstable writer is destroyed.
Fixes#6221.
"
* 'espindola/fix-6221-v5' of https://github.com/espindola/scylla:
sstables: Call revert_charges in compaction_write_monitor::write_failed
sstables: Call monitor->write_failed earlier.
sstables: Add write_failed to the write_monitor interface
"Enabling TTL feature, add setex and ttl commands to use it."
* 'redis_setex_ttl' of git://github.com/syuu1228/scylla:
redis: add test for setex/ttl
redis: add ttl command
redis: add setex command
"Add test for lolwut command, and also fix a bug on lolwut found by the test."
* 'redis_lolwut_test' of git://github.com/syuu1228/scylla:
redis: lolwut parameter fix
redis-test: add lolwut test
Background:
Replace operation is used to replace a dead node in the cluster.
Currently during replace operation, the replacing node does not take any
writes. As a result, new writes to a range after the sync for that range
is done, e.g., after streaming for that range is finished, will not be
synced to the replacing node. Hinted hand off or repair after the
replacing operation will help. But it is better if we can make the
writes to the replacing node to avoid any post replacing operation
actions.
After this series and repair based node operation series, the replace
operation will guarantee the replacing node has all the latest copy of
data including the new writes during the replace operation. In short, no
more repairs before or after the replacing operation. Just replacing the
node is enough.
Implementation:
1) Filter the node being replaced out of the natural endpoints in
storage_proxy, so that:
- The node being replaced will not be selected as the target for
normal write or normal read.
- Do not depend on the gossip liveness to avoid selecting replacing node
for normal write or normal read when the replacing node has the same
ip address as the node being replaced. No more special handling for
hibernate state in gossip which makes it is simpler and more robust.
Replacing node will be marked as UP.
2) Put the replacing node in the pending list, so that:
- Replacing node will take writes but write to replacing will not be
counted as CL.
- Replacing node will not take normal read.
Example:
For example, with RF = 3, n1, n2, n3 in the cluster, n3 is dead and
being replaced by node n4. When n4 starts:
- writes to nodes {n1, n2, n3} are changed to
normal_replica_writes = {n1, n2} and pending_replica_writes= {n4}.
- reads to nodes {n1, n2, n3} are changed to
normal_replica_reads = {n1, n2} only.
This way, the replacing node n4 now takes writes but does not take reads.
Tests:
1) Measure the number of writes during pending period that is the
replacing starts and finishes the replace operation.
- Start 5 nodes, n1 to n5.
- Stop n5
- Start write in the background
- Start n6 to replace n5
- Get scylla_database_total_writes metrics when the replacing node announces HIBERNATE (replacing) and NORMAL status.
Before:
2020-02-06 08:35:35.921837 Get metrics when other knows replacing node = HIBERNATE
2020-02-06 08:35:35.939493 scylla_database_total_writes: node1={'scylla_database_total_writes': 15483}
2020-02-06 08:35:35.950614 scylla_database_total_writes: node2={'scylla_database_total_writes': 15857}
2020-02-06 08:35:35.961820 scylla_database_total_writes: node3={'scylla_database_total_writes': 16195}
2020-02-06 08:35:35.978427 scylla_database_total_writes: node4={'scylla_database_total_writes': 15764}
2020-02-06 08:35:35.992580 scylla_database_total_writes: node6={'scylla_database_total_writes': 331}
2020-02-06 08:36:49.794790 Get metrics when other knows replacing node = NORMAL
2020-02-06 08:36:49.809189 scylla_database_total_writes: node1={'scylla_database_total_writes': 267088}
2020-02-06 08:36:49.823302 scylla_database_total_writes: node2={'scylla_database_total_writes': 272352}
2020-02-06 08:36:49.837228 scylla_database_total_writes: node3={'scylla_database_total_writes': 274004}
2020-02-06 08:36:49.851104 scylla_database_total_writes: node4={'scylla_database_total_writes': 262972}
2020-02-06 08:36:49.862504 scylla_database_total_writes: node6={'scylla_database_total_writes': 513}
Writes = 513 - 331
After:
2020-02-06 08:28:56.548047 Get metrics when other knows replacing node = HIBERNATE
2020-02-06 08:28:56.560813 scylla_database_total_writes: node1={'scylla_database_total_writes': 290886}
2020-02-06 08:28:56.573925 scylla_database_total_writes: node2={'scylla_database_total_writes': 310304}
2020-02-06 08:28:56.586305 scylla_database_total_writes: node3={'scylla_database_total_writes': 304049}
2020-02-06 08:28:56.601464 scylla_database_total_writes: node4={'scylla_database_total_writes': 303770}
2020-02-06 08:28:56.615066 scylla_database_total_writes: node6={'scylla_database_total_writes': 604}
2020-02-06 08:29:10.537016 Get metrics when other knows replacing node = NORMAL
2020-02-06 08:29:10.553257 scylla_database_total_writes: node1={'scylla_database_total_writes': 336126}
2020-02-06 08:29:10.567181 scylla_database_total_writes: node2={'scylla_database_total_writes': 358549}
2020-02-06 08:29:10.581939 scylla_database_total_writes: node3={'scylla_database_total_writes': 351416}
2020-02-06 08:29:10.595567 scylla_database_total_writes: node4={'scylla_database_total_writes': 350580}
2020-02-06 08:29:10.610548 scylla_database_total_writes: node6={'scylla_database_total_writes': 45460}
Writes = 45460 - 604
As we can see the replacing node did not take write before and take write after the patch.
2) Check log of writer handler in storage_proxy
storage_proxy - creating write handler for token: -2642068240672386521,
keyspace_name=ks, original_natrual={127.0.0.1, 127.0.0.5, 127.0.0.2},
natural={127.0.0.1, 127.0.0.2}, pending={127.0.0.6}
The node being replaced, n5=127.0.0.5, is filtered out and the replacing
node, n6=127.0.0.6 is in the pending list.
Fixes: #5482
We will change the update of tokens in token_metadata in the next patch
so that the tokens of the replacing node are updated to token_metadata
only after the replace operation is done. In order to get the correct
ranges for the replacing node in do_rebuild_replace_with_repair, we need
to use a copy of token_metadata contains the tokens of the replacing
node.
Refs: #5482
It is useful when the caller wants to calculate ranges using a
custom token_metadata.
It will be used soon in do_rebuild_replace_with_repair for replace
operation.
Refs: #5482
Decide if the replication strategy allow removing the node being replaced from
the natural endpoints when a node is being replaced in the cluster.
LocalStrategy is the not allowed to do so because it always returns the node
itself as the natural_endpoints and the node will not appear in the
pending_endpoints.
It is needed by the "Make replacing node take writes" work.
Refs: #5482
The make_descriptor() function parses a string representation of sstable
version using a ternary operator. Clean it up by using
sstables::from_string(), which is future-proof when we add support for
later sstable formats.
Message-Id: <20200429082126.15944-1-penberg@scylladb.com>
Currently, after the replacing node finishes the replace operation, it
removes the node being replaced from gossip directly in
storage_service::join_token_ring() with gossiper::replaced_endpoint(),
so the gossip states for the replaced node is gone.
When other nodes knows the replace operation is done, they will call
storage_service::remove_endpoint() and gossiper::remove_endpoint() to
quarantine the node but keep the gossip states. To prevent the
replacing node learns the state of replaced node again from existing
node again, the replacing node uses 2X quarantine time.
This makes the gossip states for the replaced node different on other
nodes and replacing nodes. It makes it is harder to reason about the
gossip states because the discrepancy of the states between nodes.
To fix, we unify the handling of replaced node on both replacing node
and other nodes. On all the nodes, once the replacing node becomes
NORMAL status, we remove the replaced node from token_metadata and
quarantine it but keep the gossip state. Since the replaced node is no
longer a member of the cluster, the fatclient timer will count and
expire and remove the replaced node from gossip.
Refs: #5482
The motivation is to make the replacing node has the same view of the
token ring as the rest of the cluster.
If the replacing node has the same ip of the node being replaced, we
should update the tokens in token_metadata when the replace operation
starts, so that this replacing node and the rest of the cluster see the
same token ring.
If the replacing node has the different ip address of the node being
replaced, we should update the tokens in token_metadata only when
replace operation is done, because the other nodes will update the
replacing node's token in token_metadata when the replace operation is
done.
Refs: #5482
The alternator test, test/alternator/run, runs Scylla and runs the
various tests against it. Before this patch, just booting Scylla took
about 26 seconds (for a dev build, on my laptop). This patch reduces
this delay to less than one second!
It turns out that almost the entire delay was artificial, two periods
of 12 seconds "waiting for the gossip to settle", which are completely
unnecessary in the one-node cluster used in the Alternator test.
So a simple "--skip-wait-for-gossip-to-settle 0" parameter eliminates
these long delays completely.
Amusingly, the Scylla boot is now so fast, that I had to change a "sleep 2"
in the test script to "sleep 1", because 2 seconds is now much more than
it takes to boot Scylla :-)
Fixes#6310.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200428145035.22894-1-nyh@scylladb.com>
Client drivers act differently on errors codes they don't recognize.
Adding new errors codes is considered a protocol extension and
should be negotiated with the client.
This change keeps `overflow_error_exception` internally but uses
the INVALID cql error code to return the error message back to the client
similar to keyspace_not_defined_exception.
We (and cassandra) already use `invalid_request_exception` extensively
to return various errors related to invalid values or types in the query.
Fixes#6264
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Reviewed-by: Gleb Natapov <gleb@scylladb.com>
Message-Id: <20200422130011.108003-1-bhalevy@scylladb.com>
The ScanIndexForward parameter is now fully implemented
and can accept ScanIndexForward=false in order to query
the partitions in reverse clustering order.
Note that reading partition slices in reverse order is less
efficient than forward scans and may put a strain on memory
usage, especially for large partitions, since the whole partition
is currently fetched in order to be reversed.
Fixes#5153
1. Remove the `versioned_value::factory` class, it didn't add any value. It just
forced us to create an object for making `versioned_value`s, for no sensible
reason.
2. Move some `versioned_value` deserialization code (string -> internal data
structures) into the versioned_value module. Previously, it was scattered all
around the place.
3. Make `gossiper::get_seeds` const and return a const reference.
I needed these refactors for a PR I was preparing to fix an issue with CDC. The
attempt of fixing the issue failed (I'm trying something different now), but the
refactors might be useful anyway.
* kbr--vv-refactor:
gossiper: make `get_seeds` method const and return a const ref
versioned_value: remove versioned_value::factory class
gms: move TOKENS string deserialization code into versioned_value
This patch allows users of storage_proxy::cas() to supply nullptr
as `query::read_command` which is supposed to skip the procedure
of reading the existing value.
The feature is used in alternator code for Read-Modify-Write
operations: some of them don't require reading previous item
values before updating.
Move `read_nothing_read_command` from alternator code to
storage_proxy layer and fabricate a new no-op command from it when
storage_proxy::cas() is used with nullptr read_command.
This allows to avoid sprinkling if-else branches all over the code
in order to check for null-equality of `cmd`.
We return from storage_proxy::query() very early with an empty
result in case we're given an empty partition_slice (which resides
inside the passed `read_command`) so this approach should be
perfectly fine.
Expand documentation for the `cas()` function to cover new
possible value for `cmd` argument.
Fixes: #6238
Tests: unit(dev, debug)
Signed-off-by: Pavel Solodovnikov <pa.solodovnikov@scylladb.com>
Message-Id: <20200428065235.5714-1-pa.solodovnikov@scylladb.com>
Consdier:
When repair master gets data from repair follower:
1) apply_rows_on_master_in_thread is called
2) a repair writer is created with _repair_writer.create_writer
3) the repair writer fails
4) data is written to the queue _mq[node_idx]->push_eventually attached
with the writer
Since the writer is dead. No one is going to fetch data from the _mq
queue. The apply_rows_on_master_in_thread will block forever.
To fix, when the writer is failed, we should abort the _mq queue.
Refs: #6248
If no keyspace is specified when taking snapshot, there will be a segfault
because keynames is unconditionally dereferenced. Let's return an error
because a keyspace must be specified when column families are specified.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20200427195634.99940-1-raphaelsc@scylladb.com>
This patch adds a '--with-seastar=<PATH>' option to configure.py, which
allows user to override the default seastar submodule path. This is
useful when building packages from source tarballs, for example.
Message-Id: <20200427165511.6448-1-penberg@scylladb.com>
We still call it in the destructor or to cover the successful case. We
can't do that in on_data_write_completed because it is too early.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
A writer is destroyed just before consume_in_thread returns, since the
adapter takes ownership of it.
The problem is that a monitor can keep a reference to the a
writer_offset_tracker that is owned by that writer.
The monitor is accessed periodically via
backlog_controller::_update_timer. This means we have to deregister
from the list of ongoing writes before the writer is destroyed.
If the write fails, the deregistration happens in write_failed, but it
is currently called after the writer is destroyed.
This patch moves the call to write_failed to the writer destructor as
I could not find a convenient location to put it.
Since the writer is destroyed in consume_in_thread, we could call it
there, but then we also have to update consume.
The is a similar problem with the case where the sstable is written
correctly. That will be fixed in the next patch.
Fixes#6221.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Only database_sstable_write_monitor needs it so far, but the call
needs to be moved earlier, which requires calling it in code paths
that don't know about database_sstable_write_monitor.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
that code doesn't run under a thread, so let's futurize it.
the code worked with single cpu because get() returns right away
due to no deferring point.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20200427155303.82763-1-raphaelsc@scylladb.com>
The Alternator test's run script, test/alternator/run, runs Scylla.
By default, it chooses the last built Scylla executable build/*/scylla.
However, test.py has a "mode" option, that should be able to choose which
build mode to run. Before this patch, this mode option wasn't honored by
the Alternator test, so a "test.py alternator/run" would run the same
Scylla binary (the one last built) three times, instead of running each
of the three build modes.
We fix this in this patch: test.py now passes the "SCYLLA" environment
variable to the test/alternator/run script, indicating the location of the
Scylla binary with the appropriate build mode. The script already supported
this environment variable to override its default choice of Scylla binary.
In test.py, we add to the run_test() function an optional "env" parameter
which can be used to pass additional environment variables to the test.
Fixes#6286
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200427131958.28248-1-nyh@scylladb.com>
If you run "dbuild" on a freshly installed machine, the error message is
not the most helpful one. Fix it up.
Before:
$ ./tools/toolchain/dbuild
./tools/toolchain/dbuild: line 113: docker: command not found
./tools/toolchain/dbuild: line 156: docker: command not found
After:
$ ./tools/toolchain/dbuild
dbuild: Please install Docker on this machine to run dbuild.
Run `./tools/toolchain/dbuild --help' to print the full help message.
Message-Id: <20200426192746.11034-1-penberg@scylladb.com>
Fixes#6202
Distributed loader sstable opening is gated through the
database::sstable_load_concurrency_sem() semaphore
(at a concurrency of 3).
This is (according to creation comment) to reduce memory footprint
during bootstrap, by partially serializing the actual opening of
existing sstables.
However, in certain versions of the product, there exist circular
dependencies between data in some sstables and the ability to actually
read others. Thus when gated as above, we can end up with the
dependents acquiring the semaphore fully, and once stuck waiting for
population of their dependency effectively blocking this from ever
happening.
Since we probably do not want to remove the concurrency control,
and increasing it would only push the problem further away,
we solve the issue by adding the ability to mark certain keyspaces
as "prioritized" (pre-bootstrap), and allow them to populate outside
the normal concurrency control semaphore. Concurrency increase is
however limited to one extra sstable per shard and prio keyspace.
Message-Id: <20200415102431.20816-1-calle@scylladb.com>
from Juliusz.
CL of LOCAL_QUORUM used to be hardcoded into CDC preimage query
and led to an error every time the number of replicas was lower than CL
could require. The solution here is to link the CLs of writes
to base table with the CLs of CDC reads, so the client will get
the (limited) control over the consistency of preimage SELECTs
(instead of constant misleading errors).
The algorithm is as follows:
1. If write that caused CDC activity was done with CL = ANY,
then do preimage read with CL = ONE.
2. If write that caused CDC activity was done with CL = ALL,
then do preimage read with CL = QUORUM.
3. SERIAL and LOCAL_SERIAL writes cause preimage read with QUORUM
and LOCAL_QUORUM, respectively.
4. In other cases do preimage read with the same CL as base write.
To further mitigate the incomprehensible error being sent to client,
I wrapped the preimage's SELECT query in try-catch and
intercept the `unavailable_exception`, which was manifesting as
`NoHostAvailable` in Python and Java drivers. Now client gets a new
error code and a message specific to the issue of CL not being met
by the preimage query.
Fixes#5746
* jul-stas-5746-cdc-replication-factor:
cdc: fix the "NoHostAvailable" client error when CL is not met
cdc: CL for preimage select is calculated from base write CL
This commit resolves the client-observable effect of CDC read
consistencies. I wrapped the preimage's SELECT query in try-catch to
intercept the `unavailable_exception`, which led to misleading
`NoHostAvailable` in Python and Java drivers. Now client gets a new
error code and a message specific to the issue of CL not being met
by the preimage query.
Fixes#5746
Queries with `ALLOW FILTERING` and constraints on counter
values used to be rejected as "unimplemented". The reason
was a missing tri-comparator, which is added in this patch.
Fixes#5635
* jul-stas-5635-filtering-on-counters:
cql/tests: Added test for filtering on counter columns
counters: add comparator and remove `unimplemented` from restrictions
The xxhash library has been packaged by Fedora, so we can use it
instead of carrying the submodule. This reduces allows us to
receive updates as the OS packages are updated. Build time will
not be reduced since it is a header-only library.
xxhash preserves the hash results across versions so rolling
upgrades will still work.
The frozen toolchain is updated with the new package.
Tests: unit (dev)
On Centos 7 machine:
fstrim.timer not enabled, only unmasked due scylla_fstrim_setup on installation
When trying run scylla-fstrim service manually you get error:
Traceback (most recent call last):
File "/opt/scylladb/scripts/libexec/scylla_fstrim", line 60, in <module>
main()
File "/opt/scylladb/scripts/libexec/scylla_fstrim", line 44, in main
cfg = parse_scylla_dirs_with_default(conf=args.config)
File "/opt/scylladb/scripts/scylla_util.py", line 484, in parse_scylla_dirs_with_default
if key not in y or not y[k]:
NameError: name 'k' is not defined
It caused by error in scylla_util.py
Fixes#6294.
The "jobs" script is used to determine the amount of compilation
parallelism on a machine. It attempts to ensure each GCC process has at
least 4 GB of memory per core. However, in the worst case scenario, we
could end up having the GCC processes take up all the system memory,
forcin swapping or OOM killer to kick in. For example, on a 4 core
machine with 16 GB of memory, this worst case scenario seems easy to
trigger in practice.
Fix up the problem by keeping a 1 GB of memory reserve for other
processes and calculating parallelism based on that.
Message-Id: <20200423082753.31162-1-penberg@scylladb.com>
When generating tokens for parallel scan, debug mode undefined behavior
sanitizer complained that integer overflow sometimes happens when
multiplying two big values - delta and segment number.
In order to mitigate this warning, the multiplication is now split
into two smaller ones, and the generated machine code remains
identical (verified on gcc and clang via compiler explorer).
Fixes#6280
Tests: unit(dev)
There's no indication that data needed for generating view updates
from staging sstables is going to be immediately useful for the
user, and a large amount of it can push hot rows out of the cache,
thus deteriorating performance.
Fixes#6233
Tests: unit(dev)
We want to test that a std::bad_alloc is thrown, but GCC 10 has a new
optimization (-fallocation-dce) that removes dead allocations.
This patch assigns the value returned by new to a global so that GCC
cannot delete it.
With this all tests in a dev build pass with GCC 10.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200424201531.225807-1-espindola@scylladb.com>
The configure option is --static-stdc++, to is surprising that it also
enables -static-libgcc.
Also, -static-libgcc doesn't seem to work with debug builds.
This patch removes -static-libgcc which fixes debug builds with
--static-stdc++. Such builds are convenient for testing new versions
of gcc.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200424214117.257195-1-espindola@scylladb.com>
This patch has two goals -- speed up the total partitions
calculations (walking databases is faster than walking tables),
and get rid og row_cache._partitions.size() call, which will
not be available on new _partitions collection implementation.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20200423133900.27818-1-xemul@scylladb.com>
Current code gets table->row_cache->cache_tracker->region and sums
up the region's used space for all tables found.
The problem is that all row_cache-s share the same cache_tracker
object from the database, thus the resulting number is not correct.
Fix this by walking cache_tracker-s from databases instead.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20200423133755.27187-1-xemul@scylladb.com>
These callbacks can block a seastar thread and the underlying vector
can be reallocated concurrently.
This is no different than if it was a plain std::vector and the
solution is similar: use values instead of references.
Fixes#6230
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200422182304.120906-1-espindola@scylladb.com>
This produces more compact code and avoids the anti-pattern of
building a map with statically known values. If the values are given
to GCC via a switch statement it can do a much better job at compile
time than libstdc++ can at runtime.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200422224905.198794-1-espindola@scylladb.com>
Alternator is supposed to use RF=3 for new tables. Only when the cluster is
smaller than 3 nodes do we use RF=1 (and warn about it) - this is useful for
testing.
However, our implementation incorrectly tested the number of *live* nodes in
the cluster instead of the total number of nodes. As a result, if a 3-node
cluster had one node down, and a new table was created, it was created with
RF=1, and immediately could not be written because when RF=1, any node down
means part of the data is unavailable.
This patch fixes this: The total number of nodes in the cluster - not the
number of live nodes - is consulted. The three-node-cluster-with-a-dead-node
setup above creates the table with RF=3, and it can be written because two
living nodes out of three are enough when RF=3 and we do quorum writes and
reads.
We have a dtest to reproduce this bug (and its fix), and it's also easy to
reproduce manually by starting a 3-node cluster, killing one of the nodes,
and then running "pytests". Before this patch, the tests can create tables
but then fail to write to them. After this patch, the test succeed on the
same cluster with the dead node.
Fixes#6267
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200422182035.15106-2-nyh@scylladb.com>
The gossiper has a convenience functions get_up_endpoint_count() and
get_down_endpoint_count(), but strangely no function to get the total
number. Even though it's easy to calculate the total by summing up their
result it is inefficient and also incovenient because of of these
functions returns a future.
So let's add another function, get_all_endpoint_count(), to get the
total number of nodes. We will use this function in the next patch.
Signed-off-by: Nadav Har'El <n...@scylladb.com>
Message-Id: <20200422182035.15106-1-nyh@scylladb.com>
After fixing issue #6260, the "parallel scan" feature in Alternator is
supported, so drop the sentence in alternator.md saying that it isn't.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200422090738.21648-1-nyh@scylladb.com>
The Alternator test boots Scylla to test against it. We set an arbitrary
timeout for this boot to succeed: 100 seconds. This 100 seconds is
significantly more than 25 seconds it takes on my laptop, and I though
we'll never reach it. But it turns out that in some setups - running the
very slow debug build on slow and overcommitted nodes - 100 seconds is
not enough.
So this patch doubles the timeout to 200 seconds.
Note that this "200 seconds" is just a timeout, and doesn't affect normal
runs: Both a successful boot and a failed boot are recognized as soon as
they happen, and we never unnecessarily wait the entire 200 seconds.
Fixes#6271.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200422193920.17079-1-nyh@scylladb.com>
This will allow deterministic stream_id generation
and would remove the risk of not being able to generate
a stream id for some vnode.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
* seastar-dev.git gleb/lwt-shared-proposal:
lwt: pass paxos::proposal as a shared pointer everywhere
lwt: do not copy proposal in paxos_state::accept
lwt: make load_paxos_state to take partition_key_view instead of a
deference
Some caller have partition_key_view, but not partition_key, so thy need
to create a temporary and copy just to pass a reference. Change it by
accepting a view.
paxos::proposal reference is passed into a lot of functions and sometimes
it has to be copied to prolong its lifetime. Create it as a shared
pointer and pass it everywhere to avoid those copies.
Fixes#6265
Return type for read_log_file was previously changed from
subscription to future<>, returning the previously returned
subscriptions result of done(). But it did not preserve the
subscription itself, which in turn will cause us to (in
work::stream), call back into a deleted object.
Message-Id: <20200422090856.5218-1-calle@scylladb.com>
Parallel scans can be performed by providing Segment and TotalSegments
attributes to Scan request, which can be used to split the work among
many workers.
This test makes the parallel scan test succeed, so the xfail is removed.
Fixes#5059
The constructor of `read_command` is used both by IDL and clients in the
code. However, this constructor has a parameter that is not used by IDL:
`read_timestamp`. This requires that this parameter is the very last in
the list and that new parameters that are used by IDL are added before
it. One such new parameter was `bool is_first_page`. Adding this
parameter right before the read timestamp one created a situation where
the last parameter (read_timestamp) implicitly converts to the one
before it (is_first_page). This means that some call sites passing
`read_timestamp` were now silently converting this to `is_first_page`,
effectively dropping the timestamp.
This patch aims to rectify this, while also avoiding similar accidents
in the future, by making `is_first_page` a `bool_class` which doesn't
have any implicit convertions defined. This change does not break the
ABI as `bool_class` is also sent as a `bool` on the wire.
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Tests: unit(dev)
Message-Id: <20200422073657.87241-1-bdenes@scylladb.com>
Casts only depend on their operands, so a plain function pointer is
sufficient. This allows replacing all the make_castas_* functions that
return a lambda with plain castas_* functions that do the casting.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200413162014.23884-2-espindola@scylladb.com>
It is currently not possible to wrap the view_updating_consumer in an
std::optional. I intend to do it to allow for compactions to optionally
generate view updates.
The reason for that is that view_updating_consumer has a reference as a
member, which makes the move assignment constructor not be implicitly
generated.
This patch fixes it by keeping a pointer instead of a reference.
Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20200421123648.8328-1-glauber@scylladb.com>
This is a special partitioner that will be used by
CDC Log. It works only with partition key that is blob
composed of two ints. The first int is a token this
partitioner will map the key to. The second int is there
to make it possible to create multiple keys that are different
from each other but map to the same token.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Scylla inherited a concept of partitioners that preserve order of keys from
the origin but it is not used for anything. Moreover, none of the existing
partitioners preserves keys order. The only partitioner that did this in the
past was ByteOrderedPartitioner and Scylla does not support it any more.
For a partitioner to preserve an order of the keys means that if there are two
keys A and B such that A < B then token(A) < token(B) where token(X) isa token
the partitioner assignes to key X.
This patch removes dht::i_partitioner::preserves_order with all its overrides.
The only place that was using this member function was a check in thrift server
and it is safe to remove the check because the check was only done
to differentiate the error message for partitioners that do and do not preserve
the order of the keys.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
CL of LOCAL_QUORUM used to be hardcoded into CDC preimage query
and led to an error when number of replicas was lower than CL
would require. The solution here is to link the CLs of writes
to base table with the CLs of CDC reads, so the client will get
the (limited) control over the consistency of preimage SELECTs
(instead of getting error every time).
The algorithm is as follows:
1. If write that caused CDC activity was done with CL = ANY,
then do preimage read with CL = ONE.
2. If write that caused CDC activity was done with CL = ALL,
then do preimage read with CL = QUORUM.
3. SERIAL and LOCAL_SERIAL writes cause preimage read with QUORUM
and LOCAL_QUORUM, respectively.
4. In other cases do preimage read with the same CL as base write.
Currently, the alternator tests configure scylla to use all the
logical cores in the host system, but only 1GB of RAM. This can lead
to a small amount of memory per core.
It also uses the default disk configuration, which is safe, but can be
very slow on mechanical or non-enterprise disks.
Change to use a fixed --smp 2 configuration, and add --overprovisioned
for maximum flexibility (no spinning). Use --unsafe-bypass-fsync
for faster performance on non-enterprise or mechanical disks, assuming
that the test data is not important.
Fixes#6251.
Message-Id: <20200420154112.123386-1-avi@scylladb.com>
Merged patch series from Piotr Sarna:
This series allows reading rows from Scylla's system tables
via alternator by using a virtual interface.
If a Query or Scan request intercepts a table name with the following
pattern: .scylla.alternator.KEYSPACE_NAME.TABLE_NAME, it will read
the data from Scylla's KEYSPACE_NAME.TABLE_NAME table.
The interface is expected to only return data for Scylla system tables
and trying to access regular tables via this interface is expected
to return an error.
This series comes with tests (alternator-test, scylla_only).
Fixes#6122
Tests: alternator-test(local,remote (to verify that scylla_only works)
Piotr Sarna (5):
alternator: add fallback serialization for all types
alternator: add fetching static columns if they exist
alternator: add a way of accessing system tables from alternator
alternator-test: add scylla-only test for querying system tables
docs: add an entry about accessing Scylla system tables
alternator-test/test_system_tables.py | 61 +++++++++++++++++++++++++++
alternator/executor.cc | 38 ++++++++++++++++-
alternator/executor.hh | 1 +
alternator/serialization.cc | 11 +++--
docs/alternator/alternator.md | 15 +++++++
5 files changed, 122 insertions(+), 4 deletions(-)
create mode 100644 alternator-test/test_system_tables.py
And do the same with CDC_STREAMS_TIMESTAMP.
The code that took a list of tokens represented as a string inside
versioned_value (for gossiping) and deserialized it into
an `unordered_set<dht::token>` lived in the storage_service module,
while the code that did the serializing (set -> string) lived in
versioned_value. There was a similar situation with the CDC generation
timestamp.
To increase maintanability and reusability, the deserialization code is
now placed next to the serialization code in versioned_value.
Furthermore, the `make_full_token_string`, `make_token_string`, and
`make_cdc_streams_timestamp_string` (serialization functions) are moved
out of versioned_value::factory and made static methods of
versioned_value instead.
We have this in multishard_writer:
future<uint64_t> multishard_writer::operator()() {
return distribute_mutation_fragments().finally([this] {
return wait_pending_consumers();
}).then([this] {
return _consumed_partitions;
});
}
The wait_pending_consumers which waits for the consumers to finish is
called even when distribute_mutation_fragments fails.
When distribute_mutation_fragments fails and the failure is due to the
producer fails, consumers can wait for data which will never come because
the producer has failed already. This can cause a deadlock.
To fix, when distribute_mutation_fragments fails, we should abort the
queues that are attached to the readers used by the consumers.
Fixes#6241
-2^63 is a value reserved for min/max token boundaries and shouldn't be used for
regular tokens. This patch fixes get_random_token to never create token with
value -2^63. On the way dht::get_random_number template method is removed
because it was exclusively used by get_random_token.
Also use uniform_int_distribution with int64_t instead of uint64_t by using
correct constructor parameter that guarantees values between -2^63+1 and 2^63-1
inclusively.
Tests: unit(dev)
Fixes#6237.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Message-Id: <0a1a939355f5005039d5c2c7c513bad94cf60be2.1587302093.git.piotr@scylladb.com>
It turned out we cannot drop the information about most recent commit
entirely since it is used to cut off already outdate accepted values.
Otherwise the following scenario can happen:
1. cas1 prepares on A, B, C, gets one accept from A
2. cas2 prepares on B, C, gets 2 accepts on B and C, learns on B, C
3. cas3 initiates a prepare on A, learns about cas1's accept,
4. cas2 learns on A, prunes on A, B, C
Now cas3 will reply cas1's value because it does not know that it is
less than already committed on (removed during step 4).
The patch drops only committed value and keep the information about
latest committed ballot.
Fixed#6154
PAXOS node is allowed to accept a proposal without promising it
first as long as its ballot is greater than already promised one. Treat
such accepted ballot as promised since 'learn' stage removes accepted
ballot, but we still want to remember it as the latest promised one.
The goal is to be closer to formal PAXOS specification.
Max and min windows are microsecond timestamps, which should be divided
by window size in microseconds to properly estimate window count
based on provided mutation_source_metadata.
Found this problem after properly setting mutation_source_metadata with
min and max metadata on behalf of regular compaction.
Fixes#6214.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20200409194235.6004-2-raphaelsc@scylladb.com>
"
Includes:
- code cleanups
- support for measuring data stores with more than one partition
- measure sstable footprint for all supported formats
- less verbose mode by default
"
* tag 'memory-footprint-test-improvement-v2' of github.com:tgrabiec/scylla:
test: memory_footprint: Silence logging by default
test: memory_footprint: Introduce --partition-count option
test: memory_footprint: Run under a cql_test_env
test: memory_footprint: Calculate sstable size for each format version
sstables: Move all_sstable_versions to version.hh
In order to avoid confusion with regard to whose responsibility
it is to sort the key columns (see #5856), the interface which allows
adding columns to the builder with explicit column id
is moved to a private function. An internal with_column_ordered()
overload is maintained to be used for internal operations,
but it's encouraged to use simpler with_column() in new code.
Fixes#6235
Tests: unit(dev)
When multiple key columns (clustering or partition) are passed to
the schema constructor, all having the same column id, the expectation
is that these columns will retain the order in which they were passed to
`schema_builder::with_column()`. Currently however this is not
guaranteed as the schema constructor sort key columns by column id with
`std::sort()`, which doesn't guarantee that equally comparing elements
retain their order. This can be an issue for indexes, the schemas of
which are built independently on each node. If there is any room for
variance between for the key column order, this can result in different
nodes having incompatible schemas for the same index.
The fix is to use `std::stable_sort()` which guarantees that the order
of equally comparing elements won't change.
This is a suspected cause of #5856, although we don't have hard proof.
Fixes: #5856
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
[avi: upgraded "Refs" to "Fixes", since we saw that std::sort() becomes
unstable at 17 elements, and the failing schema had a
clustering key with 23 elements]
Message-Id: <20200417121848.1456817-1-bdenes@scylladb.com>
The io_priority parameter used when generating view updates from
streaming is used by the sstable reader, so it should use the I/O priority
for streaming *read* operations, not streaming *write* operations.
Fixes#6231
Tests: unit(dev)
Commitlog replay is given a filename prefix to filter files against, but it
ignores it. As a result we will replay anything in that directory, including
recycled segments, which is wasteful.
Fix by adding a check for the prefix.
Tests: unit (dev), manual test that regular commitlog files are not
filtered.
Message-Id: <20200416174542.133230-1-avi@scylladb.com>
Some legacy `mc` SSTables (created in Scylla 3.0) may contain incorrect
serialization headers, which don't wrap frozen UDTs nested inside collections
with the FrozenType<...> tag. When reading such SSTable,
Scylla would detect a mismatch between the schema saved in schema
tables (which correctly wraps UDTs in the FrozenType<...> tag) and the schema
from the serialization header (which doesn't have these tags).
SSTables created in Scylla versions 3.1 and above, in particular in
Scylla versions that contain this commit, create correct serialization
headers (which wrap UDTs in the FrozenType<...> tag).
This commit does two things:
1. for all SSTables created after this commit, include a new feature
flag, CorrectUDTsInCollections, presence of which implies that frozen
UDTs inside collections have the FrozenType<...> tag.
2. when reading a Scylla SSTable without the feature flag, we assume that UDTs
nested inside collections are always frozen, even if they don't have
the tag. This assumption is safe to be made, because at the time of
this commit, Scylla does not allow non-frozen (multi-cell) types inside
collections or UDTs, and because of point 1 above.
There is one edge case not covered: if we don't know whether the SSTable
comes from Scylla or from C*. In that case we won't make the assumption
described in 2. Therefore, if we get a mismatch between schema and
serialization headers of a table which we couldn't confirm to come from
Scylla, we will still reject the table. If any user encounters such an
issue (unlikely), we will have to use another solution, e.g. using a
separate tool to rewrite the SSTable.
Fixes#6130.
Clean up the alternator.md document, by:
* Updating out-of-date information that outstayed its welcome.
* When Scylla does have a feature but it's just not supported via the
DynamoDB API (e.g., CDC and on-demand backups) mention that.
* Remove mention of Alternator being experimental and users should not
store important data on it :-)
* Miscellaneous cleanups.
Fixes#6179.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200412094641.27186-1-nyh@scylladb.com>
There is no reason to read a single SSTable at a time from the staging
directory. Moving SSTables from staging directory essentially involves
scanning input SSTables and creating new SSTables (albeit in a different
directory).
We have a mechanism that does that: compactions. In a follow up patch, I
will introduce a new specialization of compaction that moves SSTables
from staging (potentially compacting them if there are plenty).
In preparation for that, some signatures have to be changed and the
view_updating_consumer has to be more compaction friendly. Meaning:
- Operating with an sstable vector
- taking a table reference, not a database
Because this code is a bit fragile and the reviewer set is fundamentally
different from anything compaction related, I am sending this separately
* glommer-view_build:
staging: potentially read many SSTables at the same time
view_build_test: make sure it works with smp > 1
There is no reason to read a single SSTable at a time from the staging
directory. Moving SSTables from staging directory essentially involves
scanning input SSTables and creating new SSTables (albeit in a different
directory).
We have a mechanism that does that: compactions. In a follow up patch, I
will introduce a new specialization of compaction that moves SSTables
from staging (potentially compacting them if there are plenty).
In preparation for that, some signatures have to be changed and the
view_updating_consumer has to be more compaction friendly. Meaning:
- Operating with an sstable vector
- taking a table reference, not a database
Because this code is a bit fragile and the reviewer set is fundamentally
different from anything compaction related, I am sending this separately
Signed-off-by: Glauber Costa <glauber@scylladb.com>
This test doesn't work with higher smp counts, because it relies on
dealing with keys named 'a' and 'b' and creates SSTables containing one
of them manually. This throws an exception if we happen to execute on
a shard that don't own the tokens corresponding to those keys.
This patch avoids that problem by pre-selecting keys that we know to
belong to the current shard in which the test is executed.
Signed-off-by: Glauber Costa <glauber@scylladb.com>
Rename inherited metrics cas_propose and cas_commit
to cas_accept and cas_learn respectively.
A while ago we made a decision to stick to widely accepted
terms for Paxos rounds: prepare, accept, learn. The rest
of the code is using these terms, so rename the metrics
to avoid confusion/technical debt.
While at it, rename a few internal methods and functions.
Fixes#6169
Message-Id: <20200414213537.129547-1-kostja@scylladb.com>
Initially we were storing CDC options in scylla tables but then we realized
that we can use schema extensions. Extensions are more flexible and cause less
problems with schema digest.
The transition was done in 4.0 and with that we stopped reading 'cdc' column
in scylla tables. Commit 861c7b5626 removed
the code that used to read 'cdc' column.
Since no Scylla node should be reading 'cdc' column, we can always keep
it empty now. This will allow removal of schema::cdc_options in the future.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
When I/O error (e.g. EMFILE / ENOSPC) happens we hit
an assert in ~append_challenged_posix_file_impl(): Assertion _closing_state == state::closed' failed.
Commit 6160b9017d add close on failure
of the lamda defined in allocate_segment_ex, but it doesn't handle an error
after the file is opened/created while it is wrapped with commitlog_file_extensions.
Refs #5657
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Reviewed-by: Calle Wilund <calle@scylladb.com>
Message-Id: <20200414115231.298632-1-bhalevy@scylladb.com>
Fixes#6195
test_commitlog_delete_when_over_disk_limit reads current segment list
in flush handler, to compare with result after allowing deletetion of
segement. However, it might be called more than once in rare cases,
because timing and us using rather small sizes.
Reading the list the second time however is not a good idea, because
it might just very well be exactly the same as what we read in the
test check code, and we actually overwrite the list we want to
check against. Because callback is on timer. And test is not.
Message-Id: <20200414114322.13268-1-calle@scylladb.com>
"
This is a continuation of recent efforts to cover more and more internal
de-serialization paths with `on_internal_error()`. Errors like this
should always be investigated but this can only be done with a core.
This patch covers the error paths of `composite::iterator` with
`on_internal_error()`. As we need this patch to investigate a 4.0
blocker issue (#6121) it only does the minimal amount of changes needed
to allow generating a core for de-serializiation failures of composites.
There are a few FIXMEs left in the code that I plan to address in a
follow-up.
Ref: #6121
"
* 'compound-on-internal-error/v1' of https://github.com/denesb/scylla:
compound_compat: composite::iterator cover error-paths with on_internal_error()
compound_compat: composite_view: add is_valid()
For a column of type `frozen<list<T>>` in base table, a corresponding
column of type `frozen<map<timeuuid, T>>` is created in cdc log.
Although a similar change of type takes place in case of non-frozen
lists, this is unneeded in case of frozen lists - frozen collections are
atomic, therefore there is no need for complicated type that will be
able to represent a column update that depends on its previous value
(e.g. appending elements to the end of the list).
Moreover, only cdc log table creation logic performs this type change
for frozen lists. The logic of `transformer::transform`, which is
responsible for creation of mutations to cdc log, assumes that atomic
columns will have their types unchanged in cdc log table. It simply
copies new value of the column from original mutation to the cdc log
mutation. A serialized frozen list might be copied to a field that is of
frozen map type, which may cause the field to become impossible to
deserialize.
This patch causes frozen list base table columns to have a corresponding
column in cdc log with the same type.
A test is added which asserts that the type of cdc log columns is not
changed in the case of frozen base columns.
Tests: unit(dev)
Fixes#6172
We have in alternator-test a set of over 340 functional tests for
Alternator. These tests are written in Python using the pytest
framework, expect Scylla to be running and connect to it using the
DynamoDB API with the "boto3" library (the AWS SDK for Python).
We have a script alternator-test/run which does everything needed
to run all these tests: Starts Scylla with the appropriate parameters
in a temporary directory, runs all the tests against it, and makes
sure the temporary directory is removed (regardless of whether the
tests succeeded or failed).
The goal of this small patch series is to integrate these Alternator
tests into test.py in a *simple* way. The idea is that we add *one*
test which just runs the aforementioned "run" script which does its
own business.
The changes we needed to do in this series to achieve this are:
1. Make the alternator-test/run script pick a unique IP address on which
to listen, instead of always using 127.0.0.1. This allows running
this test in parallel with dtest tests, or even parallel to itself.
2. Move the alternator-test directory to test/alternator. This is
the directory where test.py expects all the tests to live in.
It also makes sense - since we already have multiple subdirectories
in test/, to put the Alternator tests there too.
3. Add a new test suite type, "Run". A "Run" suite is simply a directory
with a script called "run", and this script is run to run the entire
suite, and this script does its own business.
4. Tests (such as the new "Run" ones) who can be killed gently and clean
up after themselves, should be killed with SIGTERM instead of
SIGKILL.
After this series, to run the Alternator tests from test.py, do:
./test.py --mode dev alternator
Note that in this version, the "--mode" has no effect - test/alternator/run
always runs the latest compiled Scylla, regardless of the chosen mode.
This can be fixed later.
The Alternator tests can still be run manually and individually against
a running Scylla or DynamoDB as before - just go to the test/alternator
directory and run "pytest" with the desired parameters.
Fixes#6046
* nyh/alternator-test-v3:
alternator-test: make Alternator tests runnable from test.py
test.py: add xunit XML output file for "Run" tests
test.py: add new test type "Run"
test.py: flag for aborting tests with SIGTERM, not SIGKILL
alternator-test: change "run" script to pick random IP address
alternator-test: add "--url" option to choose Alternator's URL
Since 956b092012 (Merge "Repair based node
operation" from Asias), repair is used by other node operations like
bootstrap, decommission and so on.
Send the reason for the repair, so that we can handle the materialized
view update correctly according to the reason of the operation. We want
to trigger the view update only if the repair is used by repair
operation. Otherwise, the view table will be handled twice, 1) when the
view table is synced using repair 2) when the base table is synced using
repair and view table update is triggered.
Fixes#5930Fixes#5998
Currently, lolwut with some parameters output broken square,
such as "lolwut 10 1 1":
127.0.0.1:6379> lolwut 10 1 1
⠀⡤⠤⠤⠤⠤⠤⠤⠤⠤
⠀⡇⠀⠀⠀⠀⠀⠀⠀⠀
⠀⡇⠀⠀⠀⠀⠀⠀⠀⠀
⠀⡇⠀⠀⠀⠀⠀⠀⠀⠀
It because we passes incorrect parameters on draw_schotter().
Fixes#5808
Seems some gcc:s will generate the code as sign extending. Mine does not,
but this should be more correct anyhow.
Added small stringify test to serialization_test for inet_address
"
We currently allow null on the right-hand side of certain relations, while Cassandra prohibits it. Since our handling of these null values is mostly incorrect, it's better to match Cassandra in prohibiting it.
See the discussion (https://github.com/scylladb/scylla/pull/5763#discussion_r405557323.
NB: any reverse mismatch (Scylla prohibiting something that Cassandra allows) is left remaining. For example, we forbid null bounds on clustering columns, which Cassandra allows.
Tests: unit (dev)
"
* dekimir-match-cass-null:
restrictions: Forbid null bound for nonkey columns
restrictions: Forbid null equality
To make the tests in alternator-test runnable by test.py, we need to
move the directory alternator-test/ to test/alternator, because test.py
only looks for tests in subdirectories of test/. Then, we need to create
a test/alternator/suite.yaml saying that this test directory is of type
"Run", i.e., it has a single run script "run" which runs all its tests.
The "run" script had to be slightly modified to be aware of its new
location relative to the source directory.
To run the Alternator tests from test.py, do:
./test.py --mode dev alternator
Note that in this version, the "--mode" has no effect - test/alternator/run
always runs the latest compiled Scylla, regardless of the chosen mode.
The Alternator tests can still be run manually and individually against
a running Scylla or DynamoDB as before - just go to the test/alternator
directory (instead of alternator-test previously) and run "pytest" with
the desired parameters.
Fixes#6046
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Assumes that "Run" tests can take the --junit-xml=<path> option, and
pass it to ask the test to generate an XML summary of the run to a file
like testlog/dev/xml/run.1.xunit.xml.
This option is honored by the Alternator tests.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
This patch adds a new test type, "Run". A test subdirectory of type "Run"
has a script called "run" which is expected to run all the tests in that
directory.
This will be used, in the next patch, by the Alternator functional tests.
These tests indeed have a "run" script, which runs Scylla and then runs
*all* of Alternator's tests, finishing fairly quickly (in less than a
minute). All of that will become one test.py test.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Today, if test.py is interrupted with SIGINT or SIGTERM, the ongoing test
is killed with SIGKILL. Some types of tests - such as Alternator's test -
may depend on being killed politely (e.g., with SIGTERM) to clean up
files.
We cannot yet change the signal to SIGTERM for all tests, because Seastar
tests often don't deal well with signals, but we can at least add a flag
that certain test types - that know they can be killed gently - will use.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Before this patch, the Alternator tests "run" script ran Scylla on a fixed
listening address, 127.0.0.1. There is a problem that there might be other
concurrent runs of Scylla using the same IP address - e.g., CCM (used by
dtest) uses exactly this IP address for its first node.
Luckily, Linux's loopback device actually allows us to pick any of over
a million addresses in 127.0.0.0/8 to listen on - we don't need to use
127.0.0.1 specifically. So the code in this patch picks an address in
127.1.*.*, so it cannot collide with CCM (which uses 127.0.0.* for up to
255 nodes). Moreover, the last two bytes of the listen address are picked
based on the process ID of the run script; This allows multiple copies
of this script to run concurrently - in case anybody wishes to do that.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
The "--aws" and "--local" test options chooses between two useful default
URLs - Amazon's, or http://localhost:8000 for a local installation.
However, sometimes one wants to run Scylla on a different IP address or
port, so in this patch we add a "--url" option to choose a specific URL to
connect to. For example, "--url http://127.1.2.3:1234".
We will later use this option in the alternator-test/run script, to pick
a random IP address on which to run Scylla, and then run the test against
this address.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
This reverts commit 1c444b7e1e. The test
it adds sometimes fails as follows:
test/boost/sstable_datafile_test.cc(1076): fatal error: in "autocompaction_control_test":
critical check cm->get_stats().pending_tasks == 1 || cm->get_stats().active_tasks == 1 has failed
Ivan is working on a fix, but let's revert this commit to avoid blocking
next promotion failing from time to time.
The first test case checks that system tables are readable via
Scan/Query requests.
The second test case checks that it's not possible to read user tables
by using the virtual interface.
The third test case checks that creating a table which looks like
an internal system table pattern (.scylla.alternator.KS_NAME.TABLE_NAME)
is not possible and returns a validation error.
Scylla's system tables often provide interesting information for
clients. In order to be able to access this information without CQL,
a notion of virtual tables is introduced to alternator.
When a table named .scylla.alternator.KS_NAME.TABLE_NAME is accessed
with read-only operation - Query or Scan, Scylla's internal
KS_NAME.TABLE_NAME table will be queried instead. For instance,
if a user wants to read about system_auth.roles, the Scan request
should target the following table: ".scylla.alternator.system_auth.roles".
Fixes#6122
Until now, the list of static column ids was always empty for alternator
tables anyway, so the list wasn't fetched. However, with the virtual
interface of fetching Scylla internal tables, we need to list the ids
of selected static columns explicitly to avoid segfaults - since we
select the whole row, static columns included.
While most types (e.g. boolean) are not valid key types for alternator users,
system tables derived from Scylla may still use this type for keys,
e.g. system_auth.roles. Note that types which are not directly
supported by alternator (e.g. double) will not be representable
out-of-the-box - instead, they simply fall back to string, which is both
human-readable and supported by alternator.
This patch adds API endpoint /column_family/autocompaction/{name}
that listen to GET and POST requests to pick and control table
background compactions.
To implement that the patch introduces "_compaction_disabled_by_user"
flag that affects if CompactionManager is allowed to push background
compactions jobs into the work.
It introduces
table::enable_auto_compaction();
table::disable_auto_compaction();
bool table::is_auto_compaction_disabled_by_user() const
to control auto compaction state.
Fixes#1488Fixes#1808Fixes#440
Tests: unit(sstable_datafile_test autocompaction_control_test), manual
Casting a type to itself doesn't make sense, but it is harmless so allow
it instead of reporting a confusing error message that makes even less
sense:
InvalidRequest: Error from server: code=2200 [Invalid query]
message="org.apache.cassandra.db.marshal.BooleanType cannot be cast
to org.apache.cassandra.db.marshal.BooleanType"
Note that some types already supported self-casting, this patch just
extends this to all types in a forward compatible way.
Fixes: #5102
Tests: unit(dev), manual test casting boolean to boolean.
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200408135041.854981-1-bdenes@scylladb.com>
If a table name is not found, it may still exist as a local index,
but the check tried to fetch a local index name regardless if it was
present in the request, which was a nullptr dereference bug.
Fixes#6161
Tests: alternator-test(local, remote)
Message-Id: <428c21e94f6c9e450b1766943677613bd46cbc68.1586347130.git.sarna@scylladb.com>
We typically use `std::bad_function_call` to throw from
mandatory-to-implement virtual functions, that cannot have a meaningful
implementation in the derived class. The problem with
`std::bad_function_call` is that it carries absolutely no information
w.r.t. where was it thrown from.
I originally wanted to replace `std::bad_function_call` in our codebase
with a custom exception type that would allow passing in the name of the
function it is thrown from to be included in the exception message.
However after I ended up also including a backtrace, Benny Halevy
pointed out that I might as well just throw `std:bad_function_call` with
a backtrace instead. So this is what this patch does.
All users are various unimplemented methods of the
`flat_mutation_reader::impl` interface.
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200408075801.701416-1-bdenes@scylladb.com>
* seastar fd9af3a26...cce2ddac1 (6):
> rpc: fix build failures in C++14 mode due to std::string_view
> util/backtrace: introduce make_backtraced_exception_ptr()
> future: make do_for_each noexcept
> fair_queue rename the fair_queue_descriptor and change its default init
> future: do_with: make noexcept
> io_queue: batch communication with the fair_queue for ready requests
Fixes#6143
When doing post-image generation, we also write values for columns not
in delta (actual update), based on data selected in pre-image row.
However, if we are doing initial update/insert with only a subset of
columns, when the pre-image result set is nil, this cannot be done.
Adds check to non-touched column post-image code. Also uses the
pre-image value extractor to handle non-atomic sets properly.
Tests updated.
This miniseries provides workarounds for open-ended range tombstones
reportedly appearing in alternator tables. The issue was that
row tombstones created for tables without clustering keys look
like open-ended range tombstones, which confuses the LA/KA format
writer.
Tests: alternator-test(local)
Fixes#6035
Refs #6157
The following sleep injections are added to paxos_state:
* paxos_state_prepare_timeout (timeouts in paxos_state::prepare)
* paxos_state_accept_timeout (timeouts in paxos_state::accept)
* paxos_state_learn_timeout (timeouts in paxos_state::learn)
Tests: unit ({dev}), unit ({debug})
Signed-off-by: Pavel Solodovnikov <pa.solodovnikov@scylladb.com>
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Message-Id: <20200403092107.181057-1-alejo.sanchez@scylladb.com>
Creating an index on a table with only the partition key
can lead to open-ended range tombstones appearing,
if the indexed column is also the very same partition key -
which is quite a useless case, but it's allowed both by alternator
and DynamoDB. In order to make the tests pass when KA/LA sstables
are used, this test case is hereby skipped until further notice.
Refs #6157
As @tgrabiec helpfully pointed out, creating a row tombstone
for a table which does not have a clustering key in its schema
creates something that looks like an open-ended range tombstone.
That's problematic for KA/LA sstable formats, which are incapable
of writing such tombstones, so a workaround is provided
in order to allow using KA/LA in alternator.
Fixes#6035
I am about to change resharding to block the start of the node. Being a
somewhat slow operation, the timeout of 900 sec is guaranteed to trigger
in large nodes with lots of data.
This patch effectively disables the start timeout, while keeping the
stop timeout unchanged.
My preference would have been to use a timeout extension mechanism
during resharding. Systemd actually has such mechanism, where we can
send a message through sd_notify asking the timeout to be extended.
However such mechanism is not present in SystemD v219, used by RHEL7.
That means for RHEL7 we need a different way to deal with the timeout
anyway.
The second preference is also obviously to write "infinity" as the
timeout value. But guess what? SystemD v219 also has a bug in which
infinity is interepreted as zero
(https://bugzilla.redhat.com/show_bug.cgi?id=1446015)
Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20200407155754.10020-1-glauber@scylladb.com>
But only non-validation error paths. When validating we do expect it to
maybe fail, so we don't want to generate cores for validation.
Validation is in fact a de-serialization pass with some additional
checks. To be able to keep reusing the same code for de-serialization
and validation just with different error handling, introduce a
`strict_mode` flag that can be passed to `composite::iterator`
constructor. When in strict mode (the default) the iterator will convert
any `marshal_exception` thrown during the de-serialization to
`on_internal_error()`.
We don't want anybody to use the iterator in non-strict mode, besides
validation, so the iterator constructors are made private. This is
standard practice for iterators anyway.
Freezing is an expensive operation, that involves serializing the entire
mutation. Having an implicit freezing constructor means this can happen
as part of an implicit type conversion without the programmer even
noticing, even when this is not really necessary.
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200407080245.234021-1-bdenes@scylladb.com>
Until now this was open-coded in `sstables::validate_min_max_metadata()`.
We want to cover non-validation compound de-serialization error-paths
with `on_internal_error()` and so we need more control over how
compounds are validated. As a first step we want to centralize
validation in the class itself as in the next patches they will use
private APIs to bypass `on_internal_error()` in the error paths during
validation.
Any alter table statement that doesn't explicitly set the default time
to live will reset it to 0.
That can be very dangerous for time series use cases, which rely on
all data being eventually expired, and a default TTL of 0 means
data never being expired.
Fixes#5048.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20200402211653.25603-1-raphaelsc@scylladb.com>
podman is compatible with docker, but by default emits a manifest
format that is not understood by old docker clients, so give it
an extra flag to generate the old format instead.
Message-Id: <20200406134526.21521-1-avi@scylladb.com>
There is no reason why the table code has to be aware of the efforts of
rewriting (cleanup, scrub, upgrade) an SSTable versus compacting it.
Rewrite is special, because we need to do it one SSTable at a time,
without lumping it together. However, the compaction manager is totally
capable of doing that itself. If we do that, the special
"table::rewrite_sstables" can be killed.
This code would maybe be better off as a thread, where we wouldn't need
to keep state. However there are some methods like maybe_stop_on_error()
that expect a future so I am leaving this be for now. This is a cleanup
that can be done later.
Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20200401162722.28780-2-glauber@scylladb.com>
Merged pull request https://github.com/scylladb/scylla/pull/6136 from
Piotr Sarna:
An empty partition/clustering key pair is a valid state of the
query paging state. Unfortunately, recent attempts at debugging
a flaky test (#5856) resulted in introducing an assertion (7616290)
which breaks when trying to generate a key from such a pair.
In order to keep the assertion (since it still makes sense in its
scope), but at the same time translate empty keys properly,
empty keys are now explicitly processed at the beginning of the
function.
This behaviour was 100% reproducible in a secondary index dtest below.
Fixes#6134
Refs #5856
Tests: unit(dev),
dtest(TestSecondaryIndexes.test_truncate_base)
To use install.sh as Scylla install script w/o using .rpm/.deb package,
we need to provide a way to upgrade Scylla version, not just install.
With --upgrade option, install.sh does not overwrite config files.
It will install <filename>.new file on same directory, when old config file and
new config file does not contain same data.
If old one and new one is exactly same, it will nothing.
To implement this, rewriting api_ui_dir/api_doc_dir path on scylla.yaml
moved from .rpm/.deb scriptlet to install.sh.
Fixes#5874
On some environment systemd-coredump does not work with symlink directory,
we can use bind-mount instead.
Also, it's better to check systemd-coredump is working by generating coredump.
To fix#5916, drop scylla_coredump_setup from .rpm %post scriptlet.
Fixes#5753Fixes#5916
"
This patch makes makes major compaction aware of time buckets
for TWCS. That means that calling a major compaction with TWCS
will not bundle all SSTables together, but rather split them
based on their timestamps.
There are two motivations for this work:
Telling users not to ever major compact is easier said than
done: in practice due to a variety of circumstances it might
end up being done in which case data will have a hard time
expiring later.
We are about to start working with offstrategy compactions,
which are compactions that work in parallel with the main
compactions. In those cases we may be converting SSTables from
one format to another and it might be necessary to split a single
big STCS SSTable into something that TWCS expects
In order to achieve that, we start by changing the way resharding works:
it will now work with a read interposer, similar to the one TWCS uses for
streaming data. Once we do that, a lot of assumptions that exist in the
compaction code can be simplified and supporting TWCS major
compactions become a matter of simply enabling its interposer in the
compaction code as well.
There are many further simplifications that this work exposes:
The compaction method create_new_sstable seems out of place. It is not
used by resharding, and it seems duplicated for normal compactions. We
could clean it up with more refactoring in a later patch.
The whole logic of the feed_writer could be part of the consumer code.
Testing details:
scylla unit tests (dev, release)
sstable_datafile_test (debug)
dtests (resharding_test.py)
manual scylla resharding
Fixes#1431
"
Reviewed-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
* 'twcs-major-v3' of github.com:glommer/scylla:
compaction: make major compaction time-aware with TWCS
compaction: do resharding through an interposer
mutation_writer: introduce shard_based splitting writer
mutation_writer: factor out part of the code for the timestamp splitter
compaction: abort if create_new_sstable is called from resharding
If get_schema_for_read() fails "prune" counter will not be decremented.
The patch fixes it by creating RAI object earlier. Also return releasing
of a mutation in release_mutation() which was dropped by mistake.
Fixes#6124
Message-Id: <20200405080233.GA22509@scylladb.com>
Since commit 9948f548a5, the LWT no longer
requires an "experimental" flag, so Alternator documents and scripts
which referred to the need for enabling experimental LWT, are fixed here
to no longer do that.
Fixes#6118.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200405143237.12693-1-nyh@scylladb.com>
When trying to get rid of a large stack warning for gossip test,
I found out that it actually does not run at all for multiple reasons:
1. It segfaults due to wrong initialization order
2. After fixing that, it segfaults on use-after-free (due to capturing
a shared pointer by reference instead of by copy)
3. After that, cleanups are in order:
* seastar thread does not need to be spawned inside another thread;
* default captures are harmful, so they're made explicit instead;
* db::config is moved to heap, to finally get rid of the warning.
Tests: manual(gossip)
Message-Id: <feaca415d0d29a16c541f9987645365310663630.1585128338.git.sarna@scylladb.com>
In order to check regressions related to #6136 and similar issues,
test cases for handling paging state with empty partition/clustering
key pair are added.
An empty partition/clustering key pair is a valid state of the
query paging state. Unfortunately, recent attempts at debugging
a flaky test resulted in introducing an assertion which breaks
when trying to generate a key from such a pair.
In order to keep the assertion (since it still makes sense in its
scope), but at the same time translate empty keys properly,
empty keys are now explicitly processed at the beginning of the
function.
This behaviour was 100% reproducible in a secondary index dtest below.
Fixes#6134
Refs #5856
Tests: unit(dev),
dtest(TestSecondaryIndexes.test_truncate_base)
Specify --pull in order to refresh the base image (some Fedora release).
Usually this is not important, because we run `dnf update`. But if the
cached image happens to be a pre-release version of Fedora, the image
will have the update-testing repository enabled, and we may get some
unwanted updates.
It's sad that we need two separate flags for correctness (the other
is --no-cache.
Message-Id: <20200405164227.8210-1-avi@scylladb.com>
The default serialization path for items was subtly broken -
instead of parsing JSON string representation of objects,
it tried to parse a regular string implementation - which is often
also a valid JSON, but nothing guarantees that it actually is.
Tests: alternator-test(local)
Message-Id: <e1668bf4e9029f2675a4ac28bb4598714575efeb.1586096732.git.sarna@scylladb.com>
Merge patch series from Piotr Sarna:
This series adds extra precautions against potential races
in view building. In particular, it was based on the following scenario:
1. View builder detects that a view V is no longer here, so it schedules
removing its info from bookkeeping, without any semaphores,
and this continuation gets preempted immediately.
2. A view is deleted and recreated with the same name - V.
3. View V building is finished.
4. The continuation from (1.) is finally executed, and it removes old view V
info from bookkeeping - which is a problem, since view building
bookkeeping is based on *names*, not *uuids* - consequently,
the new view bookkeeping info is erroneously removed.
The issue is solved by putting startup code (which also does cleanup
from point (1.)) under the same semaphore as other bookkeeping
operations. With that, it will be impossible to execute step (2.)
before (1.) ends, which effectively prevents the race.
Refs #6094 (possible fixes it too, but since I could not reproduce
the issue...)
Tests: unit(dev)
Piotr Sarna (4):
db,view: fix waiting for a view building future
db,view: remove unneeded implicit capture-by-reference
db,view: nitpick: change & operator to && for booleans
db,view: guard view builder startup with a semaphore
db/view/view.cc | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)
This removes the need to include reactor.hh, a source of compile
time bloat.
In some places, the call is qualified with seastar:: in order
to resolve ambiguities with a local name.
Includes are adjusted to make everything compile. We end up
having 14 translation units including reactor.hh, primarily for
deprecated things like reactor::at_exit().
Ref #1
This allows us to drop a #include <reactor.hh>, reducing compile time.
Several translation units that lost access to required declarations
are updated with the required includes (this can be an include of
reactor.hh itself, in case the translation unit that lost it got it
indirectly via logalloc.hh)
Ref #1.
The startup routine performs some bookkeeping operations on views,
and so do these events:
- on_create_view;
- on_drop_view;
- on_update_view.
Since the above events are guarded with a semaphore, the startup
routine should also take the same semaphore - in order to ensure
that all bookkeeping operations are serialized.
Refs #6094
The future was marked with a `FIXME: discarded future`, but there's really
no reason not to wait for it, and it was probably meant to be waited for
since its implementation.
When we switched token representation to int64_t
we added some sanity checks that byte representation
is always 8 bytes long.
It turns out that for token_kind::before_all_keys and
token_kind::after_all_keys bytes can sometimes be empty
because for those tokens they are just ignored. The check
introduced with the change is too strict and sometimes
throws the exception for tokens before/after all keys
created with empty bytes.
This patch relaxes the condition of the check and always
uses 0 as value of _data for special before/after all keys
tokens.
Fixes#6131
Tests: unit(dev, sct)
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
* seastar 41c83ec...fd9af3a (7):
> stall_detector: Delete unused member variable
> future: Avoid a move in finally_body
> Merge "Followup cleanups for the apply/invoke split" from Rafael
> Merge "make trivial future related functions noexcept" from Benny
> rpc_test: silence depreceted lambda logger warning
> rpc_demo: stop using variadic futures
> future: Move two static_asserts to the top
Currently we call `on_internal_error()` if `tri_compare()` throws
`marshal_exception`. Some compare paths however might go around
`tri_compare()` and call `abstract_type::compare()` directly. Move the
check there to cover these cases too.
Tests: dev
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200403162530.1175801-1-bdenes@scylladb.com>
This patch makes makes major compaction aware of time buckets
for TWCS. That means that calling a major compaction with TWCS
will not bundle all SSTables together, but rather split them
based on their timestamps.
There are two motivations for this work:
1. Telling users not to ever major compact is easier said than
done: in practice due to a variety of circumstances it might
end up being done in which case data will have a hard time
expiring later.
2. We are about to start working with offstrategy compactions,
which are compactions that work in parallel with the main
compactions. In those cases we may be converting SSTables from
one format to another and it might be necessary to split a single
big STCS SSTable into something that TWCS expects
With the motivation out of the way, let's talk about the implementation:
The implementation is quite simple and builds upon the previous patches.
It simply specializes the interposer implementation for regular compaction
with a table-specific interposer.
Fixes#1431
Signed-off-by: Glauber Costa <glauber@scylladb.com>
Our resharding code is complex, since the compaction object has to keep
track of many output SSTables, the current shard being written.
When implementing TWCS streaming writers, we ran away from such
write-side complexity by implementing an interposer: the interposer
consumes the flat_mutation_reader stream, creating many different writer
streams. We can do a similar thing for resharding SSTables and have each
writer be guaranteed to contain keys for only a specific source shard.
As we do that, we can move the SSTable and sstable_writer information
to the compacting_sstable_writer object. The compaction object will no
longer be responsible for it and can be simplified, paving the way for
TWCS-major, which will go through an interposer as well.
Note that the compaction_writer, which now holds both the SSTable
pointer and the sstable_writer still needs to be optional. This is
because LCS (and potentially others) still want to create more than
one SSTable per source stream. That is done to guarantee that each
SSTable complies with the max_sstable_size parameter, which is
information available in the sstable_writer that is not present at
the level of the flat_mutation_reader. We want to keep it in the writer
side.
Signed-off-by: Glauber Costa <glauber@scylladb.com>
The storage_proxy instances hold references to token_metadata ones and
leave unwaited futures continuing to its query_partition_key_range_concurrent
method.
The latter is called from do_query so it's not that easy to find
out who is leaking. Keep the tokens not freed for a while.
Fixes: #6093
Test: manual start-stop
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20200402183538.9674-1-xemul@scylladb.com>
Speculative reader has more targets that needed for CL. In case there is
a digest mismatch the repair runs between all of them, but that violates
provided CL. The patch makes it so that repair runs only between
replicas that answered (there will be CL of them).
Fixes#6123
Reviewed-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20200402132245.GA21956@scylladb.com>
distcc doesn't like the -x c++ flag, so create an empty.cc file for
this purpose and compile it.
Also drop the "=" from "--include=", which is also disliked by
distcc.
Message-Id: <20200402124312.48963-1-avi@scylladb.com>
This is similar to the timestamp based splitting writer, except
that it splits data based on the shard where the partition key
is supposed to be placed.
It is similar to the multishard_writer, in the sense that it
creates n streams for n shards, but it does not want to process
the streams in the owner shards. We want to use that in processes
like resharding where it is fine for a foreign shard to deal
with a mutation.
One option would be to augment the multishard_writer to optionally
achieve these properties, but having a separate splitter is both
simpler and faster.
Signed-off-by: Glauber Costa <glauber@scylladb.com>
I am about to introduce a new splitter. Therefore, move parts of it
that are common to its own file.
Signed-off-by: Glauber Costa <glauber@scylladb.com>
I am about to get rid of the _shard attribute in the compaction object,
as I will create different streams of writers for different shards.
In preparation for that, remove the arbitrary _shard reference. Raphael
confirms that resharding should never be calling this, as this method is
used exclusively for garbage collection component of run-based
compaction. Therefore we'll just throw in this case and remove the shard
reference.
Signed-off-by: Glauber Costa <glauber@scylladb.com>
The shard parameter is ignored for SSTable creation on regular
compaction. It is still good practice and good future proofing
to pass something meaningful here instead of zero. This patch
passes the id of the current shard.
Thanks Botond for pointing that out.
Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20200402122212.12218-1-glauber@scylladb.com>
"
This patchseries is part of my effort to make resharding less special -
and hopefully less problematic. The next steps are a bit heavy, so I'd
like to, if possible, get this out of the way.
After these two patches, there is no more need to ever call
reshard_sstables: compact_sstables will do, and it will be able to
recognize resharding compactions.
To do that we need to unify the creator function, which is trivially
done by adding a shard parameter to regular compactions as well: they
can just ignore it. I have considered just making the
compaction_descriptor have a virtual create() function and specializing
it, but because we have to store the creator in the compaction object I
decided to keep the virtual function for now.
In a later cleanup step, if we can for instance store the entire
compaction_descriptor object in the compaction object we could do that.
Reviewed-by: Benny Halevy <bhalevy@scylladb.com>
Reviewed-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Reviewed-by: Botond Dénes <bdenes@scylladb.com>
Tests: unit tests (dev), dtest (resharding.py)
"
* 'resharding-through-compact-sstables' of github.com:glommer/scylla:
resharding: get rid of special reshard_sstables
compaction: enhance compaction_descriptor with creator and replace function
This reverts commit fdd2d9de3d because it
breaks one heat-weighted load balancing dtest:
FAIL: heat_weighted_load_balancing_cl_QUORUM_test (heat_weighted_load_balancing_test.HeatWeightedLB)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/home/penberg/src/scylla/scylla-dtest/heat_weighted_load_balancing_test.py", line 182, in heat_weighted_load_balancing_cl_QUORUM_test
self.run_heat_weighted_load_balancing('QUORUM')
File "/home/penberg/src/scylla/scylla-dtest/heat_weighted_load_balancing_test.py", line 165, in run_heat_weighted_load_balancing
self.verify_metrics(metrics, cached=False)
File "/home/penberg/src/scylla/scylla-dtest/heat_weighted_load_balancing_test.py", line 73, in verify_metrics
mean_avg, node_mean_avg, key))
AssertionError: 19.0 not found in range(3, 13) : Cache difference between nodes is less then expected: 6469.6/328.2, metric scylla_storage_proxy_coordinator_reads_local_node
I am reverting because it's a test issue, and we should bring this
commit back once the test is fixed.
Gleb Natapov explains:
"dtest result directly depends on replicas we contact. Glauber's patch
make us contacts less replicas, so numbers differ."
The "run" script for the Alternator tests needs to set a system table for
authentication credentials, so we can test this feature.
So far we did this with cqlsh, but cqlsh isn't always installed on build
machines. But install-dependencies.sh already installs the Cassandra driver
for Python, so it makes more sense to use that, so this patch switches to
use it.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200331131522.28056-1-nyh@scylladb.com>
To run Alternator tests, only two additional dependencies need to be added to
install-dependencies.sh: pytest, and python3-boto3. We also need
python3-cassandra-driver, but this dependency is already listed.
This patch only updates the dependencies for Fedora, which is what we need
for dbuild and our Jenkins setups.
Tested by building a new dbuild docker image and verifying that the
Alternator tests pass.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
[avi: update toolchain image; note this upgrades gcc to 9.3.1]
Message-Id: <20200330181128.18582-1-nyh@scylladb.com>
In order to be extra sure that we always generate proper
base partition/clustering keys from paging info when executing
an indexed query, additional checks are added - if any of them
triggers, an exception will be thrown.
Created in order to help debug an existing issue:
Refs #5856
Tests: unit(dev)
Due to c&p error cas_now_pruning counter is increased instead of
decreased after an operation completes. Fix it.
Fixes#6116
Message-Id: <20200401142859.GA16953@scylladb.com>
Always enable lightweight transactions. Remove the check for the command
line switch from the feature service, assuming LWT is always enabled.
Remove the check for LWT from Alternator.
Note that in order for the cluster to work with LWT, all nodes need
to support it.
Rename LWT to UNUSED in db/config.hh, to keep accepting lwt keyword in
--experimental-features command line option, but do nothing with it.
Changes in v2:
* remove enable_lwt feature flag, it's always there
Closes#6102
test: unit (dev, debug)
Message-Id: <20200401071149.41921-1-kostja@scylladb.com>
During bootstrap and replace operations the node can't take reads and
we'd like to see the process ending ASAP. This is because until the
process ends, we keep having to duplicate writes to an extended set. Not
to mention, in the case of a cluster expansion users want to use the
added capacity sooner rather than later.
Streaming generates a lot of compaction activity, that competes with the
bootstrap itself, slowing it down.
Long term, we are moving to treat those compactions differently and
maybe postpone them altogether. However for now we can reduce the amount
of compactions by increasing the minimum threshold of SSTables that have
to accumulate before they are selected for compactions. The default is
2, meaning we will trigger a compaction every time 2 SSTables of about
the same size are found (for STCS, others follow a similar pattern).
Until we have offstrategy infrastructure we don't want the compactions
to stop happening altogether so the reads, when they start, don't
suffer. This patch sets the minimum threshold to 16 (for the default
max_threshold of 32), meaning we will generate a lot less compaction
activity during streaming. Once streaming is done we revert it to its
original.
Unfortunately there isn't much we can do at the moment about decommission.
During decommission the nodes receiving data are also taking reads and
we don't want SSTables to accumulate.
Fixes#5109
Signed-off-by: Glauber Costa <glauber@scylladb.com>
dc_local_read_repair_chance is a legacy of old times: Cassandra itself
now defaults to zero, and we should look into that too.
Most serious production clusters are either repaired through our
asynchronous repair, or don't need repair at all.
Synchronous read repair can help things converging, but it implies an
impact at query time. For clusters that are on an asynchronous repair
schedule this should not be needed.
Fixes#6109
Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20200331183418.21452-1-glauber@scylladb.com>
There is a method, reshard_sstables(), whose sole purpose is to call a
resharding compaction. There is nothing special about this method: all
the information it needs is now present in the compaction_descriptor.
This patch extend the compaction_options class to recognize resharding
compactions as well, and uses that so that make_compaction() can also
create resharding compactions.
To make that happen we have to create a compaction_descriptor object in
the resharding method. Note however that resharding works by passing an
object very close to the compaction_descriptor around. Once this patch
is merged, a logical next step is to reuse it, and avoid creating the
descriptor right before calling compact_sstables().
Signed-off-by: Glauber Costa <glauber@scylladb.com>
There are many differences between resharding and compaction that are
artificial, arising more from the way we ended up implementing it than
necessity. This patch attempts to pass the creator and replacer functions
through the compaction_descriptor.
There is a difference between the creator function for resharding and
regular compaction: resharding has to pass the shard number on behalf
of which the SSTable is created. However regular compactions can just
ignore this. No need to have a special path just for this.
After this is done, the constructor for the compaction object can be
greatly simplified. In further patches I intend to simplify it a bit
further, but some more cleanup has to happen first.
To make that happen we have to construct a compaction_descriptor object
inside the resharding function. This is temporary: resharding currently
works with a descriptor, but at some point that descriptor is lost and
broken into pieces to be passed to this function. The overarching goal
of this work is exactly to be able to keep that descriptor for as long
as possible, which should simplify things a lot.
Callers are patched, but there are plenty for sstable_datafile_test.cc.
For their benefit, a helper function is provided to keep the previous
signature (test only).
Signed-off-by: Glauber Costa <glauber@scylladb.com>
"
Currently, both sharding and partitioning logic is encapsulated into partitioners. This is not desirable because these two concepts are totally independent and shouldn't be coupled together in such a way.
This PR separates sharding and partitioning. Partitioning will still live in i_partitioner class and its subclasses. Sharding is extracted to a new class called sharding_info. Both partitioners and sharding_info are still managed by schema class. Partitioner can be accessed with schema::get_partitioner while sharding_info can be accessed with schema::get_sharding_info.
The transition is done in steps:
1. sharding_info class is defined and all the sharding logic is extracted from partitioner to the new class. Temporarily sharding_info is still embedded into i_partitioner and all sharding related functions in i_partitioner call delegate to the embedded sharding_info object.
2. All calls to i_partitioner functions that are related to sharding are gradually switched to calls to sharding_info equivalents. sharding_info.
3. Once everything uses sharding_info, all sharding logic is dropped from i_partitioner.
Tests: unit(dev, release)
"
* haaawk-sharding_info: (32 commits)
dummy_sharder: rename dummy_sharding_info.* to dummy_sharder.*
sharding_info: rename the class to sharder
i_partitioner:remove embeded sharding_info
i_partitioner: remove unused get_sharding_info
schema: remove incorrect comment
schema: make it possible to set sharding_info per schema
i_partitioner: remove unused shard_count
multishard_writer: stop calling i_partitioner::shard_count
i_partitioner: remove sharding_ignore_msb
partitioner_test: test ranges and sharding_infos
i_partitioner: remove unused split_ranges_to_shards
i_partitioner: remove unused shard_of function
sstable-utils: use sharding_info::shard_of
create_token_range_from_keys: use sharding info for shard_of
multishard_mutation_query_test: use sharding info for shard_of
distribute_reader_and_consume_on_shards: use sharding_info::shard_of
multishard_mutation_query: use sharding_info::shard_of
dht::shard_of: use schema::get_sharding_info
i_partitioner: remove unused token_for_next_shard
split_range_to_single_shard: use sharding info instead of partitioner
...
Unfortunately, the boto3 library doen't allow us to check some of the
input error cases because it unnecessarily tests its input instead of
just passing it to Alternator and allowing Alternator to report the error.
In this patch we comment out a test case which used to work fine - i.e.,
the error was reported by Alternator - until recent changes to boto3
made it catch the problem without passing it to Alternator :-(
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200330190521.19526-2-nyh@scylladb.com>
One of the Alternator tests in test_tag.py checks the feature of creating
a table with a set of tags (as opposed to adding tags to an existing table).
This is a relatively new DynamoDB feature, only added in April 2019, so if
the botocore library is too old, it cannot test this feature, and we have to
skip the test.
Alternator developers should make an effort to keep the botocore library
up-to-date and test the latest DynamoDB features, but it is less important
if some test environments (like Jenkins) cannot verify this specific test
until its distro gets updated - it is more important that the fast majority
of the tests, which do not rely on very new features, get tested.
After this patch, if running on Fedora 30 with
python3-botocore-1.12.101-2.fc30.noarch installed, we get the following
skip message:
$ pytest-3 -rs test_tag.py
...
test_tag.py ..s..x [100%]
=================================================== short test summary info ===================================================
SKIP [1] /home/nyh/scylla/test/alternator/test_tag.py:114: Botocore version 1.12.136 or above required to run this test
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200330190521.19526-1-nyh@scylladb.com>
The learning stage of PAXOS protocol leaves behind an entry in
system.paxos table with the last learned value (which can be large). In
case not all participants learned it successfully next round on the same
key may complete the learning using this info. But if all nodes learned
the value the entry does not serve useful purpose any longer.
The patch adds another round, "prune", which is executed in background
(limited to 1000 simultaneous instances) and removes the entry in
case all nodes replied successfully to the "learn" round. It uses the
ballot's timestamp to do the deletion, so not to interfere with the
next round. Since deletion happens very close to previous writes it will
likely happen in memtable and will never reach sstable, so that reduces
memtable flush and compaction overhead.
Fixes#5779
Message-Id: <20200330154853.GA31074@scylladb.com>
Previously schema::get_sharding_info was obtaining
sharding_info from the partitioner but we want to remove
sharding_info from the partitioner so we need a place
in schema to store it there instead.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Previous patches have switched all the calls to
i_partitioner::shard_count to sharding_info::shard_count
and this function can now be removed.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Every place that has previously called this method is now
using sharding_info::sharding_ignore_msb and this function
can be removed.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Turn test_something_with_some_interesting_ranges_and_partitioners
into test_something_with_some_interesting_ranges_and_sharding_info.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Previous patches switched all the places that called
i_partitioner::shard_of to use sharding_info::shard_of
so i_partitioner::shard_of is no longer used and can
be removed.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Create sharding_info with the same parameters as
the partitioner and use it instead of the partitioner.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
This patch replaces all the uses of i_partitioner:shard_of
with sharding_info::shard_of in read_context.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Previous patches have switched all the places that was
using i_partitioner::token_for_next_shard to
sharding_info::token_for_next_shard. Now the function
can be removed from i_partitioner.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
The function relies only on i_partitioner::shard_count
and i_partitioner::token_fon_next_shard. Both are really implemented
in sharding_info so the method can use them directly.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
After previous patches that switched some tests to use sharding_info
instead of i_partitioner, we now don't need with_partitioner_for_tests_only
and the function can be removed.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
dummy_partitioner was renamed to dummy_sharding_info in
the previous patch. This patch cleans up the names of
files. It's done in a separate patch to not obstruct
the diff of previous patch.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Previously this function was accessing sharding logic
through partitioner obtained from the schema.
While converting tests, dummy_partitioner is turned into
dummy_sharding_info.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
sync_schema is supposed to make sure that this node knows about all
schema changes known by "nodes" that were made prior to this call.
Currently, when a node is down, the sync is sliently skipped.
To fix, add a flag to migration_task::run_may_throw to indicate that it
should fail if a node is down.
Fixes#4791
The value that is stored in "in_progress_ballot" cell is the value of
promised ballot, so call the cell accordingly to avoid confusion
especially as we have a notion of "in progress" proposal in the code
which is not the same as in_progress_ballot here.
We can still do it without care about backwards compatibility since LWT
is still marked as experimental.
Fixes#6087.
Message-Id: <20200326095758.GA10219@scylladb.com>
partitioner_test contains test_partitioner_sharding function
which this patch renames to test_sharding and makes it
use sharding_info instead of the partitioner.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
repair does not use partitioner and only uses sharding logic.
This means it does not have to depend on i_partitioner and can
instead operate on sharding_info.
This has an important consequence of allowing the repair of
multiple tables having different partitioners at the same time.
All tables repaired together still have to use the same
sharding logic.
To achieve this the change:
1. Removes partitioner field from repair_info
2. repair_info has access to sharding_info through schema
objects of repaired tables
3. partitioner name is removed from shard_config
4. local and remote partitioners are removed from repair_meta.
Remote sharding_info is used instead.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
This method is not implemented anywhere not to mention the usage.
It is the only resonable thing to remove it instead of keeping
an unused and unimplemented declaration.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
The class does not depend on partitioning logic but only uses
sharding logic. This means it is possible and desirable to limit its
dependency to only sharding_info.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
ring_position_range_vector_sharder does not depend on partitioning logic.
It only uses sharding logic so it is not necessary to store i_partitioner
in the class. Reference to sharding_info is enough.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
ring_position_range_sharder does not depend on partitioning at all.
It only uses sharding so it is enough for the class to take sharding_info
instead of a whole i_partitioner. This patch changes ring_position_range_sharder
class to contain const sharding_info& instead of const i_partitioner&.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
At the moment, we have a single sharding logic per node
but we want to be able to set it per table in the future.
To make it easy to change in the future sharding_info
will be managed inside schema and all the other code
will access it through schema::get_sharding_info function.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
This patch creates a new class called sharding_info.
This new class will now be responsible for all
the sharding logic that before was a part of the partitioner.
In the end, sharding and partitioning logic will be fully
separated but this patch starts with just extracting sharding
logic to sharding_info and embedding it into i_partitioner class.
All sharding functions are still present in i_partitioner but now
they just delegate to the corresponding functions of the embedded
sharding_info object.
Following patches will gradually switch all uses of the following
i_partitioner member functions to their equivalents in sharding_info:
1. shard_of
2. token_for_next_shard
3. sharding_ignore_msb
4. shard_count
After that, sharding_info will be removed from i_partitioner and
the two classes will be totally independent.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
The user migration_manager::submit_migration_task needs to know if
migration_task::run_may_throw is successful or not.
Do not swallow exception.
Fixes#4791
Make the constructor out-of-line and clean up includes made redundant.
This removes an include of Seastar's heavy reactor.hh from a header.
Ref #1
Message-Id: <20200329173711.16949-1-avi@scylladb.com>
* seastar c7b6b84e5...06a8c8f6e (12):
> scheduling_group_specific: remove inclusion of reactor.hh
> future: Delete void_futurize_helper
> future: Delete unused do_void_futurize_helper instantiation
> core: remove io_queue queued requests metric
> future: Add assert to set_urgent_state
> future: Add a comment to set_urgent_state
> future: Use placement new instead of operator= in set_urgent_state
> file: use correct io_queue in dup()d files
> io_queue: fix miscalculation of sizes when I/O queue is not configured.
> merge: Add log levels to RPC loggers
> reactor: Replace a call to cpu_id with this_shard_id()
> reactor: Drop a few redundant calls to engine()
The column-family is already looked up as the first line in the method.
No need to repeat that lookup in the lambda passed to
`run_when_memory_available()`, we can just capture the reference to the
already obtained column-family object. These objects are safe to
reference, they don't just disappear in the middle of an operation.
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200327140827.128647-1-bdenes@scylladb.com>
Consider 3 nodes in the cluster, n1, n2, n3 with gossip generation
number g1, g2, g3.
n1, n2, n3 running scylla version with commit
0a52ecb6df (gossip: Fix max generation
drift measure)
One year later, user wants the upgrade n1,n2,n3 to a new version
when n3 does a rolling restart with a new version, n3 will use a
generation number g3'. Because g3' - g2 > MAX_GENERATION_DIFFERENCE and
g3' - g1 > MAX_GENERATION_DIFFERENCE, so g1 and g2 will reject n3's
gossip update and mark g3 as down.
Such unnecessary marking of node down can cause availability issues.
For example:
DC1: n1, n2
DC2: n3, n4
When n3 and n4 restart, n1 and n2 will mark n3 and n4 as down, which
causes the whole DC2 to be unavailable.
To fix, we can start the node with a gossip generation within
MAX_GENERATION_DIFFERENCE difference for the new node.
Once all the nodes run the version with commit
0a52ecb6df, the option is no logger
needed.
Fixes#5164
Most of Scylla code runs with a user-supplied query timeout, expressed as
absolute clock (deadline). When injecting test sleeps into such code, we most
often want to not sleep beyond the user supplied deadline. Extend error
injection API to optionally accept a deadline, and, if it is provided,
sleep no more than up to the deadline. If current time is beyond deadline,
sleep injection is skipped altogether.
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Message-Id: <20200326091600.1037717-2-alejo.sanchez@scylladb.com>
The previous patch made the LA format the default. We no longer need to
choose between writing the older KA format or LA, so the LA_SSTABLE
cluster feature has became unnecessary.
Unfortunately, we cannot completely remove this feature: Since commit
4f3ce42163 we cannot remove cluster features
because this node will refuse to join a cluster which already agreed on
features that it lacks - thinking it is an old node trying to join a
new cluster.
So the LA_SSTABLE feature flag remains, and we continue to advertise
that our node supports it. We just no longer care about what other
nodes advertised for it, so we can remove a bit of code that cared.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200324232607.4215-3-nyh@scylladb.com>
Over the years, Scylla updated the sstable format from the KA format to
the LA format, and most recently to the MC format. On a mixed cluster -
as occurs during a rolling upgrade - we want all the nodes, even new ones,
to write sstables in the format preferred by the old version. The thinking
is that if the upgrade fails, and we want to downgrade all nodes back to
the older version, we don't want to lose data because we already have
too-new sstables.
So the current code starts by selecting the oldest format we ever had - KA,
and only switching this choice to LA and MC after we verify that all the
nodes in the cluster support these newer formats.
But before an agreement is reached on the new format, sstables may already
be created in the antique KA format. This is usually harmless - we can
read this format just fine. However, the KA format has a problem that it is
unable to represent table names or keyspaces with the "-" character in them,
because this character is used to separate the keyspace and table names in
the file name. For CQL, a "-" is not allowed anyway in keyspace or table
names; But for Alternator, this character is allowed - and if a KA table
happens to be created by accident (before the LA or MC formats are chosen),
it cannot be read again during boot, and Scylla cannot reboot.
The solution that this patch takes is to change Scylla's default sstable
format to LA (and, as before, if the entire cluster agrees, the newer MC
format will be used). From now on, new KA tables will never be written.
But we still fully support *reading* the KA format - this is important in
case some very old sstables never underwent compaction.
The old code had, confusingly, two places where the default KA format
was chosen. This patch fixes is so the new default (LA) is specified in
only one place.
Fixes#6071.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200324232607.4215-2-nyh@scylladb.com>
* seastar 92c488706...c7b6b84e5 (6):
> semaphore: Use futurize_invoke instead of futurize_apply
> future: specify futurize::make_exception_future as noexcept
> future: Move ignore out of line
> future: Split then and then_impl to enable NRVO
> semaphore_units: allow getting the number of units held
> Merge "Split futurize::apply into invoke(...) and apply(tuple)" from Rafael
sync_schema is supposed to make sure that this node knows about all
schema changes known by "nodes" that were made prior to this call.
Currently, when a node is down, the sync is sliently skipped.
To fix, add a flag to migration_task::run_may_throw to indicate that it
should fail if a node is down.
Fixes#4791
Fixes#6073
The logic with pre/post image was tangled into looking at "rs"
and would cause pre-image info to be stored even if only post-image
data was enabled.
Now only generate keys (and rows for them) iff explicitly enabled.
And only generate pre-image key iff we have pre-image data.
Fixes#6070
When mutation splitting was added, non-atomic column assignments were broken
into two invocation of transform. This means the second (actual data assignment)
does not know about the tombstone in first one -> postimage is created as if
we were _adding_ to the collection, not replacing it.
While not pretty, we can handle this knowing that we always get
invoked in timestamp order -> tombstone first, then assign.
So we simply keep track of non-atomic columns deleted across calls
and filter out preimage data post this.
Added test cases for all non-atomics
Now that #3158 is fixed, we can move the consumer to its place after
the `compaction_mutation_state::start_new_page()` call. No need to keep
it as `std::unique_ptr<>`.
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200310185147.207665-1-bdenes@scylladb.com>
"
This small series consists of several changes that aim to
reduce the number of shared_ptr's in cql3 code.
Also it contains a patch that makes CqlParser::query to return
std::unique_ptr<> instead of seastar::shared_ptr<>, which leads
to more understandable code and lays foundation for further
optimizations (e.g. possibly eliminating shared_ptr's in
`prepared_statement` and just moving raw statements in `prepare`
without copying them).
Tests: unit(dev, debug)
"
* 'feature/cql_cleanups_9' of https://github.com/ManManson/scylla:
cql3: return raw::parsed_statement as unique_ptr
cql3: de-pointerize arguments to some of CQL grammar rules and definitions.
cql3: make abstract_marker::make_in_receiver accept cref to column_specification
Fixes#5899
When terminating (closing) a segment, we write a trailing block
of zero so reader can have an empty region after last used chunk
as end marker. This is due to using recycled, pre-allocated
segments with potentially non-zero data extending over the point
where we are ending the segment (i.e. we are not fully filling
the segment due to a huge mutation or similar).
However, if we reach end of segment writing the final block
(typically many small mutations), the file will end naturally
after the data written, and any trailing zero block would in fact
just extend the file further. While this will only happen once per
segment recycled (independent on how many times it is recycled),
it is still both slightly breaking the disk usage contract and
also potentially causing some disk stalls due to metadata changes
(though of course very infrequent).
We should only write trailing zero if we are below the max_size
file size when terminating
Adds a small size check to commitlog test to verify size bounds.
(Which breaks without the patch)
v2:
- Fix test to take into account that files might be deleted
behind our backs.
v3:
- Fix test better, by doing verification _before_ segments are
queued for delete.
Message-Id: <20200226121601.15347-2-calle@scylladb.com>
Message-Id: <20200324100235.23982-1-calle@scylladb.com>
Change CQL parsing routine to return std::unique_ptr
instead of seastar::shared_ptr.
This can help reduce redundant shared_ptr copies even further.
Make some supplementary changes necessary for this transition:
* Remove enabled_shared_from_this base class from the following
classes: truncate_statement, authorization_statement,
authentication_statement: these were previously constructing
prepared_statement instance in `prepare` method using
`shared_from_this`.
Make `prepare` methods implementation of inheriting classes
mirror implementation from other statements (i.e.
create a shallow copy of the object when prepairing into
`prepared_statement`; this could be further refactored
to avoid copies as much as possible).
* Remove unused fields in create_role_statement which led to
error while using compiler-generated copy ctor (copying
uninitialied bool values via ctor).
Signed-off-by: Pavel Solodovnikov <pa.solodovnikov@scylladb.com>
Make the following rules and definitions accept a reference
instead of shared_ptr's:
* cfamDefinition
* cfamColumns
* pkDef
* typeColumns
* ksName
* cfName
* idxName
* properties
* property
This will reduce a bit the number of countless shared_ptr copies
and moves all over the place in cql3 code.
Signed-off-by: Pavel Solodovnikov <pa.solodovnikov@scylladb.com>
These methods just extract some info out of
column_specification, so no need have another copy of
shared_ptr since it's not stored anywhere inside.
Transform abstract_marker::in_raw::make_in_receiver as well
following the call chain.
Signed-off-by: Pavel Solodovnikov <pa.solodovnikov@scylladb.com>
Merged pull request https://github.com/scylladb/scylla/pull/6030 from
Piotr Dulikowski:
Adds CDC-related metrics.
Following counters are added, both for total and failed operations:
Total number of CDC operations that did/did not perform splitting,
Total number of CDC operations that touched a particular mutation part.
Total number of preimage selects.
Fixes#6002.
Tests: unit(dev, debug)
* 'cdc-metrics' of github.com:piodul/scylla:
storage_proxy: track CDC operations in LWT flow
storage_proxy: track CDC operations in logged batches
storage_proxy: track CDC operations in standard flow
storage_proxy: add cdc tracker hooks to write response handlers
storage_proxy: move "else if" remainder into "else" block
cdc: create an operation_result_tracker object
cdc: add an object for tracking progress of cdc mutations
cdc: count touched mutation parts in transformer::transform
cdc: track preimage selects in metrics
cdc: register metric counters
cdc: fix non-atomic updates in splitting
Compaction automatically adds gc grace period to expiry times already,
no need to add it when creating the tombstones. Remove the redundant
additions form the code. The direct impact is really minor as this is
only used in tests, but it might confuse readers who are looking at how
tombstones are created across the codebase.
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200323120948.92104-1-bdenes@scylladb.com>
Adds a field to abstract_write_response_handler that points to the cdc
operation result tracker, and a function for registering the tracker in
the handlers that currently write to a CDC log table.
CDC metrics, apart from tracking "total" metrics for all performed CDC
operations, also track metrics for "failed" operations. Because the
result of the CDC operation depends on whether all CDC mutations were
written successfully by storage_proxy, checking for failure and
incrementing appropriate counters is deferred after all write response
handlers finish.
The `cdc::operation_result_tracker` object was created for that purpose.
It contains all the details needed to accurately update the metrics
based on what actually happened in the `augment_mutation_call` function,
and holds a flag which tells if any of write response handlers failed.
This object is supposed to be referenced by write response handlers for
CDC mutations created after the same `augment_mutation_call`. After all
write response handlers are destroyed, the destructor of
`operation_result_tracker` will update appropriate metrics.
Actual creating and attaching this object to write response handlers
will be done in subsequent commits.
Modifies the transformer::transform so that it also returns a set of
flags indicating what parts of the mutation (e.g. rows, tombstones,
collections, etc.) were processed during transforming.
This patch defines a CDC metrics object and registers all of its
counters.
storage_proxy is chosen as the owner of the metrics object. Because in
subsequent commits it will become possible for CDC metrics to be updated
after a write operation ends, and because the cdc_service has shorter
lifetime than storage_proxy, we could risk a use-after-free if we placed
this object inside cdc_service.
This patch fixes a bug in mutation splitting logic of CDC. In the part
that handles updates of non-atomic clustering columns, the column
definition was fetched from a static column of the same id instead of
the actual definition of the clustering column. It could cause the value
to be written to a wrong column.
Tests: unit(dev)
This implements support for triggering major compations through the REST
API. Please note that "split_output" is not supported and Glauber Costa
confirmed this this is fine:
"We don't support splits, nor do I think we should."
Signed-off-by: Ivan Prisyazhnyy <ivan@scylladb.com>
"
Make sure all headers compile on their own, without requiring any
additional includes externally.
Even though this requirement is not documented in our coding guides it
is still quasi enforced and we semi-regularly get and merge patches
adding missing includes to headers.
This patch-set fixes all headers and adds a `{mode}-headers` target that
can be used to verify each header. This target should be built by
promotion to ensure no new non-conforming code sneaks in.
Individual headers can be verified using the
`build/dev/path/to/header.hh.o` target, that is generated for every
header.
The majority of the headers was just missing `seastarx.hh`. I think we
should just include this via a compiler flag to remove the noise from
our code (in a followup).
"
* 'compiling-headers/v2' of https://github.com/denesb/scylla:
configure.py: add {mode}-headers phony target
treewide: add missing headers and/or forward declarations
test/boost/sstable_test.hh: move generic stuff to test/lib/sstable_utils.hh
sstables: size_tiered_backlog_tracker: move methods out-of-line
sstables: date_tiered_compaction_strategy.hh: move methods out-of-line
* seastar 3c498abcab...92c488706c (14):
> dpdk: restore including reactor.hh
> tests: distributed_test: add missing #include <mutex>
> reactor: un-static-ify make_pollfn()
> merge: Reduce inclusions of reactor.hh
A few #includes added to compensate for this
> sharded: delete move constructor
> future: Avoid a move constructor call
> future: Erase types a bit more in then_wrapped
> memory: Drop a never nullopt optional
> semaphore: specify get_units and with_semaphore as noexcept
> spinlock.hh: Add include for <cassert> header
> dpdk: Avoid a variable sized array
> future: Add an explicit promise member to continuation
> net: remove smart pointer wrappers around pollable_fd
> Merge "cleanup reactor file functions" from Benny
This patch fixes a bug in mutation splitting logic of CDC. In the part
that handles updates of non-atomic clustering columns, the schema for
serializing that column was looked up incorrectly in the table schema -
instead of a `regular_column`, a `static_column` was looked up.
Due to how the `column_at` function works, a correct schema was always
returned if the table had no static columns. Therefore, in order for
this bug to manifest, a table with a static column and a regular column
with non-atomic collection was needed.
Closes#3295
The error_injection class allows injecting custom handlers into normal control
flow at the pre-determined injection points.
This is especially useful in various testing scenarios:
* Throwing an exception at some rare and extreme corner-cases
* Injecting a delay to test for timeouts to be handled correctly
* More advanced uses with custom lambda as an injection handler
Injection points are defined by `inject` calls.
Enabling and disabling injections are done by the corresponding
`enable` and `disable` calls.
REST frontend APIs is provided for convenience.
Branch URL: https://github.com/alecco/scylla/tree/as_error_injection
Tests: unit {{dev}}, unit {{debug}}
* 'as_error_injection' of github.com:alecco/scylla:
api: add error injection to REST API
utils: add error injection
sstable_test.hh started as collection of utilities shared between the
various `_sstable_test.cc` files. Predictably other tests started using
it as well, among them some that are non boost unit tests. This poses a
problem as if we add the missing boost/test/unit_test.hpp include to
sstable_test.hh these tests will suddenly have missing symbols from
boost::test. To avoid linking boost::test into all these users, extract
utilities more widely used into sstable_utils.hh
We now have a utils file for SSTables. This is potentially useful for
other tests.
As a matter of fact, this function is repeated right now for the
resharding test. And to add insult to injury, the version in the
resharding test has the parameters shard and number of tokens flipped,
which although extremely confusing is the predictable outcome of
such repetition
Signed-off-by: Glauber Costa <glauber@scylladb.com>
The following error was seen in
materialized_views_test.py:TestMaterializedViews.decommission_node_during_mv_insert_4_nodes_test
INFO [shard 0] repair - repair id 3 to sync data for
keyspace=ks, status=started repair/repair.cc:662:36: runtime error: member call
on null pointer of type 'const struct schema'
Aborting on shard 0.
The problem is in the test a keyspace was created without creating any
table. Since db19a76b1f(selective_token_range_sharder: stop calling
global_partitioner()), in get_partitioner_for_tables, we access nullptr
when no table is present.
schema_ptr last_s;
for (auto t: tables) {
// set last_s
}
last_s->get_partitione()
To fix:
1) Skip the repair in sync_data_using_repair if there is no table in the keyspace
2) Throw if no schema_ptr is found in get_partitioner_for_tables. Be defensive.
After:
INFO [shard 0] repair - decommission_with_repair: started with keyspace=ks, leaving_node=127.0.0.2, nr_ranges=744
INFO [shard 0] repair - repair id 3 to sync data for keyspace=ks, status=started
WARN [shard 0] repair - repair id 3 to sync data for keyspace=ks, no table in this keyspace
INFO [shard 0] repair - repair id 3 completed successfully
INFO [shard 0] repair - repair id 3 to sync data for keyspace=ks, status=succeeded
Tests: materialized_views_test.py:TestMaterializedViews.decommission_node_during_mv_insert_4_nodes_test
Fixes: #6022
repair: Ignore keyspace that is removed in sync_data_using_repair
When a keyspace is removed during node operations, we should not fail
the whole operation. Ignore the keyspace that is removed.
Fixes#5942
* asias-repair_fix_5942:
repair: Stop the nodes that have run repair_row_level_start
repair: Ignore keyspace that is removed in sync_data_using_repair
Seems like adduser in redhat variants and deiban variants are incompatible,
and there is no addgroup in redhat variants.
Since adduser in install.sh is implemented on debian variants, does not work on redhat compatible.
To fix this we need to use 'useradd' / 'groupadd' instead.
Fixes#6018
"
In debug mode some tests take veeery looong time to finish,
those tests are better to be started first. This set adds
this by marking such long tests in suite.yaml files.
Tests: unit(dev)
"
* 'br-split-unit-tests-sorting-2' of https://github.com/xemul/scylla:
test.py: Mark some tests as "run_first"
test.py: Generate list with short names
test.py: Rename "long" to "skip_in_debug_mode"
Primary key and clustering key column should not have a corresponding
"cdc$deleted_<name>" column in cdc log table, because it does not make
sense to delete such a column from a row.
Fixes: #6049
Tests: unit(dev)
This reverts commit 0b34d88957. According
to Rafael Avila de Espindola:
"I have bisected the recent failures [in commitlog_test] on next to this
patch."
This reverts commit 458ef4bb06. According
to Glauber Costa:
"It may give us the illusion that fixes something for a particular case
but this fix is wrong.
I am trying to help Raphael figure out why the backlog is wrong but
this patch is not the answer."
Simple REST API for error injection is implemented.
The API allow the following operations:
* injecting an error at given injection name
* listing injections
* disabling an injection
* disabling all injections
Currently the API enables/disables on all shards.
Closes#3295
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Error injection class is implemented in order to allow injecting
various errors (exceptions, stalls, etc.) in code for testing
purposes.
Error injection is enabled via compile flag
SCYLLA_ENABLE_ERROR_INJECTION
TODO: manage shard instances
Enable error injection in debug/dev/sanitize modes.
Unit tests for error injection class.
Closes#3295
Signed-off-by: Pavel Solodovnikov <pa.solodovnikov@scylladb.com>
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Running the Alternator tests is easy after you manually run Scylla, but
sometimes it's convenient to have a script which just does everything
automatically: start Scylla in a temporary directory, set it up properly
for the tests (especially the authentication), run all the tests, and remove
the temporary directory. This is what this alternator-tests/run script does.
This script can be run by Jenkins, for example, to check all the Alternator
tests. The script assumes some things (including cqlsh, pytest and the boto3
library) are already installed, and that Scylla has been compiled - by
default it takes the latest built build/*/scylla, but this can be overridden
by a command like
SCYLLA=build/release/scylla alternator-test/run
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200311091918.16170-1-nyh@scylladb.com>
Currently the initial vertice of the graph is resolved in both
`_traverse_object_graph_breadth_first()` and its caller
`_do_generate_object_graph()`. This is redundant, so remove the
resolving in the latter.
Currently, for edges, only the "from" offset is printed, that is the
offset of the reference in the originating object. Now that we also scan
the non-first word of objects for references to them, we can have
reference pointing to the non-first word of objects. To make these
apparent, also print the "to" offset on edges, that is the offset into
the target object where the reference point to. So now edges have tuple
labels: (from, to).
Merged patch series from Piotr Sarna:
This series hooks alernator to admission control, similarly to how
CQL server uses it. The estimated memory consumption is set to 2x
raw JSON request, since that seems to be the upper limit of
how much more memory rapidjson allocates during parsing.
Note, that since Seastar HTTP currently reads the whole contents
upfront, there's no easy way to apply admission control before reading
the request - that would involve some changes to our HTTP API.
Note 2: currently, admission control in CQL does not properly pass
memory consumption information for requests that are bounced
to another shard - that would require either transferring semaphore units
between shards or keeping a foreign pointer to the original units.
As a result, alternator also does not pass correct admission control
info between shards, and all places in code which do that are marked
with clear FIXMEs.
Fixes#5029
Piotr Sarna (5):
storage_service: add memory limiter semaphore getter
alternator: add service permit to callbacks
alternator: add memory limiter to alternator server
alternator: add addmission control stats entry
alternator: hook admission control to alternator server
alternator/executor.cc | 113 ++++++++++++++++++++++--------------
alternator/executor.hh | 32 +++++-----
alternator/rmw_operation.hh | 1 +
alternator/server.cc | 83 +++++++++++++++-----------
alternator/server.hh | 8 ++-
alternator/stats.cc | 2 +
alternator/stats.hh | 1 +
main.cc | 3 +-
service/storage_service.hh | 4 ++
9 files changed, 149 insertions(+), 98 deletions(-)
Before this patch, when db/view/view.hh was modified, 89 source files had to
be recompiled. After this patch, this number is down to 5.
Most of the irrelevant source files got view.hh by including database.hh,
which included view.hh just for the definition of statistics. So in this
patch we split the view statistics to a separate header file, view_stats.hh,
and database.hh only includes that. A few source files which included
only database.hh and also needed view.hh (for materialized-view related
functions) now need to include view.hh explicitly.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200319121031.540-1-nyh@scylladb.com>
When looking for references to an object in the graph, look for
references to any part of the object, using `scylla_find.find()`:s new
`value_range` parameter.
This way, the graph can be extended beyond objects that are members of
an intrusive containers, or just generally don't have any references to
their very first byte.
Allow the user to specify a value-range different than the size of the
object. This is useful if it is known that references to the object will
point to the first N bytes.
One of the most common use-cases of find is finding references to an
object. This works great for normal objects, however not for all of
them, a prominent example being objects that are members of an intrusive
collections. These objects will have pointers to them that don't point
to their first byte, instead they point to somewhere in the middle of
the object. To help find such references, find now supports searching
for a range of values. If the new `--value-range` option is used, it
will start searching for the value itself, and if no usages are found it
will increment it with the specified size-class, and search again. This
is repeated until some usages are found or the range is depleted.
`scylla_find.find()` now returns the offset to the value, of which
usages were found. Alternatively one can scan the entire value-range
using the `--find-all` option. When this is used, `scylla_find` will not
stop on the first offset for which references are found.
find_in_live() currently parses back the output of `scylla ptr`, to
return the address to the beginning of the object and the offset. All
its current callers do the call to `scylla ptr` again to obtain further
information about the object. To avoid this duplicated effort, return
`pointer_metadata` instances from `find_in_live()`, obtained via
`scylla_ptr.analyze()` which is the python API to `scylla ptr`.
The `result_callback` was a callback returned by `augment_mutation_call`
that was supposed to be used in the CDC postimage implementation.
Because CDC postimage was implemented without using this callback, and
currently a no-op function is always returned, this callback can safely
be removed.
Those tests take long time to finish, so it makes sense to start
them earlier than others.
The provided list of long tests consists of those running more
than 10 minutes in debug mode.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
It returns a future, so converting an exception to an exceptional
future simplifies error handling in the caller.
Without this code like the one in
standard_role_manager::create_metadata_tables_if_missing has a
surprising behavior:
return when_all_succeed(
create_metadata_table_if_missing(...),
create_metadata_table_if_missing(...));
Since it might not wait for both futures. We could use the lambda
version of when_all_succeed, but changing
create_metadata_table_if_missing seems a nice API improvement.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200317002051.117832-4-espindola@scylladb.com>
View updates sent as part of the view building process should never
be ignored, but fd49fd7 introduced a bug which may cause exactly that:
the updates are mistakenly sent to background, so the view builder
will not receive negative feedback if an update failed, which will
in turn not cause a retry. Consequently, view building may report
that it "finished" building a view, while some of the updates were
lost. A simple fix is to restore previous behaviour - all updates
triggered by view building are now waited for.
Fixes#6038
Tests: unit(dev),
dtest: interrupt_build_process_with_resharding_low_to_half_test
The "long" test will mean that it is to be started first, not
skipped, so rename "long" to avoid additional confusion
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
When qualifying columns to be fetched for filtering, we also check
if the target column is not used as an index - in which case there's
no need of fetching it. However, the check was incorrectly assuming
that any restriction is eligible for indexing, while it's currently
only true for EQ. The fix makes a more specific check and contains
many dynamic casts, but these will hopefully we gone once our
long planned "restrictions rewrite" is done.
This commit comes with a test.
Fixes#5708
Tests: unit(dev)
The intention of the code was to clear sharding metadata
chunked_vector so that it doesn't bloat memory.
The type of c is `chunked_vector*`. Assigning `{}`
clears the pointer while the intended behavior was to reset the
`chunked_vector` instance. The original instance is left unmodified
with all its reserved space.
Because of this, the previous fix had no effect because token ranges
are stored entirely inline and popping them doesn't realease memory.
Fixes#4951
Tests:
- sstable_mutation_test (dev)
- manual using scylla binary on customer data on top of 2019.1.5
Reviewed-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <1584559892-27653-1-git-send-email-tgrabiec@scylladb.com>
An exception thrown after the start of auth_service and before
init_server_without_the_messaging_service_part returns would cause the
sharded<auth_service> destructor to assert.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200317002051.117832-2-espindola@scylladb.com>
When investigating OOM related cores, a common thing to do is trying to
identify the objects in a particularly heavily populated size-class.
This command is meant to help with that, providing a way to list the
objects in any size-class, in a paginated way.
Traversing the objects of a pool is done through a
`small_object_iterator` object which is also exposed to python code, to
be used in custom scripts wanting to scan all objects belonging to a
pool.
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200318085437.452906-1-bdenes@scylladb.com>
If delete_atomically() was called with a empty set for any reason,
it will fail to work because it relies on any of the sstables in
the set for getting the sstable directory.
This will be needed, in the future, when using sstable replacement
function only with new sstables.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Reviewed-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20200305144657.9440-1-raphaelsc@scylladb.com>
There's such an option, and it's not taken into account
on scylla start. There's a symmetrical start_rpc one, which
is, so make both act similarly.
The default value for the option is true, so default set-ups
will not get broken.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20200310140518.29410-1-xemul@scylladb.com>
"
Since b783d40aa storage-proxy maintains separate coordinator stats per
scheduling group. This broke scylla_memory, which was still trying to
access the old global stats. This mini-series updates it to be able to
handle per-sg coordinator stats, while preserving backward compatibility
with older versions still using global stats.
"
* 'scylla-memory-per-sg-coordinator-stats/v1' of https://github.com/denesb/scylla:
scylla-gdb.py: scylla_memory: update w.r.t. per-sg coordinator stats
scylla-gdb.py: scylla_memory: move coordinator code to print_coordinator_stats()
"
The debug mode unit tests take ~half-an-hour to complete. Here's
the tests run-times top list
Test: Time (seconds):
... steady tail goes here ...
test/boost/user_function_test 496
test/boost/row_cache_test 502
test/boost/view_schema_test 932
test/boost/cql_query_test 997
test/boost/mutation_reader_test 1048
test/boost/sstable_mutation_test 1417
test/boost/secondary_index_test 1468
Splitting the spike (top-5) is the primary goal. However, the
distribution of test-cases in 3 of those tests is also _very_
non-uniform, so just cutting it into equal parts doesn't work.
For example, the test_index_with_paging from the slowest one
takes ~14 minutes on its own and is the slowest test-case out
there.
So the set does this:
- moves the champion test_index_with_paging into separate file
- detaches the most heavy parts from sstable_mutation_test and
mutation_reader_test into own tests too. The resulting split
is still non-uniform, but it's 4 tests that run notably less
than the 14 minutes record each
- splits the cql_query_test and view_schema_test into several
parts in a wildcard manner to run out of the 14 min threshold
- moves some shared code into lib/
As the result, the debug mode test run takes 14.5 minutes =)
which is almost 2 times faster than it was. The dev mode run
time is not affected noticeably.
Test: well, unit(debug) and unit(dev)
"
* 'br-split-unit-tests-3-next' of https://github.com/xemul/scylla:
test: Split view_schema_test
test: Split cql_query_test
test: Split mutation_reader_test
test: Split sstable_mutation_test
test: Split secondary_index test
Consider
1. Start n1, n2 in the cluster
2. Stop n2 and delete all data for n2
3. Start n2 to replace itself with replace_address_first_boot: n2
4. Kill n2 before n2 finishes the replace operation
5. Remove replace_address_first_boot: n2 from scylla.yaml of n2
6. Delete all data for n2
7. Start n2
At step 7, n2 will be allowed to bootstrap as a new node, because the
application state of n2 in the cluster is HIBERNATE which is not
rejected in the check of is_safe_for_bootstrap. As a result, n2 will
replace n2 with a different tokens and a different host_id, as if the
old n2 node was removed from the cluster silently.
Fixes#5172
Check that SELECT statement checks there is a partition key before
accessing it when determining the shard to execute the query on.
Essentially move the check for properly restricted partition key
from storage_proxy.cc to select_statement.cc, now that we access
it earlier in the call stack.
Keep the check in storage_proxy.cc since storage_proxy::query()
has other call sites (views), which today should never use
serial consistency for its queries, but this can change in the future.
Please note that Cassandra only partially enforce SERIAL consistency
and can silently downgrade SERIAL consistency to the default
non-serial one when doing unbounded SELECTS (
https://issues.apache.org/jira/browse/CASSANDRA-15641)
Fixes#6016
Detach *partition_key* and *clustering_key* ones into own files.
The resultint 2 tests run ~4 minutes each, the leftover ones
complete within 11 minutes. The same -- the goal to run out of
14 minutes is reached, further splitting needs more thinking
than just wildcarding.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
This detaches *like_operator*, *group_by*, *functions*
and *large* cases into own files. The split is not
uniform -- the resulting 4 tests run less that 3 minutes
each, what's left in the origin runs ~11 minutes. But
since the goal was to get out of 14 minutes threshold
and this file contains 126 cases (the champion) so I
just did "wildcard" selection that worked.
It also required moving require_rows() helpers into a
local header.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Detach test_multishard_combining_reader_as_mutation_source into
individual file.
This particular test runs ~13 minutes. What's left in the origin
completes a bit faster.
The split also requires moving the reader_lifecycle_policy and
the dummy_partitioner into lib/
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Detach test_schema_changes and test_sstable_conforms_to_mutation_source
into individual files. These two take ~10 minutes each, what's left in
origin finishes within 4 minutes alltogether.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Detach test_index_with_paging into individual file.
This particular test-case is the longest one in the sute,
it takes ~14 minutes to run, further splitting of this
test is pointless (for now) and all subsequent splits in
this set just make the resulting times less than this.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The call for merge_schema_from in some cases is run in the
background and thus is not aborted/waited on shutdown. This
may result in use-after-free one of which is
merge_schema_from
-> read_schema_for_keyspace
-> db::system_keyspace::query
-> storage_proxy::query
-> query_partition_key_range_concurrent
in the latter function the proxy._token_metadata is accessed,
while the respective object can be already free (unlike the
storage_proxy itself that's still leaked on shutdown).
Related bug: #5903, #5999 (cannot reproduce though)
Tests: unit(dev), manual start-stop
dtest(consistency.TestConsistency, dev)
dtest(schema_management, dev)
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Reviewed-by: Pekka Enberg <penberg@scylladb.com>
Message-Id: <20200316150348.31118-1-xemul@scylladb.com>
"
Allow adding compacting to any reader pipeline. The intended users are
streaming and repair, with the goal to prevent wasting transfer
bandwidth with data that is purgeable.
No current user in the tree.
Tests: unit(dev), mutation_reader_test.compacting_reader_*(debug)
"
* 'compacting-reader/v3' of https://github.com/denesb/scylla:
test: boost/mutation_reader_test: add unit test for compacting_reader
test: lib/flat_mutation_reader_assertions: be more lenient about empty mutations
test: lib/mutation_source_test: make data compaction friendly
test: random_mutation_generator: add generate_uncompactable mode
mutation_reader: introduce compacting_reader
When expecting a mutation that compacts to an empty one, allow it to be
not produced at all. After all, compaction normally doesn't even emits
empty partitions.
Currently the mutation source test suite may generate data that is
compactable. This poses a problem for the next patch, where we want to
use it to test `compacting_reader` a reader which compacts data as it
reads it. When the input is compactable, this will introduce artificial
differences, failing the tests.
To allow also testing such readers, make sure data is not compactable,
i.e. compacting it will not change it.
The goal of the mutation source test suite is not to exercise compaction
logic, so this will not take anything away from its value.
The random mutation generator currently generates data and tombstones
with random timestamps selected from a pre-determined range. This
results in mutations where tombstones often cover each other and data.
There is nothing wrong with this, as this is how real data is too.
However for certain tests this is problematic, as compacting the
mutations will result in a different mutations. To cater for these users
too, introduce a `generate_uncompactable` option. When set to `yes`, the
generated mutations will be uncompactable, i.e. no tombstone will cover
lower-level tombstones and no tombstone will cover data. The mutations
will not change after compacted.
Compacting reader compacts the output of another reader on-the-fly.
Performs compaction-type compaction (`compact_for_sstables::yes`).
It will be used in streaming and repair to eliminate purgeable data from
the stream, thus prevent wasting transfer bandwidth.
Merged pull request https://github.com/scylladb/scylla/pull/5996 from
Calle Wilund:
Fixes#4992
Implements post-image support by synthesizing it from
pre-image + delta.
Post-image data differs from the delta data in two ways:
1.) It merges non-atomics into an actual result value
2.) It contains all columns of the row, not just
those affected by the update.
For a non-atomic field, the post-image value of a column
is either the pre-image or the delta (maybe null)
Tested by adding post-image checks to pre-image test
and collection/udt tests
Fixes#4992
Implements post-image support by synthesizing it from
pre-image + delta.
Post-image data differs from the delta data in two ways:
1.) It merges non-atomics into an actual result value
2.) It contains _all_ columns of the row, not just
those affected by the update.
For a non-atomic field, the post-image value of a column
is either the pre-image or the delta (maybe null)
Tested by adding post-image checks to pre-image test
and collection/udt tests
"
This PR makes it possible to enable the usage of different partitioner for each table. If no table-specific partitioner is set for a given table then a default partitioner is used.
The PR is composed of the following parts:
- Introduction of schema::get_partitioner that still returns dht::global_partitioner
- Replacement of all the usage of dht::global_partitioner with schema::get_partitioner
- Making it possible to set table-specific partitioner in a schema_builder
- Remove all the places that were setting default partitioner except for main.cc (mostly tests)
- Move default partitioner from i_partitioner to schema.cc and hide it from the rest of the codebase
- Remove dht::global_partitioner
After this PR there's no such thing as global partitioner at all. There is only a default partitioner but it still has to be accessed through schema::get_partitioner.
There are some intermediate states in which i_partitioner is stored as shared_ptr in the schema but the final version keeps it by const&.
The PR does not enable per table partitioner end-to-end. Just the internals of the single node are covered. I still have to deal with:
- Making sure a table has the same partitioner on each node
- Allowing user to set up a table-specific partitioner on table
- Signal driver about what partitioner is used by a given table
- Persist partitioner info for each table that does not use default partitioner.
Fixes#5493
Tests: unit(dev, release, debug), dtest(byo)
"
* 'per_table_partitioner' of https://github.com/haaawk/scylla:
schema: drop optional from _partitioner field
make_multishard_combining_reader: stop taking partitioner
split_range_to_single_shard: stop taking partitioner as argument
tests: remove unused murmur3 includes
partitioner: move default_partitioner to schema.cc
partitioner: hide dht::default_partitioner
schema: include partitioner name in scylla tables mutation
schema: make it possible to set custom partitioner
scylla_tables: add partitioner column
schema_features: add PER_TABLE_PARTITIONERS feature
features: add PER_TABLE_PARTITIONERS feature
* seastar 47d929dd1...3c498abca (5):
> reactor: Use do_with to save stack space
> reactor: Extract code into a schedule_retry helper
> reactor: Move an io_event buffer out of the stack
> temporary_buffer: fix typo in argument type in comparison operators
> tests: tls_test: add missing include <iostream>
This adds a warning with a different limit in each mode. The limit is
picked as 1KiB lower than the value where no warning would be print.
This makes it easy to spot the worse offender. With that we can either
fix it or silence the warning once we are sure we can handle large
frames in that context.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200311205300.324383-1-espindola@scylladb.com>
The bug is that we failed to implement this part of the formula:
(T - C) * log4(T)
We were incorrectly implementing it as:
(T - C) * log4(T - C)
So it could result in a backlog being calculated as negative when it
should actually be positive, or backlog being lower than expected.
BTW, we do protect against negative backlog after commit 3e08bd17f0.
Given that STCS backlog tracker is inherited by TWCS and LCS trackers,
all compaction strategies are affected.
The formula to calculate the aggregate backlog is:
A = (T - C) * log4(T) - Sum(i = 0...N) { (Si - Ci)* log4(Si) }.
For example, negative backlog is calculated on a tested scenario where T
was 3129, C was 2337 and Sum(i = 0...N) { (Si - Ci)* log4(Si) } resulted
in 4222.53.
(T - C) * log4(T - C) = (3129 - 2337) * log4(3129 - 2337) = 3813.23
So backlog is negative because A = 3813.23 - 4222.53 = -409.302
But it should actually be calculated as follow:
(T - C) * log4(T) = (3129 - 2337) * log4(3129) = 4598.15
And the correct backlog is positive, as A = 4598.15 - 4223.53 = 375.621
Fixes#6021.
tests: unit(dev)
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20200315153711.23302-1-raphaelsc@scylladb.com>
When the user performed
alter ks.t with compaction = {...}
the values of most other options, which were not specified in the
statement, e.g. compression, were left unchanged. That wasn't true for
extension options however: for example, the "cdc" option was removed.
This commit fixes the behavior to keep the old values of extension
options not specified in the alter statement.
The function already takes schema so there's no need
for it to take partitioner. It can be obtained using
schema::get_partitioner
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Remove last usage of this global outside i_partitioner.cc
and hide it inside the compilation unit.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
There are two results of this patch:
1. New partitioner name column is persited on node's disk in scylla_tables
2. New partitioner name column is included into schema digest
This is achieved by including this new column in scylla tables mutation.
For that we:
1. Add partitioner name to the result of make_scylla_tables_mutation.
If table does not have a specific partitioner set and uses default
partitioner then we don't include the name of such default partitioner.
Only the name of custom partitioner is added if a table has one.
2. In create_table_from_mutations we check whether scylla tables mutation
has a partitioner name set. If so then we use it as a parameter for
schema_builder.
Note that previous patches have ensured that this new column will be included
into schema digest only after the whole cluster supports per table partitioners.
Before that, during rolling upgrade, new partitioner name column is hidden and
not shared with other nodes.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
schema_builder::with_partitioner can be used now to
set custom partitioner on a table.
If no such partitioner is set, global partitioner is
still used.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Following commits make it possible to set a specific
partitioner for a table. We want to persist that information
and include it into schema digest. For that a new column
in scylla_tables is needed. This commit adds such column.
We add the new column to scylla_tables because it's a Scylla
specific extension.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
With per table partitioners, partitioner name will be a part
of table schema. To allow rolling upgrade we need to perform
special logic that hides new partitioner name schema column
during the upgrade. This commit adds new schema feature that
controls this logic.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
This new feature is required because we now allow
setting partitioner per table. This will influence
the digest of table schema so we must not include
partitioner name into the digest unless we know that
the whole cluster already supports per table partitioners.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Subscript operation `__getitem__()` was only added to re.match objects
in 3.6. To support previous versions, use `groups()` method to obtain
the desired group.
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200313160025.319464-1-bdenes@scylladb.com>
While CQL does not allow creation of a materialized view with more than one
base regular column in the view's key, in Alternator we do allow this - both
partition and clustering key may be a base regular column. We had a bug in
the logic handling this case:
If the new base row is missing a value for *one* of the view key columns,
we shouldn't create a view row. Similarly, if the existing base row was
missing a value for *one* of the view key columns, a view row does not
exist and doesn't need to be deleted. This was done incorrectly, and made
decisions based on just one of the key columns, and the logic is now
fixed (and I think, simplified) in this patch.
With this patch, the Alternator test which previously failed because of
this problem now passes. The patch also includes new tests in the existing
C++ unit test test_view_with_two_regular_base_columns_in_key. This tests
was already supposed to be testing various cases of two-new-key-columns
updates, but missed the cases explained above. These new tests failed
badly before this patch - some of them had clean write errors, others
caused crashes. With this patch, they pass.
Fixes#6008.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200312162503.8944-1-nyh@scylladb.com>
"
It doesn't look like we will be able to switch to std::string just
yet, but when it is not too inconvenient we can try to reduce our
dependence so that attempting the switch again in the future is
easier.
"
* 'espindola/sstring-api' of https://github.com/espindola/scylla:
redis: Use scattered_message::append(std::string_view)
everywhere: Use uninitialized_string instead of sstring::initialized_later
compressor: Add an explicit cast to const sstring&
everywhere: Be more explicit that we don't want std::make_shared
cql3: Don't use sstring::reset
everywhere: Don't assume sstring::begin() and sstring::end() are pointers
make_directory_for_column_family() is used in a parallel_for_each() in
parse_system_tables(). Because parallel_for_each does not preempt
in the initial execution of its input function, and because each thread
allocates 128k for the stack, we end up allocating many hundreds of
megabytes if there are many tables.
This happens early during boot and will only cause problems if
there are 5,000 tables per gigabyte of shard memory, and unlikely
combination that will probably fail later, but still it is better to
avoid unnecessary large allocations.
This was developed in order to fix#6003, until it was discovered that
c020b4e5e2 ("logalloc: increase capacity of _regions vector
outside reclaim lock") is the real fix.
Message-Id: <20200313093603.1366502-1-avi@scylladb.com>
In test_services.cc there is
gms::feature_service test_feature_service;
And the feature_service constructor has
, _lwt_feature(*this, features::LWT)
But features::LWT is a global sstring constructed in another file.
Solve the problem by making the feature strings constexpr
std::string_view.
I found the issue while trying to benchmark the std::string switch.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Acked-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20200309225749.36661-1-espindola@scylladb.com>
Since b783d40aa storage-proxy maintains separate coordinator stats per
scheduling group. This broke scylla_memory, which was still trying to
access the old global stats. Update it to print the new per-scheduling
group stats when they are available and the old global ones when not.
Scheduling groups for which all relevant metrics are 0 are omitted from
the printout to reduce noise.
In order to avoid the UI merge button which tends to
mess up commit authors, a simple script for pulling
a PR from GitHub is added.
Example usage:
$ git fetch; git checkout origin/next
$ ./scripts/pull_github_pr.sh 6007
Message-Id: <1fa79c8be47b5660fc24a81fc0ab381aa26d98af.1584014944.git.sarna@scylladb.com>
This patch adds a test, test_gsi.py::test_gsi_missing_attribute_3,
reproducing issue #6008. The issue is about a GSI with *two*
regular base columns becoming key columns in a view, and we have
a write failure when writing an item with one of these attributes
missing.
The test passes on DynamoDB, currently xfails on Alternator.
Refs #6008.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200312064131.16046-1-nyh@scylladb.com>
Recently, Materialized Views were modified (see issue #4365) so that local
view updates (when both base and view replicas are the same node) are
synchronous. In particular, when the view's partition key is the same as
the base table's, view writes are synchronous: A write now only returns
after CL copies of the view data have been written.
Alternator's LSI have exactly this case (same partition key as the base).
This makes strongly-consistent (CL=LOCAL_QUORUM) reads in Alternator work
correctly, so we update the documentation accordingly to no longer say
that we don't support this DynamoDB feature.
However unlike LSIs, for GSIs strongly-consistent reads are still not
supported, and should not be supported (they are also not supported by
DynamoDB). Such reads should generate an error. So this patch fixes this
too. A GSI test which tested that strongly consistent reads are forbidden,
which used to xfail, now passes so the patch removes the "xfail".
Finally, we can simplify the LSI tests by using consistent reads instead of
eventually-consistent reads with retries. Beyond simplifying the test, it's
also an opportunity to *use* strongly-consistent reads and make sure that
they work (while, as mentioned above, similar reads for GSIs are refused).
Fixes#5007
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200311170446.28611-1-nyh@scylladb.com>
* seastar 95f4277c16...664c911b4c (4):
> tls_test: Use uninitialized_string instead of initialized_later
> tls: Fix race and stale memory use in delayed shutdown
Fixes#5759 (maybe)
> tls: Re-enable TLS test and fix build+run
> tls: Set server name for client connection if available
Reclaim consults the _regions vector, so we don't want it moving around while
allocating more capacity. For that we take the reclaim lock. However, that
can cause a false-positive OOM during startup:
1. all memory is allocated to LSA as part of priming (2baa16b371)
2. the _regions vector is resized from 64k to 128k, requiring a segment
to be freed (plenty are free)
3. but reclaiming_lock is taken, so we cannot reclaim anything.
To fix, resize the _regions vector outside the lock.
Fixes#6003.
Message-Id: <20200311091217.1112081-1-avi@scylladb.com>
In theory the C++11 ABI should already have a size field but it does not
in the version of the C++ standard library shipped with scylla 2019.1.
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200225162337.112582-1-bdenes@scylladb.com>
Currently if `startswith` is passed to `resolve()` it will
unconditionally try to match the resolved symbol name against it. This
will of course fail when the symbols fails to resolve and `name` is
`None`. Return early when this happens to prevent the unnecessary
prefix matching.
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200310140918.88928-1-bdenes@scylladb.com>
The current method of obtaining the text range based on a known vptr
(`reactor::_backend`) was based on branch-2019.1, where
`reactor::_backend` is a value member. However in >=3.0
`reactor::_backend` is a `std::unique_ptr<>`. Adapt the code to work for
both.
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200310135957.86261-1-bdenes@scylladb.com>
Merged patch series by Piotr Sarna:
This series makes view updates synchronous, as long as the update
is going to be applied locally.
With this feature, local secondary indexes and, more generally,
materialized views with partition keys same as in the base table
could enjoy more robust consistency.
This series comes with a cql test, not common for materialized
views, which usually require eventual consistency checks. With
synchronous updates however, the test can simply check view values
right after updating the base table.
Fixes#4365
Refs #5007
Tests: unit(dev), manually via inserting sleeps and debug messages,
to make sure that local view updates are actually waited for
Piotr Sarna (4):
db,view: drop default parameter for mutate_MV::allow_hints
db,view: move putting view updates to background to mutate_MV
db,view: perform local view updates synchronously
test: add a simple test for synchronous local view updates
With synchronous local view updates enabled, local materialized views
can be queried right after base table insertions, without the risk
of reading stale values.
Local view updates (updates applied to a local node,
without remote communication) are from now on performed
synchronously - which adds consistency guarantees, as a local
write failure will be returned to the client instead of being
silently ignored.
Currently, launching view updates as an asynchronous background job
is done via not waiting for mutate_MV() future in
table::generate_and_propagate_view_updates. That has a big downside,
since mutate_MV() handles *all* view updates for *all* views of a table,
so it's not possible to wait for each view independently.
Per-view granularity is required in order to implement synchronous
view updates of local views - because then we'll synchronously
wait for all views that write to a local node (due to having a matching
partition key with the base), while remote view updates will still
be sent asynchronously.
In order to do that, instead of not waiting for mutate_MV,
we do wait for it properly, but instead launch the asynchronous,
unwaited-for futures inside mutate_MV.
Effectively that means no changes for view updates so far - all updates
will be fired in the background. Later, another patch will introduce
a way to wait for selected updates to finish.
This is just a trivial wrapper over initialized_later when using
sstring, but also works when std::string is used.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Some difference on how exactly the operator== is declared for sstring
versus std::string requires this change if we convert from sstring to
std::string.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
If sstring is made an alias to std::string ADL causes std::make_shared
to be found. Explicitly ask for ::make_shared.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
If we switch to using std::string we have to handle begin and end
returning iterators.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
It may be not safe to move sharded services, so it will be prohibited in
the future seastar update. Remove all current cases where we do it.
Fixes#5814.
Message-Id: <20200301095423.GY434@scylladb.com>
If the feature service is stopped without enabling some features,
the latrer may end up with "broken promise" exception on futures
attached to the _pr promise. Fix this by switching the only user
of it onto 'listener' API and remove future-based one.
Tests: unit(debug), manual start-stop and aborted-start
"
The cql_configu is needed by storage_service to feed it to
thrift/transport servers. These servers, in turn, put the
config onto query_options. The final goal of this config
reference is the guts of query_processor (but currently it's
only used by restrictions)
This way is rather long and confusing. It seems more natural
to keep the cql_config on it's main "user" -- query processor.
This patch set does so. However, in order to push the config
into its current usage places a huge refactoring is needed --
most of the classes in cql3/statements and cql3/restrictions.
It's much more handy to contunue keeping it via query_options,
so the query_processor is equipped with the method to return
the reference on the config to those initializing query_options.
Tests: unit(debug)
"
* 'br-clean-client-services-from-cql-config-2' of https://github.com/xemul/scylla:
storage_service: Forget cql_config
transport: Forget cql_config
thrift: Forget cql_config
query_processor: Carry reference on cql_config
The is_log_for_some_table function incorrectly assumed that
database::find_schema would return a null pointer in case the queried
schema does not exist. This patch fixes that, and now this function
checks for existence of the schema using database::has_schema.
Tests: unit(dev)
It is ok to run repair_row_level_stop unconditionally. The node that
hasn't received the repair_row_level_start will simply return an error
that the repair_meta_id is not found. To avoid the unnecessary
repair_row_level_stop verb, we can stop the nodes have run
repair_row_level_start. This also makes the error message less
confusing.
For example:
Before:
INFO 2020-03-09 15:55:43,369 [shard 0] repair - repair id 1 on shard 0
failed: std::runtime_error (get_repair_meta: repair_meta_id 8 for
node 127.0.0.4 does not exist)
INFO 2020-03-09 15:55:43,369 [shard 0] repair - repair id 1
failed: std::runtime_error ({shard 0: std::runtime_error
(get_repair_meta: repair_meta_id 8 for node 127.0.0.4 does not
exist)})
WARN 2020-03-09 15:55:43,369 [shard 0] repair - repair id 1 to
sync data for keyspace=ks, status=failed, keyspace does not exist
any more, ignoring it: std::runtime_error ({shard 0:
std::runtime_error (get_repair_meta: repair_meta_id 8 for node
127.0.0.4 does not exist)})
After:
INFO 2020-03-09 16:09:09,217 [shard 0] repair - repair id 1 on shard 0 failed:
std::runtime_error (Failed to repair for keyspace=ks, cf=cf,
range=(9041860168177642466, 9044815446631222376])
INFO 2020-03-09 16:09:09,217 [shard 0] repair - repair id 1 failed:
std::runtime_error ({shard 0: std::runtime_error (Failed to repair
for keyspace=ks, cf=cf, range=(9041860168177642466,
9044815446631222376])})
WARN 2020-03-09 16:09:09,217 [shard 0] repair - repair id 1 to sync data
for keyspace=ks, status=failed, keyspace does not exist any more,
ignoring it: std::runtime_error ({shard 0: std::runtime_error
(Failed to repair for keyspace=ks, cf=cf,
range=(9041860168177642466, 9044815446631222376])})
Refs #5942
It needs the config purely to feed one into thrift/transport
server, since the latter two no longer needs one, neither does
the former.
As a nice side effect -- some tests no longer have to carry
the cql_config on board.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
* seastar affc3a5107...5eaec672a2 (12):
> test_thread_custom_stack_size_failure: Use a larger custom stack
> test_thread_custom_stack_size: Use a larger custom stack
> log: correct help message
> perftune.py: verify NIC existence
> Merge "Fix various memory issues in http" from Rafael
> build: Fix IN_LIST usage
> future: Disable -Wuninitialized on a particular memcpy
> build: use IN_LIST for shorter cmake
> build: check support of "-fstack-clash-protection" before using it
> configure.py: Add "--verbose" flag
> configure.py: Make "cmake" command line human-readable
> net: dynamically adjust buffer sizes for posix connected_socket read operations
SimpleStrategy creates a list of endpoints by iterating over the set of
all configured endpoints for the given token, until we reach keyspace
replication factor.
There is a trivial coding bug when we first add at least one endpoint
to the list, and then compare list size and replication factor.
If RF=0 this never yields true.
Fix by moving the RF check before at least one endpoint is added to the
list.
Cassandra never had this bug since it uses a less fancy while()
loop.
Fixes#5962
Message-Id: <20200306193729.130266-1-kostja@scylladb.com>
Fixes#5899
When terminating (closing) a segment, we write a trailing block
of zero so reader can have an empty region after last used chunk
as end marker. This is due to using recycled, pre-allocated
segments with potentially non-zero data extending over the point
where we are ending the segment (i.e. we are not fully filling
the segment due to a huge mutation or similar).
However, if we reach end of segment writing the final block
(typically many small mutations), the file will end naturally
after the data written, and any trailing zero block would in fact
just extend the file further. While this will only happen once per
segment recycled (independent on how many times it is recycled),
it is still both slightly breaking the disk usage contract and
also potentially causing some disk stalls due to metadata changes
(though of course very infrequent).
We should only write trailing zero if we are below the max_size
file size when terminating
Adds a small size check to commitlog test to verify size bounds.
(Which breaks without the patch)
Message-Id: <20200226121601.15347-2-calle@scylladb.com>
Introduce a test which checks how different CQL features (DML, LWT,
MV) work when no replicas are available (e.g. because
they are all in an unavailable data center).
Specifically the test checks that when we SELECT with IN clause
and there are no available replicas, there is no crash (#5935).
Message-Id: <20200306192521.73486-3-kostja@scylladb.com>
The list of all endpoints for a query can be empty if we have
replication_factor 0 or there are no live endpoints for this token.
Do not access all_replicas.front() in this case.
Fixes#5935.
Message-Id: <20200306192521.73486-2-kostja@scylladb.com>
If base mutation has at least one row tombstone, its preimage log
entry is constructed from all the base columns.
Fixes#5709
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
In case a static and a clustering row is written at the same time, but
a clustering row with given key was not present, the preimage query was
incorrectly configured and no rows were returned. This resulted in an
empty preimage, while a preimage for static row should be present.
This patch fixes this and now the static row is correctly written to cdc
log in the case above.
Tests: unit(dev)
This change disallows creating CDC log tables for already existing
CDC log tables. CDC logs nested in that way are not really useful
and do not work at the moment, therefore disallowing their creation
prevents confusion.
Fixes#5967
Tests: unit(dev)
* piodul/5967-disallow-nested-cdc-logs:
cdc: disallow creating nested CDC logs
cql_repl: register schema extensions
... schema::get_partitioner and make schema::get_partitioner
return const&' from Piotr
Partitioners returned from get_partitioner are shared
and not supposed to be changed so let's use the type system
to enforce that.
dht::global_partitioner() is deprecated and will be removed
as soon as custom partitioners are implemented so it's best
to replace it with schema::get_partitioner.
Tests: unit(dev)
* hawk/global_partitioner_cleanup:
schema: get_partitioner return const&
compaction_manager: stop calling dht::global_partitioner()
sstable_datafile_test: stop calling dht::global_partitioner()
Previously the tokens were stored as strings
because token could have been represented in multiple ways.
Now token representation is always int64_t so we can
store them as ints in cdc description as well.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
This change disallows creating CDC log tables for already existing CDC
log tables. CDC logs nested in that way are not really useful and do not
work at the moment, therefore disallowing their creation prevents
confusion.
Alternator and CDC, apart from enabling their experimental features,
need to have their schema extensions registered. This patch adds missing
registration of schema extensions to cql_repl, so that cql tests written
with Alternator or CDC in mind will properly work.
Fixes#5902 by making the LIKE restriction keep a vector
of matchers and apply them all to the column value.
Tests: unit (dev)
* dekimir/multiple-likes:
cql3: Allow repeated LIKE on same column
cql3: Forbid calling LIKE::values()
cql3: Move LIKE::_last_pattern to matcher
In order to properly validate not only network topology strategy,
but also other strategies, the checks are moved straight to
validate_replication_factor().
Also, the test case is extended with a too long integer
and a check for SimpleStrategy replication factor.
Fixes#3801
Tests: unit(dev)
Message-Id: <e0c3c3c36c589e1d440c9708a6dce820c111b8da.1583483602.git.sarna@scylladb.com>
Check that XML output of a test is valid and warn otherwise.
The following tests currently produce a warning:
boost/multishard_mutation_query_test
Message-Id: <20200305213501.52279-2-kostja@scylladb.com>
In order to prevent users from creating a network topology
strategy instance with invalid inputs, it's not enough to use
std::stol() on the input: a string "3abc" still returns the number '3',
but will later confuse cqlsh and other drivers, when they ask for
topology strategy details.
The error message is now more human readable, since for incorrect
numeric inputs it used to return a rather cryptic message:
ServerError: stol()
This commit fixes the issue and comes with a simple test.
Fixes#3801
Tests: unit(dev)
Message-Id: <7aaae83d003738f047d28727430ca0a5cec6b9c6.1583478000.git.sarna@scylladb.com>
... and reading cdc metadata' from Piotr
Currently, information on what cdc options are enabled
in a table - cdc metadata in short - is stored in two places:
In cdc column of the system_schema.scylla_tables,
In a cdc schema extension.
The former is used as a source of truth, i.e. a node reads cdc metadata
from that column, while the latter is used for cosmetic purposes
(e.g. cqlsh displays info on cdc based on this extension)
and is only written, but never read by the node.
Introducing the cdc column to scylla_tables made the logic
of schema agreement more complicated. As a first step of removing
this column, this PR makes the cdc schema extension as the
"source of truth" - a node will from now on read cdc metadata
from that extension.
The cdc column will be deprecated and removed in subsequent releases,
but it is left for now and will still be written to in order not to break
the logic of schema agreement.
Acked-by: Nadav Har-El <nyh@scylladb.com>
Refs: #5737
Tests: unit(dev), 2-node cluster upgrade under write load to a cdc-enabled table
* piodul/5737-cdc-schema-extension:
schema: get cdc options from schema extensions
alter_table_statement: fix indentation
cf_prop_defs: initialize schema extensions externally
cf_prop_defs: move checking of cdc support to ::validate
cf_prop_defs: pass database& to ::validate, not db::extensions&
unit tests: register cdc extension before tests
cdc: construct cdc_options directly inside cdc_extension
db::extensions: add shorthands for add_schema_extension
Moves initialization of schema extensions outside of cf_prop_defs. This
allows to construct these extensions once, and use them several times in
cd_prop_defs' methods without caching or recalculating them several
times.
Changes cf_prop_defs::validate function to take database& as an argument
instead of db::extensions&. This change will allow us to move the check
which asserts that the cluster supports CDC from `apply_to_builder` to
`validate` method.
Instead of storing a raw map of options inside `cdc_extension`, the
extension now converts them into `cdc_options` directly on construction.
This removes the need to construct `cdc_options` object multiple times.
With #5950 we changed the representation of stream_id
in CDC Log from two int columns to a single blob column.
This PR cleans up stream_id representation internally.
Now stream_id is stored as blob both in-memory and in
internal CDC tables.
Tests: unit(dev)
* hawk/stream_id_representation:
cdc: store stream_ids as blobs in internal tables
cdc: improve do_update_streams_description
cdc: Fix generate_topology_description
cdc: add stream_id::operator<
cdc: change stream_id representation
Fixes#5891
Refs #5899
When creating segments with the o_dsync option active, we write max_size
zeros to disk, to ensure actual disk blocks are allocated.
However, if we recycle a segment, we should, when not actually creating
a new file, check the existing size on disk, and only zero any blocks
not already allocated (i.e. if recycled file was smaller than max_size,
due to segement truncation on close).
test: unit
Message-Id: <20200226121601.15347-1-calle@scylladb.com>
from Kamil.
If a batch update is performed with a sequence of changes with a single
timestamp, they will now show up in CDC with a single timeuuid
in the cdc$time column, distinguished by different cdc$batch_seq_no values.
Fixes#5953.
Tests: unit(dev)
* haaawk/splitbatch:
cdc: use a single timeuuid value for a batch of changes
cdc: replace `split` with `for_each_change`
Now it will show the full info about range being streamed, like
range_streamer - Rebuild with 127.0.0.2 for keyspace=ks2, streaming [72, 96) out of 248 ranges
The [x, y) range is semi-open one, the full streaming progress
then can be logged like
... streaming [0, 16) out of 36 ranges <- first send
... streaming [16, 24) out of 36 ranges
... streaming [24, 36) out of 36 ranges <- last send
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20200304101505.5506-1-xemul@scylladb.com>
If a batch update is performed with a sequence of changes with a single
timestamp, they will now show up in CDC with a single timeuuid in the
`time` column, distinguished by different `batch_seq_no` values.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Presently lightweight transactions piggy back the old
row value on prepare round response. If one of the participants
did not provide the old value or the values from peers don't match,
we perform a full read round which will repair the Paxos table and the
base table, if necessary, at all participants.
Capture the fact that read optimization has failed in a metric.
Message-Id: <20200304192955.84208-2-kostja@scylladb.com>
`for_each_change` is like `split` but it doesn't return a vector of
mutations representing each change; instead, it takes as a parameter
a function which gets called on each mutation.
This reduced the memory usage and allows to preserve common context
when handling each change (will be useful in next commits).
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
The relocatable package requires a magic dynamic linker path for
"patchelf" to work correctly. Therefore, use the "get-dynamic-linker.sh"
script to unconditionally define a magic dynamic linker path to ensure
that building the relocatable package with ninja build ("ninja-build
build/<mode>/scylla-package.tar.gz") is always correct. Although the
path looks odd with a lot of leading slashes, it works outside
relocatable package too.
Message-Id: <20200305091919.6315-2-penberg@scylladb.com>
In preparation for moving dynamic linker flags to ninja build, move the
magic dynamic linker path generation to "reloc/get-dynamic-linker.sh"
script that configure.py can call.
Message-Id: <20200305084331.5339-1-penberg@scylladb.com>
In new CDC Log format stream_id is represented by a single
blob column so it makes sense to store it in the same form
everywhere - including internal CDC tables.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Use std::set::insert that takes range instead of
looping through elements and adding them one by one.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
In new CDC Log format we store only a single stream_id column.
This means generate_topology_description has to use appropriate
schema for generating tokens for stream_ids.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
New CDC Log format stores stream ids as blobs.
It makes sense to keep them internally in the same form.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Currently, writes to a static row in a base table are not reflected
at all in the corresponding cdc log. This patch causes such writes
to be properly logged.
Fixes: #5744
Tests: unit(dev)
* piodul/5744-handle-static-row-correctly-in-cdc:
cdc_test: add tests for handling static row
cdc: fix indentation in transformer::transform
cdc: handle static rows separately in transformer::transform
cdc: move process_cells higher (and fix captured variables)
cdc: reduce dependencies on captured variables in process_cells
cdc: fix preimage query for static rows
Until this patch, we used the default_smp_service_group() when bouncing
Alternator requests between shards (which is needed for LWT).
This patch creates a new smp_service_group for this purpose, which is
limited to 5000 concurrent requests (the same limit used for CQL's
bounce_request_smp_service_group). The purpose of this limit is to avoid
many shards admitting a huge number of requests and bouncing all of them
to the same shard who now can't "unadmit" these requests.
Fixes#5664.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200304170825.27226-1-nyh@scylladb.com>
We use boost test logging primarily to generate nice XML xunit
files used in Jenkins. These XML files can be bloated
with messages from BOOST_TEST_MESSAGE(), hundreds of megabytes
of build archives, on every build.
Let's use seastar logger for test logging instead, reserving
the use of boost log facilities for boost test markup information.
Now, if CDC is enabled, `paxos_response_handler::learn_decision()`
augments the base table mutation. The differences in logic between:
(1) `mutate_internal<std::vector<mutation>>()`
and
(2) `mutate_internal<std::vector<std::tuple<paxos::proposal, schema_ptr, ...>>>()`
make it necessary to separate "CDC mutations" from "base mutation"
and send them, respectively, to (1) and (2).
Gleb explained in #5869 why it became necessary to add CDC code to LWT
writes specifically, instead of doing it somewhere central that affects
all writes:
"All paths that do write goes through mutate_internally() eventually so it
would have been best to do augmentations there, but cdc chose to log only
certain writes and not others (unlike MV that does not care how write
happened) and mutate_internal have no idea which is which so I do not have
other choice but code duplication. ... paxos_response_handler::learn_decision
is probably the place to add cdc augmentation."
Fixes#5869
It is possible to produce an empty mutation using CQL. For example, the
following query:
DELETE FROM ks.tbl WHERE pk = 0 AND ck < 1 AND ck > 2;
will attempt to delete from an empty range of rows. This is translated
to the following mutation:
{ks.tbl {key: pk{000400000000}, token:-3485513579396041028}
{mutation_partition:
static: cont=1 {row: },
clustered: {}}}
Such mutation does not contain any timestamp, therefore it is difficult
to determine what timestamp was used while making the query. This is
problematic for CDC, because an entry in CDC log should be written with
the same timestamp as a part of the mutation.
Because an empty mutation does not modify the table in any way, we can
safely skip logging such mutations in CDC and still preserve the
ability to reconstruct the current state of the base table from full
CDC log.
Tests: unit(dev)
Before this patch, `transform` did not generate any log rows about
static row change. This commit fixes that - now, a log row is created if
a static row is changed, and this row is separate from the rows that
describe changes to the clustering rows.
This is a preparation for moving the lambda outside the for loop.
- `log_ck`, `pikey`, `pirow` are now passed as arguments,
- `value` is now a variable local to the lambda,
- `ttl` is now a variable local to the lambda that is returned.
Most test-methods log a message with their names upon entering them.
This helps in identifying the test-method a failure happened in in the
logs. Two methods were missing this log line, so add it.
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200304155235.46170-1-bdenes@scylladb.com>
Regular compaction relies on compaction manager to run compaction jobs
until compaction strategy is satisfied. Resharding, on the other hand,
is an one-off operation which runs only once in compaction manager,
and leave the sstable set in such a way that the strategy is very
likely unsatisfied. We need to trigger regular compaction whenever
a resharding job replaces a shared sstable by an unshared sstable,
so that compaction will not fall way behind due to lots of new sstables
created by resharding process.
Fixes#5262.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20200217144946.20338-1-raphaelsc@scylladb.com>
Merged patch series from Avi Kivity:
boost/multiprecision is a heavyweight library, pulling in 20,000 lines of code into
each header that depends on it. It is used by converting_mutation_partition_applier
and types.hh. While the former is easy to put out-of-line, the latter is not.
All we really need is to forward-declare boost::multiprecision::cpp_int, but that
is not easy - it is a template taking several parameters, among which are non-type
template parameters also defined in that header. So it's quite difficult to
disentangle, and fragile wrt boost changes.
This patchset introduces a wrapper type utils::multiprecision_int which _can_
be forward declared, and together with a few other small fixes, manages to
uninclude boost/multiprecision from most of the source files. The total reduction
in number of lines compiled over a full build is 324 * 23,227 or around 7.5
million.
Tests: unit (dev)
Ref #1https://github.com/avikivity/scylla uninclude-boost-multiprecision/v1
Avi Kivity (5):
converting_mutation_partition_applier: move to .cc file
utils: introduce multiprecision_int
tests: cdc_test: explicitly convert from cdc::operation to uint8_t
treewide: use utils::multiprecision_int for varint implementation
types: forward-declare multiprecision_int
configure.py | 2 +
concrete_types.hh | 2 +-
converting_mutation_partition_applier.hh | 163 ++-------------
types.hh | 12 +-
utils/big_decimal.hh | 3 +-
utils/multiprecision_int.hh | 256 +++++++++++++++++++++++
converting_mutation_partition_applier.cc | 188 +++++++++++++++++
cql3/functions/aggregate_fcts.cc | 10 +-
cql3/functions/castas_fcts.cc | 28 +--
cql3/type_json.cc | 2 +-
lua.cc | 38 ++--
mutation_partition_view.cc | 2 +
test/boost/cdc_test.cc | 6 +-
test/boost/cql_query_test.cc | 16 +-
test/boost/json_cql_query_test.cc | 12 +-
test/boost/types_test.cc | 58 ++---
test/boost/user_function_test.cc | 2 +-
test/lib/random_schema.cc | 14 +-
types.cc | 20 +-
utils/big_decimal.cc | 4 +-
utils/multiprecision_int.cc | 37 ++++
21 files changed, 627 insertions(+), 248 deletions(-)
create mode 100644 utils/multiprecision_int.hh
create mode 100644 converting_mutation_partition_applier.cc
create mode 100644 utils/multiprecision_int.cc
This reduces the number of translation units that depend on
boost/multiprecision from 354 to 30, and reduces the size of
database.i (as an example) from 406160 to 382933 (smaller
files will benefit more, relatively).
Ref #1
The goal is to forward-declare utils::multiprecision_int, something
beyond my capabilities for boost::multiprecision::cpp_int, to reduce
compile time bloat.
The patch is mostly search-and-replace, with a few casts added to
disambiguate conversions the compiler had trouble with.
After the varint data type starts using the new multiprecision_int type,
this code fails to compile. I expect that somehow the conversion from enum
class to cpp_int was allowed to succeed, and we ended up with a data_value
of type varint. The tests succeeded because the serialized representation
happened to be the same.
Previously we had stream_id_1 and stream_id_2 columns
of type long each. They were forming a partition key.
In a new format we want a single stream_id column that
forms a partition key. To be able to still store two
longs, the new column will have type blob and its value
will be concatenated bytes of two longs that
partition key is composed of.
We still want partition key to logically be two longs
because those two values will be used by a custom partitioner
later once we implement it.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
multiprecision_int is a wrapper around boost::multiprecision::cpp_int that adds
no functionality. The intent is to allow forward declration; cpp_int is so
complicated that just finding out what its true type is a difficult exercise, as
it depends on many internal declarations.
Because cpp_int uses expression templates, the implementation has to explicitly
cast to the desired type in many places, otherwise the C++ compile is presented
with too many choices, especially in conjunction with data_value (which can
convert from many different types too).
converting_mutation_partition_applier is a heavyweight class that is not
used in the hot path, so it can be safely out-of-lined. This moves
some includes to boost/multiprecision out of header files, where they
can infect a lot of code.
mutation_partition_view.cc's includes were adjusted to recover
missing dependencies.
The previous version errorneously used local db reference
which was propagated into another shard. This time carry
the sharded instance and use .local() as before.
tests: unit(dev)
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20200303221729.31261-1-xemul@scylladb.com>
I found that a few variables in cql_test_env were wrapping sharded in
shared_ptr for no apparent reason. These patches convert them to plain
sharded<...>.
Currently, the Dockerfile installs the latest version of Scylla. Let's
add a VERSION argument to Dockerfile, which explicitly specifies the
version to ensure scripts, for example, always build the expected
version. If no VERSION is specified for "docker build", use the default
value of "666.development", which is the version number for latest
nightly.
"These two patches were made suspect of failing next promotion and
excluded from the original series."
* 'test.py.log' of https://github.com/kostja/scylla:
test.py: remove log output on success unless -s is specified
test.py: do not store entire log output in junit report.
The introduction of rsyslog had two errors in it.
Both errors are non fatal and the docker still works,
however, the system is left in a wrong state in which
supervisord marks rsyslogd service as failed (after several
failed retry attempts). Another bug in the configuration
causes rsyslog to output an error.
1) An inclusion command from a newer version was used
in rsyslogs main configuration file. This caused to rsyslog
to complain during startup but it didn't do much damage since
rsyslog converts every unrecognised command to a message command.
2) in the supervisord definition of the service, rsyslogd is ran
without the -n option which means it defaults to automatically
switch to the background. Supervisord interpret this as an unexpected
process termination and retries to start the process (unsuccessfully
because rsyslog protects itself from having multiple processes of
itself) and eventually marks it as down although it is fully up and
running.
This commit fixes both configuration problems.
Tests: Build and run docker and validate the errors are gone.
Fixes#5937
Currently, you have to build the relocatable package tarball with
./reloc/build_reloc.sh to be able to build an RPM out of it. You need to
do this because RPMS require SHA1 build-ids, but the build system does
not enforce that.
To prepare for adding RPM target to the ninja build, let's switch to
SHA1 build ID conditionally, because the performance difference between
xxhash and SHA1 is neglible. Rafael Avila de Espindola writes:
[...] the sha1 implementation in current lld is pretty fast. Linking
release scylla the times I get are
lld in fedora
fast 2.83739
sha1 3.51990
current lld
fast 2.6936
sha1 2.90250
And the sha1 implementation might get even faster:
https://bugs.llvm.org/show_bug.cgi?id=44138.
Message-Id: <20200303131806.22422-1-penberg@scylladb.com>
This reverts commit c6ddd21c50.
Uses database& instance across shards, which causes repair writer to
use the table object from the wrong shard.
Fixes#5907
"
This set cleans sstable_writer_config and surrounding sstables
code from using global storage_ and feature_ service-s and database
by moving the configuration logic onto sstables_manager (that
was supposed to do it since eebc3701a5).
Most of the complexity is hidden around sstable_writer_config
creation, this set makes the sstables_manager create this object
with an explicit call. All the rest are consequences of this change.
Tests: unit(debug), manual start-stop
"
* 'br-clean-sstables-manager-2' of https://github.com/xemul/scylla:
sstables: Move get_highest_supported_format
sstables: Remove global get_config() helper
sstables: Use manager's config() in .new_sstable_component_file()
sstable_writer_config: Extend with more db::config stuff
sstables_manager: Don't use global helper to generate writer config
sstable_writer_config: Sanitize out some features fields initialization
sstable_writer_config: Factor out some field initialization
sstables: Generate writer config via manager only
sstables: Keep reference on manager
test: Re-use existing global sstables_manager
table: Pass sstable_writer_config into write_memtable_to_sstable
Change rjson::get() to take std::string_view, instead of RapidJson's
version of that type, "StringRef". We already did the same change for
rjson::find() in a previous patch.
Not only is std::string_view more convenient for potential callers in Scylla,
this change also avoids a bug in FindMember() on StringRef where the length
is ignored (and instead, null-termination of the string is assumed).
This patch doesn't require any changes to callers, because we actually
had just a handful of remaining callers (most call sites switched to
rjson::find()), and all of them used string constants which could be
implicitly converted to StringRef or std::string_view just the same.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200303161019.1456-1-nyh@scylladb.com>
This patch adds a rjson::remove_member() wrapper to the RemoveMember
method, which takes a std::string_view. But beyond the convenience, this
actually works around a subtle bug in RemoveMember where, if given a
StringRef parameter, ignores its length (see upstream issue
https://github.com/Tencent/rapidjson/issues/1649).
In the one place we used RemoveMember, it forced us to copy the string
because it wasn't null-terminated. The solution proposed here involves
wrapping the string view in a GenericValue - which no longer needs to copy
the string, but still works around the bug.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200303143524.28300-1-nyh@scylladb.com>
Our rjson::find() convenience function used RapidJson's "StringRef" type,
which is almost exactly like std::string_view. If we switch to use
string_view as we do in this patch, a lot of call sites become much simpler.
Moreover, there was an even more important motivation for this patch:
the RapidJson FindMember() function we used in rjson::find() has a bug when
given a StringRef - although a StringRef contains a length, the FindMember()
code ignores it and expects the string to be null-terminated (see:
https://github.com/Tencent/rapidjson/issues/1649). In this patch, we wrap
the pointer and length of a std::string_view in an rjson::value, a code path
which bypasses the FindMember bug, and yet does not require copying the
string.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200303141814.26929-1-nyh@scylladb.com>
Merged pull request https://github.com/scylladb/scylla/pull/5940 from
Kamil Braun:
Add a bunch of new structs describing a change made
to a table, and an extract_changes function which takes a mutation and
returns the set of changes contained in this mutation, separated by
timestamp and ttl.
Add a split function which uses extract_changes to split a mutation into separate mutations, each describing a single change.
Static rows are put into separate changes now.
The pre_image_select function was fixed to select pre_image data always when
there is a static row/clustered row change, even if there were e.g. additional
range tombstones.
Fixes: #5719.
Tests: unit(dev)
When a node is overloaded requests usually start to queue up. Timeouts
are supposed to prevent queues from exploding and causing an OOM. One
prominent queue that tends to explode is the smp queue as it didn't
support timeouts and so requests would sit in the queue until the target
shard would process them. If the target shard is heavily overloaded
requests might accumulate faster then they are processed, surely leading
to an OOM.
To prevent this use the recently introduces timeout to
`seastar::smp::submit_to()` and derived APIs to time out write requests
sitting in the smp queue. We simply use the request's own timeout
for this purpose.
Fixes: #5055
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200303131658.741720-1-bdenes@scylladb.com>
If the mutation contains separate logical changes (e.g. with different
timestamps and/or ttls), it will be split into multiple mutations, each
passed into transform.
Previously we wouldn't retrieve the preimage if the mutation contained
something different than static/clustered row updates, e.g. if it
contained a partition deletion.
However, there are mutations created from batch statements which can
contain both a partition deletion and a set of row updates with a later
timestamp. We want to retrieve the preimage too in this case.
This commit introduces a bunch of new structs describing a change made
to a table, and an `extract_changes` function which takes a mutation and
returns the set of changes contained in this mutation, separated by
timestamp and ttl.
The function checks if there are multiple timestamps and/or ttls inside
a mutation, which means separate changes should be created for this
mutation in CDC.
Merged pull request https://github.com/scylladb/scylla/pull/5910 by
Calle Wilund:
Rename metadata and data columns according to new spec
Also use transformation methods for names in all code + tests
to make switching again easier
Break up data column tuple
Data column is now pure frozen original type.
If column is deleted (set to null), a metadata column cdc$deleted_ is set to true, to distinguish null column == not involved in row operation
For non-atomic collections, a cdc$deleted_elements_ column is added, and when removing elements from collection this is where they are shown.
For non-atomic assign, the "cdc$deleted_" is true, and is set to new value.
column_op removed.
The header sits in many other headers, but there's a handy
schema_fwd.hh that's tiny and contains needed declarations
for other headers. So replace shema.hh with schema_fwd.hh
in most of the headers (and remove completely from some).
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20200303102050.18462-1-xemul@scylladb.com>
According to "new" spec:
Data column is now pure frozen original type.
If column is deleted (set to null), a metadata column
cdc$deleted_<name> is set to true, to distinguish
null column == not involved in row operation
For non-atomic collections, a cdc$deleted_elements_<name>
column is added, and when removing elements from collection
this is where they are shown.
For non-atomic assign, the "cdc$deleted_<name>" is true,
and <name> is set to new value.
column_op removed.
`query_result_builder` is movable but if you actually try to move it
after it having consumed some fragments it will blow up in your face
when you try to use it again. This is because its `mutation_querier`
member received a reference to its `query::result::partition_writer`. Of
course the reference to the latter was invalidated on move so the former
accessed invalid memory. Since `query::result::partition_writer` wasn't
actually used for anything other, just move it into the
`mutation_querier`, making `query_result_builder` actually safe to move.
Fixes: #3158
Message-Id: <20190830142601.51488-1-bdenes@scylladb.com>
The function in question uses future-based .when_enabled() subscription
on cluster_supports_truncation_table feature. This method is considered
to be unsafe, so here's the patch that changes it onto feature::listener.
The completion of the migration is only awaited by a single test, so
this waiting mechanism is also slightly simplified.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Support LIKE operator condition on column expressions.
NOTE: following the existing code, the LIKE pattern value is converted
to raw bytes and passed straight as bytes_view to like_matcher
without type checking; it should be checked/sanitized by caller.
Refs: #5777
Branch URL: https://github.com/alecco/scylla/tree/as_like_condition_2
Tests: unit ({dev}), unit ({debug})
NOTE: fail for unrelated test test_null_value_tuple_floating_types_and_uuids
Extracting a certain element from a collection is a common task I have
to do while debugging cores. For certain collections (c-array,
std::array) this is trivial, for others it is easy enough (std::vector),
but for some (std::list) this is a tiresome work-intensive process.
This convenience function allows getting a reference to any element of
the supported container types, returning them for further use in the
interactive session.
Currently only `std::list` and `std::vector` are supported.
To be a generic convenience function for dereferencing all sorts of
smart pointers. For now `std::unique_ptr`, `seastar::lw_shared_ptr` and
`seastar::foreign_ptr` are supported.
`table` is not registered with the database, and hence will not be
waited on during shutdown.
Stop it explicitly to prevent any asynchronous operation on it racing
with shutdown.
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200302142845.569638-1-bdenes@scylladb.com>
* Mark cql3::attributes::raw class as final
* Change every occurrence of ::shared_ptr<attributes::raw>
to std::unique_ptr<...>
* Mark all methods in cql3::attributes::raw as const
* Remove redundant "_attrs" ptr copy in insert_json_statement,
use one from raw::modification_statement
* Fix odd indentation in cql3/statements/update_statement.cc
Tests: unit-tests (dev, debug)
Signed-off-by: Pavel Solodovnikov <pa.solodovnikov@scylladb.com>
Message-Id: <20200301223708.99883-1-pa.solodovnikov@scylladb.com>
_get_next() was recursively calling itself with index - 1 if index was
> 0. When we reached the desired element we always tried to use
member_types[0] as the type, which is incorrect since member_types
contains all types and doesn't change in get().
Fix by replacing recursion with iteration so that we keep the original
index.
Reviewed-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <1582900804-18681-1-git-send-email-tgrabiec@scylladb.com>
When wrapping rapidjson routines with safer, yieldable code,
parsing information was lost, because the JSON reader was not
checked for parsing errors before further processing.
That resulted in nearly all parsing errors being reduced to
"Assertion failed: StackSize() != 1". After this patch,
all various errors (missing quotations, colons, object names,
etc.) are properly returned for the user.
Message-Id: <968ce2f7539bf33d3eb829f0ab431b788d291602.1583134221.git.sarna@scylladb.com>
When running the Alternator tests, we don't care about verifying the
pedigree of the SSL certificates - we actually know the ones we use
in our test setups are fake, and not signed by any respectable certificate
authority.
We already use "verify=False" in many requests to avoid the certificate
checking, but then we start getting scary-looking warning messages about
an "Unverified HTTPS request is being made.". There's a way to disable
these warnings, but we only did in some cases, and there were still some
tests that show these warnings. Let's do it once, in a way that affects
all tests.
Message-Id: <20200301175607.8841-1-nyh@scylladb.com>
When incoming mutation contains live row marker the `operation` is
described as "insert", not as an "update".
Also, I extended the test case "test_row_delete" with one insert,
which is expected to log different value of `operation` than update
or delete. Renamed the test case accordingly.
Test cases that relied on "update" being the same as "insert" are
updated accordingly (`test_pre_image_logging`, `test_cdc_across_shards`,
`test_add_columns`).
Fixes#5723
"
Timeouts defaulted to `db::no_timeout` are dangerous. They allow any
modifications to the code to drop timeouts and introduce a source of
unbounded request queue to the system.
This series removes the last such default timeouts from the code. No
problems were found, only test code had to be updated.
tests: unit(dev)
"
* 'no-default-timeouts/v1' of https://github.com/denesb/scylla:
database: database::query*(), database::apply*(): remove default timeouts
database: table::query(): remove default timeout
mutation_query: data_query(): remove default timeout
mutation_query: mutation_query(): remove default timeout
multishard_mutation_query: query_mutations_on_all_shards(): remove default timeout
reader_concurrency_semaphore: wait_admission(): remove default timeout
utils/logallog: run_when_memory_available(): remove default timeout
Both rapidjson library and DynamoDB induce enough corner cases
for incorrect JSON, that the simplest way out is to simply
conform back to ValidationException in all cases.
This commit comes with an updated test, which is now aware
of 3 possible outcomes for an incorrect JSON:
a ValidationException, a SerializationException and HTTP 404.
Message-Id: <5e39d2dc077f4ea5ce360035a4adcddaf3a342a0.1582876734.git.sarna@scylladb.com>
This reverts commit 65aadad9a6. It causes
crashes (due to the coredump test) during package install, since scylla_coredump_setup
is called from rpm postinstall. The test should be done only from scylla_setup (and
the user should be warned).
Fixes#5916.
"
As part of avoiding static initialization order problems I want to
switch a few global sstring to constexpr std::string_view. The
advantage being that a constexpr variable doesn't need runtime
initialization and therefore cannot be part of a static initialization
order problem.
In order to do the conversion I needed to convert a few APIs to use
std::string_view instead of sstring and const sstring&.
These patches are the simple cases that are also an improvement in
their own right.
"
* 'espindola/string_view' of https://github.com/espindola/scylla: (22 commits)
test: Pass a string_view to create_table's callback
Pass string_view to the schema constructor
cql3: Pass string_view to the column_specification constructor
Pass string_view to keyspace_metadata::new_keyspace
Pass string_view to the keyspace_metadata constructor
utils: Use std::string as keys in nonstatic_class_registry
utils: Pass a string_view to class_registry::to_qualified_class_name
auth: Return a string_view from authorizer::qualified_java_name
auth: Return a string_view from authenticator::qualified_java_name
utils: Pass string_view to is_class_name_qualified
test: Pass a string_view to create_keyspace
Pass string_view to no_such_column_family's constructor
perf_simple_query: Pass a string_view to make_counter_schema
Pass string_view to the schema_builder constructor
types: Add more data_value constructors
transport: Pass a string_view to cql_server::connection::make_autheticate
transport: Pass a string_view to cql_server::response::write_string
cql3: Pass std::string_view to query_processor::compute_id
cql3: Remove unused variable
cql3: Pass a string_view to cf_statement::prepare_keyspace
...
One of them uses global storage_proxy instance, but since
it is not used -- remove it not to encourage anybody to start
calling one.
Another call uses the db.find_keyspace to check if a keyspace
exists, while there's a nicer db.has_keyspace helper (which
doesn't throw exceptions) so use it.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20200228123644.13931-1-xemul@scylladb.com>
This moves sstring copies from the callers to the constructor
implementation.
While at it, move the implementation out-of-line.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
The sstring::compare functions was never updated to work with
std::string_view. We could fix that, but it seems better to just
switch to std::string.
With a working compare function we can avoid copying the argument
passed to to_qualified_class_name when an entry is found in the map.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
This gives more flexibility to the implementations as they now don't
need to construct a sstring.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
This gives more flexibility to the implementations as they now don't
need to construct a sstring.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
With this we can construct a data_value from any string type. This
also avoids a few sstring copies.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
This avoids a copy in the callers. While at it, also make this
function non-virtual since it is never overwritten.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
This moves the string copy from the callers to the implementation of
to_internal_name.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
"
Reverse queries work by reading an entire partition into memory, then
start emitting its rows in reverse order. It is easy to see how this can
lead to disasters combined with large partitions. In fact a handful of
such reverse queries on large partitions is enough to bring a node down.
To prevent this, abort reverse queries, when we find out that the size
of the partition is larger than a limit. This might be annoying to users,
but I'm sure it is not as annoying as their nodes going down.
The limit is configurable via `max_memory_for_unlimited_query`
configuration option, which is 1MB by default. This limit is propagated
to each table, system tables having no limit. This limit is planned to
be used by other queries capable of consuming unlimited amount of
memory, like unpaged queries. Not in this series.
The proper solution would be to read the data in reverse (#1413), but
that is a major effort. In the meanwhile make sure the unsuspecting user
won't bring their nodes down with an innocent looking ordering
directive.
Note that for calculating the memory footprint of the
partition-in-question, only the clustering rows are used. This should be
fine, the 1MB limit is conservative enough that an eventual overshoot
caused by the omitted range tombstones and the static row would not make
a big difference.
Fixes: #5804
"
* 'limit-reverse-query-memory-consumption/v3' of https://github.com/denesb/scylla:
flat_mutation_reader: make_reversing_reader(): add memory limit
db/config: add config memory limit of otherwise unlimited queries
utils::updateable_value: add operator=(T)
flat_mutation_reader: expose reverse reader as a standalone reader
A static constructor was used to initialize update_row_query. That
constructor would call meta::roles_table::qualified_name() which would
access AUTH_KS which is also initialized by a static constructor in
another file, so the construction order is not guaranteed.
This change turns update_row_query into a function with a static local
variable in it. The static local is initialized at first use, fixing
the problem.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200227163916.19761-1-espindola@scylladb.com>
Merged patch series by Piotr Sarna:
This series makes json parsing yieldable in order
to prevent reactor stalls. It's done by:
1. Extracting the parsing stage out of alternator executor
2. Moving the parsing stage to a separate service,
which uses a static seastar thread (parallelism: 1)
3. Wrapping rjson parsing routines with a yieldable parser,
which takes advantage of running in a seastar thread
and occasionally performs maybe_yield()
Step 2 above is only used for JSON's big enough to potentially
create stalls - small requests will be parsed immediately,
without being redirected to a static thread.
Handling a PutItem operation with large JSONs
on my machine takes approximately:
1MB doc: ~30ms
3MB doc: ~90ms
12MB doc: ~350ms
out of which parsing itself is around:
1MB doc: ~7ms
3MB doc: ~20ms
12MB doc: ~80ms
(bonus: 400KiB doc: ~2ms)
; the document was a single object full of small items,
which triggers many allocations during parsing.
The above numbers were roughly the same before and after
the series, but the 12MB document did not cause reactor
stalls after the patch.
Note: writing the JSON can still be a source of stalls,
especially for large documents.
Note2: DynamoDB limits single value size to 400KiB,
but for batches it will be 16MiB total request size
Note3: If parallelism ever proves to be an issue,
it's easily increasable by spawning more static threads.
Refs: #5742
Tests: alternator(local)
manual
Piotr Sarna (12):
alternator: break lines in server callbacks
alternator: allow moving the request from rmw operation
alternator: move parsing in front of executor
alternator: convert parse to std::string_view
alternator: implement json parser inside the server
alternator: remove rjson::parse_raw
alternator: make rjson yieldable in thread context
alternator: fix returning raw JSON errors
alternator: change json errors class to SerializationException
alternator-test: rename large requests test to 'manual requests'
alternator-test: extract getting signed request helper
alternator-test: add tests for incorrect JSON documents
...ge_requests.py => test_manual_requests.py} | 53 +++--
alternator/executor.cc | 203 ++++++++----------
alternator/executor.hh | 33 +--
alternator/rjson.cc | 47 +++-
alternator/rjson.hh | 7 +-
alternator/rmw_operation.hh | 1 +
alternator/serialization.cc | 9 +-
alternator/server.cc | 111 ++++++++--
alternator/server.hh | 20 +-
9 files changed, 310 insertions(+), 174 deletions(-)
rename alternator-test/{test_large_requests.py => test_manual_requests.py} (70%)
When has_relevant_range_on_this_shard() found a relevant range, it will unnecessarily
iterate through the end. Verified manually that this could be thousands of pointless
iterations when streaming data to a node just added. The relevant code could be
simplified by de-futurizing it but I think it remains so to allow task scheduler
to preempt it if necessary.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20200220224048.28804-2-raphaelsc@scylladb.com>
This test suite can then be the parent of tests which use custom,
potentially not validated input in order to test alternator
against data not easy to push via boto3 or Python, due to their
implementation details.
Previously, alternator server was not directly sharded - and instead
kept a helper http server control class, which stored sharded http
server inside. That design is confusing and makes it hard to expand
alternator server with new sharded attributes, so from now on
the alternator server is itself sharded<>.
Tests: alternator-test(local, smp==1&smp==4)
Fixes#5913
Message-Id: <b50e0e29610c0dfea61f3a1571f8ca3640356782.1582788575.git.sarna@scylladb.com>
In order to be consistent with DynamoDB - a parsing error on incorrect
JSON input is reported as SerializationException instead of
ValidationException.
A couple of places in executor code leaked raw JSON errors to the user
instead of formulating a proper ValidationException message.
These places are now fixed, and the next patch in this series will
act as a regression checker, since all JSON errors will be returned
as SerializationException, not ValidationException instances.
In order to fight reactor stalls, rjson parsing and writing
routines can now yield if they run in seastar thread context.
In order to run a yieldable version of the parser which needs
to be run in seastar thread context, use parse_yieldable()
instead of parse().
The json parser runs in a static thread which accepts and parses
documents. Documents smaller than a parsing threshold
(currently: 16KiB) will be parsed in place without yielding.
The assumption is that most alternator requests are small
and there's no need to parse them in a yieldable way,
which also induces overhead. For reference, parsing a 128KiB
document made of many small objects with rapidjson takes
around 0.5 millisecond, and a 16KiB document is parsed
in around 0.06ms - a value small enough not to disturb
Seastar's current value of 0.5ms task quota too much.
Parsing a request string into JSON happens as a first thing
in every request, so it can be performed before calling
any executor callbacks. The most important thing however,
is that making parsing a separate stage allows certain optimizations,
e.g. running all parsing in a single seastar thread, which allows
adding yields to rjson parsing later.
If the reversing requires more memory than the limit, the read is
aborted. All users are updated to get a meaningful limit, from the
respective table object, with the exception of tests of course.
We have a few kind of queries whose memory consumption is not limited at
all. One of these is reverse queries, which reads entire partitions into
memory, before reversing them. These partitions can be larger than
memory and thus such a query can single-handedly cause OOM.
This patch introduces a configuration for a memory limit for such
queries. This will serve as a hard limit and queries which attempt to
use more memory than this, will be aborted.
The limit is propagated to table objects, with the intention of keeping
system tables unlimited. These tables are usually small and initiators
of system queries are not prepared for failures.
Currently reverse reads just pass a flag to
`flat_mutation_reader::consume()` to make the read happen in reverse.
This is deceptively simple and streamlined -- while in fact behind the
scenes a reversing reader is created to wrap the reader in question to
reverse partitions, one-by-one.
This patch makes this apparent by exposing the reversing reader via
`make_reversing_reader()`. This now makes how reversing works more
apparent. It also allows for more configuration to be passed to the
reversing reader (in the next patches).
This change is forward compatible, as in time we plan to add reversing
support to the sstable layer, in which case the reversing reader will
go.
No reason to disallow this. We still forbid mixing LIKE and non-LIKE
relations on the same column.
Fixes#5902.
Signed-off-by: Dejan Mircevski <dejan@scylladb.com>
The CMake build system in seastar.git exports the package to CMake
package registry. However, we don't use it when building from scylla.git
(we link to seastar directly) and get the following warning when
building with "dbuild" (that does not bind mount $HOME/.cmake):
CMake Warning at CMakeLists.txt:1180 (export):
Cannot create package registry file:
/home/penberg/.cmake/packages/Seastar/3b6ede62290636bbf1ab4f0e4e6a9e0b
No such file or directory
Let's just disable the package registry for our builds by setting the
CMAKE_EXPORT_NO_PACKAGE_REGISTRY CMake option as discussed here to make
the warning go away:
https://cmake.org/cmake/help/v3.4/variable/CMAKE_EXPORT_NO_PACKAGE_REGISTRY.html
Message-Id: <20200227092743.27320-1-penberg@scylladb.com>
To install scylla using install.sh easily, we need to run following things:
- add scylla user/group
- configure scylla.yaml
- run scylla_post_install.sh
But we don't want to run them when we build .rpm/.deb package,
we also need to add --packaging option to skip them.
Fixes#5830
Instead of keeping the LIKE pattern in a restriction object (as we
currently do), keep it in like_matcher. Also move the
pattern-idempotence check from the restriction to the matcher.
Signed-off-by: Dejan Mircevski <dejan@scylladb.com>
"
Here is a simple introduction to the node operations scylla supports and
some of the issues.
- Replace operation
It is used to replace a dead node. The token ring does not change. It
pulls data from only one of the replicas which might not be the
latest copy.
- Rebuild operation
It is used to get all the data this node owns form other nodes. It
pulls data from only one of the replicas which might not be the
latest copy.
- Bootstrap operation
It is used to add a new node into the cluster. The token ring
changes. Do no suffer from the "not the latest replica” issue. New
node pulls data from existing nodes that are losing the token range.
Suffer from failed streaming. We split the ranges in 10 groups and we
stream one group at a time. Restream the group if failed, causing
unnecessary data transmission on wire.
Bootstrap is not resumable. Failure after 99.99% of data is streamed.
If we restart the node again, we need to stream all the data again
even if the node already has 99.99% of the data.
- Decommission operation
It is used to remove a live node form the cluster. Token ring
changes. Do not suffer “not the latest replica” issue. The leaving
node pushes data to existing nodes.
It suffers from resumable issue like bootstrap operation.
- Removenode operation
It is used to remove a dead node out of the cluster. Existing nodes
pulls data from other existing nodes for the new ranges it own. It
pulls from one of the replicas which might not be the latest copy.
To solve all the issues above. We could use repair based node operation.
The idea behind repair based node operations is simple: use repair to
sync data between replicas instead of streaming.
The benefits:
- Latest copy is guaranteed
- Resumable in nature
- No extra data is streamed on wire
E.g., rebuild twice, will not stream the same data twice
- Unified code path for all the node operations
- Free repair operation during bootstrap, replace operation and so on.
Fixes: #3003Fixes: #4208
Tests: update_cluster_layout_tests.py + replace_address_test.py + manual test
"
* 'repair_for_node_ops' of https://github.com/asias/scylla:
docs: Add doc for repair_based_node_ops
storage_service: Enable node repair based ops for bootstrap
storage_service: Enable node repair based ops for decommission
storage_service: Enable node repair based ops for replace
storage_service: Enable node repair based ops for removenode
storage_service: Enable node repair based ops for rebuild
storage_service: Use the same tokens as previous bootstrap
storage_service: Add is_repair_based_node_ops_enabled helper
config: Add enable_repair_based_node_ops
repair: Add replace_with_repair
repair: Add rebuild_with_repair
repair: Add do_rebuild_replace_with_repair
repair: Add removenode_with_repair
repair: Add decommission_with_repair
repair: Add do_decommission_removenode_with_repair
repair: Add bootstrap_with_repair
repair: Introduce sync_data_using_repair
repair: Propagate exception in tracker::run
* seastar 7a3b4b4e4e...affc3a5107 (6):
> Merge "Add the possibility to remove rules from routes" from Pavel
> stall_detector: expose correct clock type to use
> queue: add has_blocked_consumer() function
> Merge "core: reduce memory use for idle connections" from Avi
> testing: Enable abort_on_internal_error on tests
> core: Add a on_internal_error helper
When we test Alternator on its HTTPS port (i.e., pytest --https),
we don't want requests to verify the pedigree of the SSL certificate.
Our "dynamodb" fixture (conftest.py) takes care of this for most of
the tests, but a few tests create their own requests and need to pass the
"verify=False" option on their own. In some tests, we forgot to do
this, and this patch fixes three tests which failed with "pytest --https".
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200226142330.27846-1-nyh@scylladb.com>
Merged pull request https://github.com/scylladb/scylla/pull/5897
from Juliusz Stasiewicz:
Column operation now contains operation::row_delete (== 2)
after queries like delete from tbl where pk=x and ck=y;. Before
this patch row deletes were treated as updates, which was incorrect
because updates do not contain row tombstones (and row deletes do).
Refs #5709
Column `operation` now contains `operation::row_delete` (== 2)
after queries like `delete from tbl where pk=x AND ck=y;`. Before
this patch row deletes were treated as updates, which was incorrect
because updates do not contain row tombstones (and row deletes do).
Refs #5709
Merged patch series from Piotr Sarna:
Alternator shutdown routines were only registered in main.cc,
but it's not enough - other operations, like decommision,
also rely on shutting down client servers.
In order to remedy the situation, a notion of client shutdown
listeners is introduced to storage service.
A shutdown listener implements a callback used by the storage
service when client servers need to shut down, and at the same
time it does not force storage service to keep a reference
for the client service itself.
NOTE: the interface can also be used later to provide
proper shutdown routines for redis and any other future APIs.
Fixes#5886
Tests: alternator-test(local, including a shutdown during the run)
Piotr Sarna (4):
storage_service: make shutdown_client_servers() thread-only
storage_service: add client shutdown hook
main: make alternator shutdown hook-based
main: reduce scope of alternator services
main.cc | 18 +++++++++---------
service/storage_service.cc | 22 +++++++++++++++++-----
service/storage_service.hh | 15 ++++++++++++++-
3 files changed, 40 insertions(+), 15 deletions(-)
On some environment systemd-coredump does not work with symlink directory,
we can use bind-mount instead.
Also, it's better to check systemd-coredump is working by generating coredump.
Fixes#5753
With the new shutdown routines in place, alternator executor
and server do not need to be declared outside of the `if` clause
which conditionally sets up alternator.
In order to properly handle not only shutdown, but also
decommission, drain and similar operations, alternator
shutdown is now registered as a client shutdown hook,
which allows storage service to trigger its shutdown routines.
Fixes#5886
The shutdown hook interface can be used later by additional
client interfaces (e.g. alternator, redis) to register
shutdown routines for various operations: Scylla shutdown,
node decommission, drain, etc. It also decouples
the services themselves from being part of the storage
service, since it's huge enough as it is.
Until now, PutItem or UpdateItem could be used to insert almost any JSON
as an attribute's value - even those that do not match DynamoDB's typed
value specification.
Among other things, the new validation allows us to reject empty sets,
strings or byte arrays - which are (somewhat artificially) forbidden in
DynamoDB.
Also added tests for the empty sets, strings and byte arrays that should
be rejected.
Fixes#5896
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200225150525.4926-1-nyh@scylladb.com>
DynamoDB does not support empty sets. Operations which remove elements
from a set attribute should remove the attribute when the last item is
removed - not leave an empty set as it incorrectly does now.
Incidentally, the same patch fixes another bug - deleting elements from
a non-existent set attribute should be allowed (and do nothing), not fail
as it does now.
This patch also includes tests for both bugs.
Fixes#5895
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200225125343.31629-1-nyh@scylladb.com>
We have not yet implemented the DELETE-with-value and ADD operations in
UpdateItem's old-style "AttributeUpdates" parameter - see issue #5864
and issue #5893, respectively
This patch include comprehensive tests for both features. The new tests
pass on DynamoDB, but currently xfails on Alternator - until these
features will be implemented.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200225105546.25651-1-nyh@scylladb.com>
Currenly `get_text_range()` uses heuristics about which ELF section
actually contains the text for the main executable. It appears that this
fails from time-to-time and we have to adjust the heuristics.
We don't really have to guess however, a much better method of
determining the section hosting text is to find a vtable pointer and
locate the section it resides in. For this, we use the
`reactor::_backend` as a canary. When this is not available, we fall
back to the pre-existing heuristics.
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200225164719.114500-1-bdenes@scylladb.com>
Fixes#5669
This implements non-atomic collection and UDT handling for
both cdc preimage + delta.
To be able to express deltas in a meaningful way (and reconstruct
using it), non-atomic values are represented somewhat
differently from regular values:
* maps - stored as is (frozen)
* sets - stored as is (frozen)
* lists - stored as map<timeuuid, value> (frozen)
this allows reconstructing the list, as otherwise
things like list[0] = value cannot be represented
in a meaningful way
* udt - stored as tuple<tuple<field0>, tuple<field1>...> (frozen)
UDTs are normally just tuples + metadata, but we need to
distinguish the case of outer tuple element == null, meaning
"no info/does not partake in mutation" from tuple element
being a tuple(null) (i.e. empty tuple), meaning "set field to
null"
* seastar 8b6bc659c7...7a3b4b4e4e (3):
> Merge "Add custom stack size to seastar threads" from Piotr
Ref #5742.
> expiring_fifo: Optimize memory usage for single-element lists
Ref #4235.
> Close connection, when reach to max retransmits
The global get_highest_supported_format helper and its declaration
are scattered all over the code, so clean this up and prepare the
ground for moving _sstables_format from the storage_service onto
the sstables_manager (not this set).
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Finally, the thing is not used by anyone and can be removed.
This greatly relaxes the sstables -> storage_service dependency.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Applauded-by: Benny Halevy <bhalevy@scylladb.com>
This is the last place left that calls for global get_config(),
switch it onto _sst_manager.config().
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The enable_sstable_key_validation and summary_bytes_cost are used
in sstables writing code, keeping them on sstable_writer_config
removes more calls to global get_config().
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The main goal of this patch is to stop using get_config() glbal
when creating the sstable_writer_config instance.
Other than being global the existing get_config() is also confusing
as it effectively generates 3 (three) sorts of configs -- one for
scylla, when db config and features are ready, the other one for
tests, when no storage service is at hands, and the third one for
tests as well, when the storage service is created by test env
(likely intentionally, but maybe by coincidence the resulting config
is the same as for no-storage-service case).
With this patch it's now 100% clear which one is used when. Also
this makes half the work of removing get_config() helper.
The db::config and feature_service used to initialize the managers
are referenced by database that creates and keeps managers on,
so the references are safe.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Similar to previous patch -- initialize config fields from features
in configurator, not in default initializers.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The promoted_index_block_size is taken from db config in two places.
Factor this out and, at the same time, stop keeping it as std::optional.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The sstable_writer_config creation looks simple (just declare
the struct instance) but behind the scenes references storage
and feature services, messes with database config, etc.
This patch teaches the sstables_manager generate the writer
config and makes the rest of the code use it. For future
safety by-hands creation of the sstable_writer_config is
prohibited.
The manager is referenced through table-s and sstable-s, but
two existing sstables_managers live on database object, and
table-s and sstable-s both live shorter than the database,
this reference is save.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
This is needed for further patching. The sstables_manager outlives
all sstables objects, so it's safe.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The sstables_manager in scylla binary outlives the sstables objects
created by it, this makes it possible to add sstable->manager reference
and use it. In unit tests there are cases when sstables::test_env that
keeps manager in _mgr field is destroyed right after sstable creation
(e.g. -- in the boost/sstable_mutation_test.cc ka_sst() helper).
Fix this by chaning the _mgr being reference on the manager and
initialize it with already existing global manager. Few exceptions
from this rule that need to set own large data handler will create
the sstable_manager their own.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The latter creates the config by hands, but the plan is to
create it via sstables_manager. Callers of this helper are the
final frontiers where the manager will be safely accessible.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
- Bootstrap operation
It is used to add a new node into the cluster. The token ring changes.
Do not suffer from the "not the latest replica” issue. New node pulls
data from existing nodes that are losing the token range.
Suffer from failed streaming. We split the ranges in 10 groups and we
stream one group at a time. Restream the group if failed, causing
unnecessary data transmission on wire.
Bootstrap is not resumable. Failure after 99.99% of data is streamed.
If we restart the node again, we need to stream all the data again even
if the node already has 99.99% of the data.
Fixes: #3003Fixes: #4208
Tests: update_cluster_layout_tests.py + replace_address_test.py + manual test
- Decommission operation
It is used to remove a live node form the cluster. Token ring
changes. Do not suffer “not the latest replica” issue. The leaving
node pushes data to existing nodes.
Fixes: #3003Fixes: #4208
Tests: update_cluster_layout_tests.py + replace_address_test.py + manual test
- Replace operation
It is used to replace a dead node. The token ring does not change. It
pulls data from only one of the replicas which might not be the
latest copy.
Fixes: #3003Fixes: #4208
Tests: update_cluster_layout_tests.py + replace_address_test.py + manual test
This patch adds a warning of deprecation to DTCS. In a follow up step,
we will start requiring a flag for it to be enabled to make sure users
notice.
For now we'll just be nice and add a warning for the log watchers.
Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20200224164405.9656-1-glauber@scylladb.com>
Since we set 'eth0' as default NIC name, we get following error when running scylla_setup in non-interactive mode without --nic parameter:
$ sudo scylla_setup --setup-nic-and-disks --no-raid-setup --no-verify-package --no-io-setup
NIC eth0 doesn't exist.
It looks strange since user actually does not specified 'eth0', they might forget to specify --nic.
I think we should shows up usage, when eth0 is not available on the system.
Fixes#5828
Changes the name of storage_proxy::mutate_hint_from_scratch function to
another name, whose meaning is more clear: send_hint_to_all_replicas.
Tests: unit(dev)
It seems like *.service is conflicting on install time because the file
installed twice, both debian/*.service and debian/scylla-server.install.
We don't need to use *.install, so we can just drop the line.
Fixes#5640
Aggregate functions on counters do not exist. Until now counters
could, at best, fall back to blob->blob overloads, e.g.:
```
cqlsh> select max(cnt) from ks.tbl;
system.max(cnt)
----------------------
0x000000000000000a
(1 rows)
cqlsh> select sum(entities) from ks.tbl;
InvalidRequest: Error from server: code=2200 [Invalid query]
message="Invalid call to function sum, none of its type signatures match
[...]
```
Meanwhile, counters are compatible with bigints (aka. `long_type'),
so bigint overloads can be used on them (e.g. sum(bigint)->bigint).
This is achieved here by a special rule in overload resolution, which
makes `selector' perceive counters as an `EXACT_MATCH' to counter's
underlying type (`long_type', aka. bigint).
Until now, attempts to print counter update cell would end up
calling abort() because `atomic_cell_view::value()` has no
specialized visitor for `imr::pod<int64_t>::basic_view<is_mutable>`,
i.e. counter update IMR type. Such visitor is not easy to write
if we want to intercept counters only (and not all int64_t values).
Anyway, linearized byte representation of counter cell would not
be helpful without knowing if it consists of counter shards or
counter update (delta) - and this must be known upon `deserialize`.
This commit introduces simple approach: it determines cell type on
high level (from `atomic_cell_view`) and prints counter contents by
`counter_cell_view` or `atomic_cell_view::counter_update_value()`.
Fixes#5616
By default, `/usr/lib/rpm/find-debuginfo.sh` will temper with
the binary's build-id when stripping its debug info as it is passed
the `--build-id-seed <version>.<release>` option.
To prevent that we need to set the following macros as follows:
unset `_unique_build_ids`
set `_no_recompute_build_ids` to 1
Fixes#5881
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
The official documentation language of Scylla is English, not French.
So correct the word "existant", which appeared several times throughout
Alternator's tests, to "existent".
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200221224221.31237-6-nyh@scylladb.com>
This patch completes the support for the ReturnValues parameter for
the UpdateItem operation. This parameter has five settings - NONE, ALL_OLD,
ALL_NEW, UPDATED_OLD and UPDATED_NEW. Before this patch we already
supported NONE and ALL_OLD - and this patch completes the support for the
three remaining modes: ALL_NEW, UPDATED_OLD and UPDATED_NEW.
The patch also continues to improve test_returnvalues.py with additional
corner cases discovered during the development. After this patch, only
one xfailing test remains - testing updates to nested document paths,
which we do not yet support (even without the ReturnValues parameter).
After this patch, the support of ReturnValues is complete - for all
operations (UpdateItem, PutItem and DeleteItem) and all of its possible
settings.
Fixes#5053
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200221224221.31237-5-nyh@scylladb.com>
The rjson::set_with_string_name() utility function copies the given
string into the JSON key. The existing implementation required that this
input string be an std::string&, but a std::string_view would be fine too,
and I want to use it in new code to avoid yet another unnecessary copy.
Adding the overloads also exposes a few places where things were
implicitly converted to std::string and now cause an ambiguity - and
clearing up this ambiguity also allowed me to find places where this
conversion was unnecessary.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200221224221.31237-4-nyh@scylladb.com>
UpdateItem operations usually need to add a row marker:
* An empty UpdateItem is supposed to create a new empty item (row).
Such an empty item needs to have a row marker.
* An UpdateItem to add an attribute x and then later an UpdateItem
to remove this attribute x should leave an empty item behind.
This means the first UpdateItem needed to add a row marker, so
it will be left behind after the second UpdateItem.
So the existing code always added a row marker in UpdateItem.
However, there is one case where we should NOT create the row marker:
When the UpdateItem operation only has attribute deletions, and nothing
else, and it is applied to a key with no pre-existing item, DynamoDB
does not create this item. So neither should we.
This patch includes a new test for this test_update_item_non_existent,
which passes on DynamoDB, failed on Alternator before this patch, and
passes after the patch.
Fixes#5862.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200221224221.31237-3-nyh@scylladb.com>
In issue #5698 I raised a theory that we might have a bug when
BatchWriteItem is given two writes to the *same* key but in two different
tables. The test added here verifies that this theory was wrong, and
this case already works correctly.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200221224221.31237-2-nyh@scylladb.com>
This series adds an option to the API that supports deleting
a specific table from a snapshot.
The implementation works in a similar way to the option
to specify specific keyspaces when deleting a snapshot.
The motivation is to allow reducing disk-space when using
the snapshot for backup. A dtest PR is sent to the dtest
repository.
Fixes#5658
Original PR #5805
Tests: (database_test) (dtest snapshot_test.py:TestSnapshot.test_cleaning_snapshot_by_cf)
* amnonh/delete_table_snapshot:
test/boost/database_test: adopt new clear_snapshot signature
api/storage_service: Support specifying a table when deleting a snapshot
storage_service: Add optional table name to clear snapshot
* amnonh/delete_table_snapshot:
test/boost/database_test: adopt new clear_snapshot signature
api/storage_service: Support specifying a table when deleting a snapshot
storage_service: Add optional table name to clear snapshot
The error message (silently) changed to "DB index is out of range" the
following commit:
c7a4e694ad
The new error message is part of Redis 4.0, released in 2017, so let's
switch Scylla to use the new one.
Message-Id: <20200211133946.746-1-penberg@scylladb.com>
The patch 759752947b explains why the .column_family method
of this statament implementation must be tuned to calculate
the column_family in some cases. However, to do this the global
storage_proxy is needed.
The proposal is to calculate the column_family in .validate
method, like it's done e.g. for function_statement-s, which
has storage_proxy reference at hands.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Get rid of numerous calls to get_local_stroage_proxy().get_db()
and use the storage proxy argument that's already avaliable in
most of them.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
- Removenode operation
It is used to remove a dead node out of the cluster. Existing nodes
pulls data from other existing nodes for the new ranges it own. It
pulls from one of the replicas which might not be the latest copy.
Fixes: #3003Fixes: #4208
Tests: update_cluster_layout_tests.py + replace_address_test.py + manual test
- Rebuild operation
It is used to get all the data this node owns form other nodes. It
pulls data from only one of the replicas which might not be the
latest copy.
Fixes: #3003Fixes: #4208
Tests: update_cluster_layout_tests.py + replace_address_test.py + manual test
With repair based node operations, we can resume previous failed
bootstrap. In order to do that, we need the bootstrap node uses the same
tokens as previous bootstrap.
Currently, we always use new tokens when we bootstrap, because we need
to stream all the ranges anyway. It does not matter if we use the same
tokens or not.
The rebuild and replace operations are similar because the token ring
does not change for both of them. Add a common helper to do rebuild and
replace with repair. It will be used by rebuild and replace operation
shortly.
It is used to sync data for node operations like bootstrap, decommission
and so on.
Unlike plain repair operation, the user of sync_data_with_repair() can
pass repair_neighbors object to specify the pre-calculated neighbors for
a range. If a mandatory neighbor is not available, the repair will fail
so that the upper layer can fail the node operation.
The original idea of prefixing alternator keyspace names with 'a#'
leveraged the fact that '#' is not a legal CQL character for keyspace
names. The idea is flawed though, since '#' proved to confuse
existing Scylla tools (e.g. nodetool).
Thus, the prefix is changed to more orthodox 'alternator_'.
It is possible to create such keyspaces with CQL as well, but then
the alternator CreateTable request would simply fail, because
the keyspace already exists, which is graceful enough.
Hiding alternator keyspaces and tables from CQL is another issue,
but there are other ways to distinguish them than a non-standard
prefix, e.g. tags.
Fixes#5883
The set_config registers lambdas that need db.local(), so
these routes must be registered after database is started.
Fixes: #5849
Tests: unit(dev), manual wget on API
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20200219130654.24259-1-xemul@scylladb.com>
On Debian, we don't add xfsprogs/mdadm on package dependency, install on
scylla_raid_setup script instead.
Since xfsprogs/mdadm only needed for constructing RAID, we can move
dependencies to scylla_raid_setup too.
In order to decrease the developer's time spent on waiting
for boto3 to retry the request many times, the retry count
is configured to be 3.
Two major benefits:
- vastly decrease wait time when debugging a failing test
- for requests which are expected to fail, but return results
not compatible with boto3, execution time is decreased
Tests: alternator-test(local,remote)
Message-Id: <46a3a9344d9427df7ea55c855f32b8f0e39c9b79.1582285070.git.sarna@scylladb.com>
The nr_ranges_streamed denotes the number of ranges streamed
so far, but by the time the sending lambda is called this
counter is already incremented by the number of ranges to be
streamed in this call. And the variable is not used for
anything else but logging.
Fix this by swapping logging with incrementing.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20200221101601.18779-1-xemul@scylladb.com>
This patch adds an alternative way to locate sstables by looking at
sstable sets in table objects:
scylla sstables -t
This may be useful for several things. One is to identify sstables
which are not attached to tables.
Another use case is to be able to use the command on older versions of
scylla which don't have sstable tracking.
Message-Id: <1582308099-24563-1-git-send-email-tgrabiec@scylladb.com>
I neither is used, we get the default behavior: only release is built
without stack guards.
With --disable-stack-guards all modes are built without stack guards.
With --enable-stack-guards all modes are built with stack guards.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200222012732.992380-1-espindola@scylladb.com>
The variable in question was used to check that the bootstrap mode
finishes correctly, but it was removed, becase this check was for
self-evident code and thus useless (dbca327b)
Later, the patch was reverted to keep track the bootstrap mode for
API is_cleanup_allowed call (a39c8d0e)
This patch is a reworked combination of both -- the variable is
kept for API sake, but in a much simpler manner.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20200221101813.18945-1-xemul@scylladb.com>
The _scheduled_gossip_task timer needs token_metadata and thus should
be stopped before. However, this is not always the case.
The timer is armed in start_gossiping, which is called by storage_service
init_server_without_the_messaging_service_part, and is canceled inside
stop_gossiping, which in turn is called by drain_on_shutdown, which in
turn is registered too late.
If something fails between the internals of the init_server_... and
defered registration of drain_on_shutdown (lots of reasons) the timer is
not stopped and may run, thus accessing the freed token_metadata.
Bandaid this by scheduling stop_gossiping right after the gossiper
instances are created. This can be too early (before storage_service
starts gossiping) or too late (after drain_on_shutdown stops it), but
this function is re-entrable.
Fixes#5844
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20200221085226.16494-1-xemul@scylladb.com>
* seastar 2b510220...cdda3051 (10):
> core: discard unused variable / function
> pollable_fd: use boost::intrusive_ptr rather than std::unique_ptr for lifecycle management
> build: check for pthread_setname_np()
> build: link against Threads::Threads
> future: Avoid recursion in do_for_each
> future: Expand description of parallel_for_each
> merge: Add content length limit to httpd
> tests/scheduling_group_test: verify current scheduling group is inherited as expected
> net: return future<> instead of subscription<>
> cmake: be more verbose when looking for libraries
Before this patch, we only supported the ReturnValues=NONE setting of the
PutItem, UpdateItem and DeleteItem operations.
This patch also adds full support for the ReturnValues=ALL_OLD option
in all three operation. This option directs Alternator to return the full
old (i.e., pre-modification) contents of the item.
We implement this as a RMW (read-modify-write) operation just as we do
other RMW operations - i.e., by default we use LWT, to ensure that we really
return the value of the item directly before the modification, the same
value that would have been used in a conditional expression if there was one.
NOTE: This implementation means one cannot use ReturnValues=ALL_OLD in
forbid_rmw write isolation mode. One may theorize that if we only need the
read-before-write for ReturnValues and not for a conditional expression,
it should have been enough to use a separate read (as we do in unsafe_rmw
isolation mode) before the write. But we don't have this "optimization" yet
and I'm not sure it's a valid optimization at all - see discussion in
a new issue #5851.
This patch completes the ReturnValues support for the PutItem and DeleteItem
operations. However, the third operation, UpdateItem, supports three more
ReturnValues modes: UPDATED_OLD, ALL_NEW and UPDATED_NEW. We do not yet
support those in this patch. If a user tries to use one of these three modes,
an informative error message will be returned. The three tests for these
three unimplemented settings continue to xfail, but the rest of the tests
in test_returnvalues.py (except one test of nested attribute paths) now
pass so their xfail flag is dropped.
Refs #5053
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200219135658.7158-1-nyh@scylladb.com>
Row cache needs to be invalidated whenever data in sstables
changes. Cleanup removes data from sstables which doesn't belong to
the node anymore, which means cache must be invalidated on cleanup.
Currently, stale data can be returned when a node re-owns ranges which
data are still stored in the node's row cache, because cleanup didn't
invalidate the cache."
Fixes#4446.
tests:
- unit tests (dev mode)
- dtests:
update_cluster_layout_tests.py:TestUpdateClusterLayout.simple_decommission_node_2_test
cleanup_test.py
* Pass raw::select_statement::parameters as lw_shared_ptr
* Some more const cleanups here and there
* lists,maps,sets::equals now accept const-ref to *_type_impl
instead of shared_ptr
* Remove unused `get_column_for_condition` from modification_statement.hh
* More methods now accept const-refs instead of shared_ptr
Every call site where a shared_ptr was required as an argument
has been inspected to be sure that no dangling references are
possible.
Tests: unit(dev, debug)
Signed-off-by: Pavel Solodovnikov <pa.solodovnikov@scylladb.com>
Message-Id: <20200220153204.279940-1-pa.solodovnikov@scylladb.com>
Due to a bug the entire segment is written in one huge write of 32Mb.
The idea was to split it to writes of 128K, so fix it.
Fixes#5857
Message-Id: <20200220102939.30769-1-gleb@scylladb.com>
There may be other commitlog writes waiting for zeroing to complete, so
not using proper scheduling class causes priority inversion.
Fixes#5858.
Message-Id: <20200220102939.30769-2-gleb@scylladb.com>
Row cache needs to be invalidated whenever data in sstables changes. Cleanup removes
data from sstables which doesn't belong to the node anymore, which means cache must
be invalidated on cleanup.
Currently, stale data can be returned when a node re-owns ranges which data are still
stored in the node's row cache, because cleanup didn't invalidate the cache.
To prevent data that belongs to the node from being purged from the row cache, cleanup
will only invalidate the cache with a set of token ranges that will not overlap with
any of ranges owned by the node.
update_cluster_layout_tests.py:TestUpdateClusterLayout.simple_decommission_node_2_test
now passes.
Fixes#4446.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
This procedure will calculate ranges for cache invalidation by subtracting
all owned ranges from the sstables' partition ranges. That's done so as
to reduce the size of invalidated ranges.
Refs #4446.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
In order to avoid stack overflow issues represented by the attached
test case, rapidjson's parser now has a limit of nested level.
Previous iterations of this patch used iterative parsing
provided by rapidjson, but that solution has two main flaws:
1. While parsing can be done iteratively, printing the document
is based on a recursive algorithm, which makes the iteratively
parsed JSON still prone to stack overflow on reads.
Documents with depth 35k were already prone to that.
2. Even if reading the document would have been performed iteratively,
its destruction is stack-based as well - the chain of C++ destructors
is called. This error is sneaky, because it only shows with depths
around 100k with my local configuration, but it's just as dangerous.
Long story short, capping the depth of the object to an arguably large
value (39) was introduced to prevent stack overflows. Real life
objects are expected to rarely have depth of 10, so 39 sounds like
a safe value both for the clients and for the stack.
DynamoDB has a nesting limit of 32.
Fixes#5842
Tests: alternator-test(local,remote)
Message-Id: <b083bacf9df091cc97e4a9569aad415cf6560daa.1582194420.git.sarna@scylladb.com>
This patch causes inclusive and exclusive range deletes to be
distinguished in cdc log. Previously, operations `range_delete_start`
and `range_delete_end` were used for both inclusive and exclusive bounds
in range deletes. Now, old operations were renamed to
`range_delete_*_inclusive`, and for exclusive deletes, new operations
`range_delete_*_exclusive` are used.
Tests: unit(dev)
User reported an issue that after a node restart, the restarted node
is marked as DOWN by other nodes in the cluster while the node is up
and running normally.
Consier the following:
- n1, n2, n3 in the cluster
- n3 shutdown itself
- n3 send shutdown verb to n1 and n2
- n1 and n2 set n3 in SHUTDOWN status and force the heartbeat version to
INT_MAX
- n3 restarts
- n3 sends gossip shadow rounds to n1 and n2, in
storage_service::prepare_to_join,
- n3 receives response from n1, in gossiper::handle_ack_msg, since
_enabled = false and _in_shadow_round == false, n3 will apply the
application state in fiber1, filber 1 finishes faster filber 2, it
sets _in_shadow_round = false
- n3 receives response from n2, in gossiper::handle_ack_msg, since
_enabled = false and _in_shadow_round == false, n3 will apply the
application state in fiber2, filber 2 yields
- n3 finishes the shadow round and continues
- n3 resets gossip endpoint_state_map with
gossiper.reset_endpoint_state_map()
- n3 resumes fiber 2, apply application state about n3 into
endpoint_state_map, at this point endpoint_state_map contains
information including n3 itself from n2.
- n3 calls gossiper.start_gossiping(generation_number, app_states, ...)
with new generation number generated correctly in
storage_service::prepare_to_join, but in
maybe_initialize_local_state(generation_nbr), it will not set new
generation and heartbeat if the endpoint_state_map contains itself
- n3 continues with the old generation and heartbeat learned in fiber 2
- n3 continues the gossip loop, in gossiper::run,
hbs.update_heart_beat() the heartbeat is set to the number starting
from 0.
- n1 and n2 will not get update from n3 because they use the same
generation number but n1 and n2 has larger heartbeat version
- n1 and n2 will mark n3 as down even if n3 is alive.
To fix, always use the the new generation number.
Fixes: #5800
Backports: 3.0 3.1 3.2
Previously we required MODIFY permissions on all materialized views in
order to modify a table. This is wrong, because the views should be
synced to the table unconditionally. For the same reason,
users *shouldn't* be granted MODIFY on views, to prevent them manually
changing (and breaking) a view.
This patch removes an explicit permissions check in
modification_statement introduced by 65535b3. It also tests that a
user can indeed modify a table they are allowed to modify, regardless
of lacking permissions on the table's views and indices.
Fixes#5205.
Signed-off-by: Dejan Mircevski <dejan@scylladb.com>
"
This series ensures the server more often than not initializes
raw_cql_statement, a variable responsible for holding the original
CQL query, and adds logging events to all places executing CQL,
and logs CQL text in them.
A prepared statement object is the third incarnation of
parser output in Scylla:
- first, we create a parsed_statement descendent.
This has ~20 call sites inside Cql.g
- then, we create a cql_statement descendent, at ~another 20 call sites
- finally, in ~5 call sites we create a prepared statement object,
wrapping cql_statement. Sometimes we use cql_statement object
without a prepared statement object (e.g. BATCHes).
Ideally we'd want to capture the CQL text right in the parser, but
due to complicated transformations above that would require
patching dozens of call sites.
This series moves raw_cql_statement from class prepared_statement
to its nested object, cql_statement, batches, and initializes this
variable in all major call sites. View prepared statements and
some internal DDL statements still skip setting it.
"
* 'query_processor_trace_cql_v2' of https://github.com/kostja/scylla:
query_processor: add CQL logging to all major execute call sites.
query_procesor: move raw_cql_statement to cql_statement
query_processor: set raw_cql_statement consistently
In the state of Alternator in docs/alternator/alternator.md, we said that
BatchWriteItem doesn't check for duplicate entries. That is not true -
we do - and we even have tests (test_batch_write_duplicate*) to verify that.
So drop that comment.
Refs #5698. (there is still a small bug in the duplicate checking, so still
leaving that issue open).
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200219164107.14716-1-nyh@scylladb.com>
In commit 388b492040, which was only supposed
to move around code, we accidentally lost the line which does
_executor.local()._stats.total_operations++;
So after this commit this counter was always zero...
This patch returns the line incrementing this counter.
Arguably, this counter is not very important - a user can also calculate
this number by summing up all the counters in the scylla_alternator_operation
array (these are counters for individual types of operations). Nevertheless,
as long as we do export a "scylla_alternator_total_operations" metric,
we need to correctly calculate it and can't leave it zero :-)
Fixes#5836
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200219162820.14205-1-nyh@scylladb.com>
It will store the ranges to be invalidated in row cache on compaction
completion. Intended to be used by cleanup compaction.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
compaction_completion_desc will eventually store more information that can be
customized by the compaction type.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
This descriptor contain all information needed for table to be properly
updated on compaction completion. A new member will be added to it soon,
which will store ranges to be invalidated in row cache on behalf of
cleanup compaction.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
... when clustering key is unavailable' from Benny
This series fixes null pointer dereference seen in #5794efd7efe cql3: generate_base_key_from_index_pk; support optional index_ck
7af1f9e cql3: do_execute_base_query: generate open-ended slice when clustering key is unavailable
7fe1a9e cql3: do_execute_base_query: fixup indentation
Fixes#5794
Branches: 3.3
Test: unit(dev) secondary_indexes_test:TestSecondaryIndexes.test_truncate_base(debug)
* bhalevy/fix-5794-generate_base_key_from_index_pk:
cql3: do_execute_base_query: fixup indentation
cql3: do_execute_base_query: generate open-ended slice when clustering key is unavailable
cql3: generate_base_key_from_index_pk; support optional index_ck
There has been recently discussed several problems when stopping
migration manager and features.
The first issue is with migration manager's schema pull sleeping
and potentially using freed migration manager instances.
Two others are with freeing database and migration manager before
features they wait for are enabled.
1. Only call base_ck = generate_base_key_from_index_pk<...
if the base schema has a clustering key.
2. Only call command->slice.set_range(*_schema, base_pk, ...
if the base schema has a clustering key,
otherwise just create an open ended range.
Proposed-by: Piotr Sarna <sarna@scylladb.com>
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
When dropping a table, the table and its views are dropped
in parallel, this is not a problem as for itself but we
have mechanism to snapshot a deleted table before the
actual delete. When a secondary index is removed, in the
snapshot process it looks for it's schema for creating the
schema part of the snapshot but if the main table is already
gone it will not find it.
This commit serializes views and main table removals and
removes the views prior to the tables.
See discussion on #5713
Tests:
Unit tests (dev)
dtest - A test that failed on "can't find schema" error
Fixes#5614
* eliran/serialize_table_views_deletion:
Materialized Views: serialize tables and views creation
Materialized Views: drop materialized views before tables
Now the database keeps reference on feature service, so we
can listen on the feature in it directly.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The maybe_schedule_schema_pull waits for schema_tables_v3 to
become available. This is unsafe in case migration manager
goes away before the feature is enabled.
Fix this by subscribing on feature with feature::listener and
waiting for condition variable in maybe_schedule_schema_pull.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The get_string_attribute() function used attribute_value->GetString()
to return an std::string. But this function does not actually return a
std::string - it returns a char*, which gets implicitly converted to
an std::string by looking for the first null character. This lookup is
unnecessary, because rjson already knows the length of the string, and
we can use it.
This patch is just a cleanup and a very small performance improvement -
I do not expect it fixes any bugs or changes anything functional, because
JSON strings anyway cannot contain verbatim nulls.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200219101159.26717-1-nyh@scylladb.com>
When called from indexed_table_select_statement::do_execute_base_query,
old_paging_state->get_clustering_key() may return un-engaged
optional<clustering_key>. Dereferencing it unconditionally crashes
scylla as seen in https://github.com/scylladb/scylla/issues/5794Fixes#5794
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
The sleep is interrupted with the abort source, the "wait" part
is done with the existing _background_tasks gate. Also we need
to make sure the gate stays alive till the end of the function,
so make use of the async_sharded_service (migration manager is
already such).
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
This change serializes tables and views creation. The
changes purpose is to avoid future possible races due to
a view searching for its base table information while the
later haven't been created yet.
When dropping a table, the table and its views are dropped
in parallel, this is not a problem as for itself but we
have mechanism to snapshot a deleted table before the
actual delete. When a secondary index is removed, in the
snapshot process it looks for its schema for creating the
schema part of the snapshot but if the main table is already
gone it will not find it.
This commit serializes views and main table removals and
removes the views prior to the tables.
See discussion on https://github.com/scylladb/scylla/pull/5713
Tests:
Unit tests (dev)
dtest - A test that failed on "can't find schema" error
Fixes#5614
De-pointerize cql3 code APIs further: change some call sites
to pass `schema` as const-ref instead of `shared_ptr`.
Affected functions known to be expecting always non-null
pointer to schema and don't store or pass the pointer somewhere
else, assuming it's safe to give them just a reference.
Tests: unit(dev, debug)
Signed-off-by: Pavel Solodovnikov <pa.solodovnikov@scylladb.com>
Message-Id: <20200218142338.69824-1-pa.solodovnikov@scylladb.com>
This patch fixes a bug that appears because of an incorrect interaction
between counters and hinted handoff.
When a counter is updated on the leader, it sends mutations to other
replicas that contain all counter shards from the leader. If
consistency level is achieved but some replicas are unavailable, a hint
with mutation containing counter shards is stored.
When a hint's destination node is no longer its replica, it is
attempted to be sent to all its current replicas. Previously, if the
cluster did not have the feature HINTED_HANDOFF_SEPARATE_CONNECTION
enabled, storage_proxy::mutate function would be used for the purpose of
sending the hint. It was incorrect because that function treats
mutations for counter tables as mutations containing only a delta (by
how much to increase/decrease the counter). These two types of mutations
have different serialization format, so in this case a "shards" mutation
is reinterpreted as "delta" mutation, which can cause data corruption to
occur.
This patch fixes the case when HINTED_HANDOFF_SEPARATE_CONNECTION is
disabled, and uses storage_proxy::mutate_internal, which treats "shards"
mutation as regular mutations - which is the correct behavior.
Refs #5833.
Tests: unit(dev)
Refs #817
Truncation is potentially long. It has its own timeout in storage
proxy/rpc. This value should probably also be higher than default
timeout.
Message-Id: <20200218135926.26522-1-calle@scylladb.com>
The clear_snapshot method signature was modified and accept a table name
parameter.
This patch adds an empty table name to the clear_snapshot test so it
would compile and pass.
Signed-off-by: Amnon Heiman <amnon@scylladb.com>
There are cases when it is useful to delete specific table from a
snapshot.
An example is when a snapshot is used for backup. Backup can take a long
period of time, during that time, each of the tables can be deleted once
it was backup without waiting for the entire backup process to
completed.
This patch adds such an option to the database and to the storage_service
wrapping method that calls it.
If a table is specified a filter function is created that filter only
the column family with that given name.
This is similar to the filtering at the keyspace level.
Signed-off-by: Amnon Heiman <amnon@scylladb.com>
This patch adds additional tests for the ReturnValues feature to make the
test even more comprehensive. As this feature is not yet implemented in
Alternator (see issue #5053), all tests XFAIL on Alternator - except two
tests for the trivial "NONE" mode which is already supported. As usual
all tests pass on DynamoDB.
This patch also splits the tests for the ReturnValues parameter in the
UpdateItem operation into multiple tests, each testing one of the different
modes which DynamoDB supports - NONE, ALL_OLD, UPDATED_OLD, ALL_NEW and
UPDATED_NEW. The separate tests will be useful if we implement this feature
incrementally - so the separate modes can be tested separately.
Refs #5053.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200218085618.5584-1-nyh@scylladb.com>
This patch adds a new document, docs/protocols.md, about all the different
protocols which Scylla supports - and the different ports which they use.
This includes Scylla's internal protocol, user-facing protocols (CQL, Thrift,
DynamoDB, Redis, JMX) and things inbetween (REST API, Prometheus).
I wrote this document after being frustrated that when I see a port number
(e.g., "7000") or a port option name (e.g., "storage_port") it's hard to
figure out what they actually are - or why they are given such strange
names. The intention is that this file can easily be searched for option
names, for more familiar names (e.g., "CQL"), and a reader can get the
whole story - including some pointers to relevant part of the code (this
part of the document can be improved further - in this version this only
exists for the internal protocol).
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200217172049.25510-1-nyh@scylladb.com>
* seastar c7c249f67d...2b51022073 (8):
> dns_test: Test with seastar.io instead of www.google.com
> sharded: fix move constructor for peering_sharded_service
Fixes#5814.
> tests: Delete Seastar.dist
> reactor: distinguish structs from classes when befriending
> util/tuple_utils.hh: avoid redundant move
> io_request: do not include fmt/format.h
> reactor: cleanup write_some leftover
> posix: change the signature of accept/try_accept
"
This change set is comprised of several unrelated patches regarding
some cleanups in cql3 layer code.
Most of the changes are aimed at eliminating superfluous `shared_ptr`
usages. In places where it can be safely assumed that objects passed
to the function are considered non-null and constant, these places
were adjusted to use passing as const ref instead.
Other changes incude eliminating unused arguments at some functions
and replacing usages of `shared_ptr<service::pager::paging_state>`
to use `lw_shared_ptr` instead, since `pager::paging_state` is final.
Tests: unit(dev, debug)
"
* 'feature/cql_cleanups_4' of https://github.com/ManManson/scylla:
cql3: minor sweeps through the cql layer code to reduce shared_ptrs count
cql3: change some function signatures to accept const references
cql3: change signatures of several functions to return crefs instead of pointers
cql3: remove unused argument at functions::castas_functions::get
paging_state: switch from shared_ptr to lw_shared_ptr
When replaying a hint with a destination node that is no longer in the
cluster, it will be sent with cl=ALL to all its new replicas. Before
this patch, the MUTATION verb was used, which causes such hints to be
handled on the same connection and with the same priority as regular
writes. This can cause problems when a large number of hints is
orphaned and they are scheduled to be sent at once. Such situation
may happen when replacing a dead node - all nodes that accumulated hints
for the dead node will now send them with cl=ALL to their new replicas.
This patch changes the verb used to send such hints to HINT_MUTATION.
This verb is handled on a separate connection and with streaming
scheduling group, which gives them similar priority to non-orphaned
hints.
Refs: #4712
Tests: unit(dev)
" from Botond
Nodetool scrub rewrites all sstables, validating their data. If corrupt
data is found the scrub is aborted. If the skip-corrupted flag is set,
corrupt data is instead logged (just the keys) and skipped.
The scrubbing algorithm itself is fairly simple, especially that we
already have a mutation stream validator that we can use to validate the
data. However currently scrub is piggy-backed on top of cleanup
compaction. To implement this flag, we have to make scrub a separate
compaction type and propagate down the flag. This required some
massaging of the code:
* Add support for more than two (cleanup or not) compaction types.
* Allow passing custom options for each compaction type.
* Allow stopping a compaction without the manager retrying it later.
Additionally the validator itself needed some changes to allow different
ways to handle errors, as needed by the scrub.
Fixes: #5487
* https://github.com/denesb/nodetool-scrub-skip-corrupted/v7:
table: cleanup_sstables(): only short-circuit on actual cleanup
compaction: compaction_type: add Upgrade
compaction: introduce compaction_options
compaction: compaction_descriptor: use compaction options instead of
cleanup flag
compaction_manager: collect all cleanup related logic in
perform_cleanup()
sstables: compaction_stop_exception: add retry flag
mutation_fragment_stream_validator: split into low-level and
high-level API
compaction: introduce scrub_compaction
compaction_manager: scrub: don't piggy-back on upgrade_sstables()
test: sstable_datafile_test: add scrub unit test
CDC can get all it needs from a config and does not need
partitioner.
For base table specific operations CDC is using partitioner
from that table (obtained with schema::get_partitioner).
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
All the places that use partitioner have been switched
to not use global partitioner any more and we can stop
setting it in this test.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
random_schema already has a _schema field which in turn
has a get_partitioner() function. Store partitioner
in random_schema is redundant.
At the moment all uses of random_schema are based on
default partitioner so it is not necessary to set it
explicitly. If in the future we need random_schema to
work with other partitioners we will add the constructor
back and fix the creation of _schema to contain it. It's
not needed now though.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
parse functions now take const schema& which allows
them to reach a partitioner. It's safe to take schema
by const& because the only caller takes the schema
from an sstable object.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
and replace all calls to dht::global_partitioner().get_token
dht::get_token is better because it takes schema and uses it
to obtain partitioner instead of using a global partitioner.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
and replace all dht::global_partitioner().decorate_key
with dht::decorate_key
It is an improvement because dht::decorate_key takes schema
and uses it to obtain partitioner instead of using global
partitioner as it was before.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Take const schema& as a parameter of shard_of and
use it to obtain partitioner instead of calling
global_partitioner().
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
ring_position_exponential_sharder calls global_partitioner
in one constructor. Luckily the constructor is never used so
we can remove that constructor.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
This requires a change in a repair that uses
selective_token_range_sharder.
Repair performs operation on a set of tables. We will have to
make sure that all of that tables use the same partitioner.
This is achieved by adding a check to a repair_info constructor.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Remove ring_position_range_sharder(nonwrapping_range<ring_position>)
which calls another constructor with partitioner obtained with
dht::global_partitioner().
Fix all the places the removed constructor was used and obtain
partitioner from schema instead.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
i_partitioner.hh is widely included while sharders are used
only in 6 places so there's no need to include them in
the whole codebase.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
The next patch is moving sharders to a separate header.
ring_position_exponential_vector_sharder is not used anywhere
so instead of just silently removing it with the move, this
commit is separated to make it clear the class is removed.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
The plan is to remove dht::global_partitioner()
and use schema::get_partitioner() instead.
This will allow a usage of per schema/table partitioner
instead of a single global partitioner everywhere.
Initially schema::get_partitioner will call
dht::global_partitioner. After all the calls
to dht::global_partitioner are switched to
schema::get_partitioner, the ability to set per schema
partitioner will be implemented.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
clustering_interval_set is a rarely used class, but one that requires
boost/icl, which is quite heavyweight. To speed up compilation, move
it to its own header and sprinkle #includes where needed.
Tests: unit (dev)
Message-Id: <20200214190507.1137532-1-avi@scylladb.com>
Merged patch series from Avi Kivity:
token_metadata is a heavyweight class with heavyweight includes
(boost/icl) it is a good candidate for the pimpl pattern, which
this series implements.
Tests: unit (dev)
https://github.com/avikivity/scylla token_metadata-pimplification/v1
Avi Kivity (6):
locator: token_metadata: use non-deduced return type for ring_range()
locator: token_metadata: pimplify
locator: token_metadata: make token_metadata_impl::tokens_iterator a
non-nested class
locator: token_metadata: pimplify tokens_iterator
locator: token_metadata: move implementation classes to .cc
locator: token_metadata: remove unused include "query-request.hh"
locator/token_metadata.hh | 783 +---------------
locator/token_metadata.cc | 1338 ++++++++++++++++++++++++++-
test/boost/sstable_datafile_test.cc | 1 +
3 files changed, 1332 insertions(+), 790 deletions(-)
Message-Id: <20200214184954.1130194-1-avi@scylladb.com>
Merged patch series from Piotr Sarna:
This miniseries implements graceful shutdown for alternator
by introducing two mechanisms:
- refusing to accept new requests during shutdown
by stopping the HTTP/HTTPS server(s)
- guarding pending requests with a gate, so that
when alternator server is stopped, no in-flight
alternator requests are being processed
Fixes#5781
Tests: manual(stopping Scylla in the middle of alternator-test
multiple times, used to crash every time
with local_is_initialized() assertion)
Piotr Sarna (3):
alternator: implement stopping alternator server
alternator: guard pending alternator requests with a gate
alternator: guard alternator-specific handlers with a gate
alternator/server.cc | 64 +++++++++++++++++++++++++++++++++++---------
alternator/server.hh | 4 +++
main.cc | 11 ++++++--
3 files changed, 64 insertions(+), 15 deletions(-)
Convert some more helper functions to accept const reference to
column_specification and column_identifier instead of shared_ptr.
Signed-off-by: Pavel Solodovnikov <pa.solodovnikov@scylladb.com>
This patch continues the effort of reducing shared_ptr's count
in the different APIs throughout the cql3 code tree.
These functions now pass cref to column_specification instead of
shared_ptr:
* multiple variants of `validate_assignable_to`
* sets::value_spec_of
* lists::value_spec_of
* lists::index_spec_of
* lists::uuid_index_spec_of
* tuples::component_spec_of
* user_types::field_spec_of
These functions don't pass the shared_ptr around down the call
hierarchy, also obviously assuming that the column_specification
passed is always non-null.
So it's safe to assume that they don't borrow the ownership of
the pointer or knowingly prolongate lifetime of the object
pointed by.
Signed-off-by: Pavel Solodovnikov <pa.solodovnikov@scylladb.com>
The following functions now accept const reference to
column_specification instead of shared_ptr:
* lists::index_spec_of
* lists::value_spec_of
* lists::uuid_index_spec_of
* sets::value_spec_of
Changed maps::value_spec_of and maps::key_spec_of signatures
to accept const ref instead of non-const ref to
column_specification.
Signed-off-by: Pavel Solodovnikov <pa.solodovnikov@scylladb.com>
Change the way `service::pager::paging_state` is passed around
from `shared_ptr` to `lw_shared_ptr`. It's safe since
`paging_state` is final.
Tests: unit(dev, debug)
Signed-off-by: Pavel Solodovnikov <pa.solodovnikov@scylladb.com>
Helper function for registering metrics for an endpoint,
register_metrics_for(ep) depends on an external state to be updated.
It checks if given metrics are added to a map, and if not, the metrics
are registered, but the mentioned map is expected to be updated
by the caller (e.g. get_ep_stat). This behaviour is error-prone,
because calling this function twice will result in an exception,
since registering metrics twice is not allowed.
Refs #5697
Message-Id: <5a9ddccf52861749dbda4204b5d098cc77bc51eb.1581855769.git.sarna@scylladb.com>
Alternator is able to serve more requests than its database operations,
e.g. a health check and returning the list of its nodes.
These operation, for safety, are no also guarded by the pending
requests gate.
In order to make sure that pending alternator requests are processed
during shutdown, a gate for each shard is introduced. On shutdown,
each gate will be closed and all in-progress operations will be waited upon.
Fixes#5781
Stopping Scylla with alternator enabled is not clean,
because the server does not stop accepting requests
on shutdown, which leads to use-after-free events.
The first step towards a cleaner solution is to implement
alternator_server::stop(), which stops the HTTP/HTTPS servers.
Refs #5781
The instructions in docs/alternator/getting-started.md on how to run
Alternator with docker are outdated and confusing, so this patch updates
them.
First, the instructions recommended the scylladb/scylla-nightly:alternator
tag, but we only ever created this tag once, and never updated it. Since
then, Alternator has been constantly improving, and we've caught up on
a lot of features, and people who want to test or evaluate Alternator
will most likely want to run the latest nightly build, with all the latest
Alternator features. So we update the instructions to request the latest
nightly build - and mention the need to explictly do "docker pull" (without
this step, you can find yourself running an antique nightly build, which
you downloaded months ago!). This instruction can be revisited once
Alternator is GAed and not improving quickly and we can then recommend to
run the latest stable Scylla - but I think we're not there yet.
Second, in recent builds, Alternator requires that the LWT feature is
enabled, and since LWT is still experimental, this means that one needs
to add "--experimental 1" to the "docker run" command. Without it, the
command line in getting-started.md will refuse to boot, complaining that
Alternator was enabled but LWT wasn't. So this patch adds the
"--experimental 1" in the relevant places in the text. Again, this
instruction can and should be revisited once LWT goes out of experimental
mode.
Fixes#5813
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200216113601.9535-1-nyh@scylladb.com>
This patch adds to Alternator's Query operation full support for the
KeyConditionExpression parameter - a newer syntax for specifying which
partition and which sort-key range are to be queried. The older syntax
for the same thing, "KeyConditions", was already supported by Alternator.
The patch also includes additional test cases for more corner cases
discovered during the development. After this patch, all 47 test cases
in test_key_condition_expression.py pass on Alternator (and, of course,
also on DynamoDB).
One interesting thing to note about this patch is that it does *not*
include a new parser for the KeyConditionExpression syntax. It turns out
that we need - to be fully compatible with DynamoDB - to use the
already existing parser for *ConditionExpression* syntax, and then forbid
certain things not allowed in KeyConditionExpression (you can see a lot
of examples in code comments and in the tests included in this patch).
Most importantly, allowing the full ConditionExpression syntax also
means we allow completely useless parentheses on key conditions, e.g.,
'((p=:p) AND (c=:c))'. While the KeyConditionExpression documentation
doesn't mention allowing these parentheses, DynamoDB does support them -
and it turns out that boto3 uses them when you use its condition builders,
as we do in one test case (test_query_key_condition_expression).
Fixes#5037.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200213192509.32685-4-nyh@scylladb.com>
We had a get_key_from_typed_value() utility function to decode a
JSON-encoded value with a known type (the JSON encoding is a map whose
key is the type, the value always a string because all possible key types -
string, bytes and number, are encoded as strings).
However, the function was less useful than it could have been - it was
missing one check for a malformed object (a check which only appeared in
one of its callers), it unnecessarily received the column's expected type
(all the callers passed it the given key column's type).
The cleaned up function will be more useful for the following patch
to support KeyConditionExpression, which wants to reuse it.
While at it, this patch also uses rjson::to_string_view(it->value)
instead of the less correct it->value.GetString() (the latter relies
on null-termination, which is actually true for JSON strings, but there
is no reason to rely on it).
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200213192509.32685-3-nyh@scylladb.com>
conditions.cc contains a useful utility function for extracting (without
copying) a string_view from a rjson::value which is known to contain a
string. This function will be useful in more Alternator code, so let's
extract it to rjson.hh, with the name rjson::to_string_view()
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200213192509.32685-2-nyh@scylladb.com>
The table::flush_streaming_mutations is used in the days when streaming
data goes to memtable. After switching to the new streaming, data goes
to sstables directly in streaming, so the sstables generated in
table::flush_streaming_mutations will be empty.
It is unnecessary to invalidate the cache if no sstables are added. To
avoid unnecessary cache invalidating which pokes hole in the cache, skip
calling _cache.invalidate() if the sstables is empty.
The steps are:
- STREAM_MUTATION_DONE verb is sent when streaming is done with old or
new streaming
- table::flush_streaming_mutations is called in the verb handler
- cache is invalidated for the streaming ranges
In summary, this patch will avoid a lot of cache invalidation for
streaming.
Backports: 3.0 3.1 3.2
Fixes: #5769
* seastar 6d2ed8cdc...c7c249f67 (3):
> reactor: fix issue with hrtimer completions being lost
> Merge "refactor network and storage I/O handling in backend code" from Glauber
> reactor: don't call set_heap_profiling_enable() if not needed
Running cdc_test binary fails with a segmentation fault
when run with --smp 1, because test_cdc_across_shards
assumes shard count to be >=2. This patch skips the test case
when run with a single shard and produces a log warning.
Message-Id: <9b00537db9419d8b7c545ce0c3b05b8285351e7d.1581600854.git.sarna@scylladb.com>
There is a case in current PAXOS implementation where timeout is
returned because the code cannot guaranty whether the value is accepted
or not in case of a contention. The counter will help to correlate this
condition with failed requests.
Message-Id: <20200211160653.30317-2-gleb@scylladb.com>
This series implements keyspace-per-table approach for Alternator.
The changes are as follows:
- when a table is created, its keyspace is created first
- after table deletion, its keyspace is deleted as well;
works with views too, since these must be deleted
before the base table is dropped
- instead of SimpleStrategy, network topology is used
Keyspaces are created with a prefix not legal from CQL - 'a#'.
I validated that even though not reachable via CQL, keyspaces
created with # character work well and produce correct directories,
restarts work flawlessly too.
Fixes#5611
Refs #5596
Tests: alternator(local, remote)
Piotr Sarna (3):
alternator: switch to keyspace-per-table approach
alternator: move to NetworkTopologyStrategy
alternator-test: add test for recreating a table
Cells in CDC logs used to be created while completely neglecting TTLs
(the TTLs from cdc = {...'ttl':600}). This patch adds TTLs to all cells;
there are no row markers, so wee need not set TTL there.
Fixes#5688
* jul-stas/5688-set-ttl-in-cdc-log-table:
tests/cdc: added test for TTL on log table cells
cdc: set TTLs on CDC log cells
Merged patch series from Piotr Sarna:
This series addresses and fixes#5758 by providing less terse
configuration for write isolation. Before the patch,
suggested values for alternator write isolation policies was one of
'f', 'a', 'o', 'u', which are not really descriptive.
The code actually checks only the first character from the tag value,
but now the input is validated to allow only specific, expressive values:
* 'a', 'always', 'always_use_lwt' - always use LWT
* 'o', 'only_rmw_uses_lwt' - use LWT only for requests that require
read-before-write
* 'f', 'forbid', 'forbid_rmw' - forbid statements that need read-before-
write. Using such statements
(e.g. UpdateItem with ConditionExpression) will result in an error
* 'u', 'unsafe', 'unsafe_rmw' - (unsafe) perform read-modify-write without
any consistency guarantees
Using other values will result in an error.
This series comes with tests and docs updates.
Fixes#5758
Tests: alternator-test(local,remote)
Piotr Sarna (5):
alternator: move rmw_operation to a header
alternator: add validating write_isolation tag
alternator-test: add test for write isolation tag
alternator-test: mark write isolation tests scylla_only
docs: update write isolation documentation
alternator-test/test_condition_expression.py | 10 +-
alternator-test/test_tag.py | 9 +
alternator/executor.cc | 163 +++++++------------
alternator/rmw_operation.hh | 99 +++++++++++
docs/alternator/alternator.md | 8 +-
5 files changed, 173 insertions(+), 116 deletions(-)
Empty values (zero-sized string in serialized form) were not
handled properly in serialize routines for floating types and
uuids, which led to runtime exceptions and failing tests as
described in https://github.com/scylladb/scylla/issues/5782.
Also fix validation visitor to handle empty values properly.
There already was the code in place that took into
consideration zero-sized values. But it was trying to read
some bytes regardless of that (e.g. for timeuuid values),
even if there is none to read.
Tests: unit(dev, debug)
Fixes: #5782
Signed-off-by: Pavel Solodovnikov <pa.solodovnikov@scylladb.com>
Message-Id: <20200213130021.31598-1-pa.solodovnikov@scylladb.com>
On start, test.py cleans up testlog directory.
The cleanup file search pattern was shell style, not python
glob style, which led to .log files being left around
between runs.
Message-Id: <20200212204047.22398-9-kostja@scylladb.com>
Always open the log file first, this will be necessary to append
output to it in case the test timed out or didn't start.
Message-Id: <20200212204047.22398-5-kostja@scylladb.com>
To be able to easily see what tests have failed as they run,
print failed tests on their own line even if --verbose switch is off.
Message-Id: <20200212204047.22398-4-kostja@scylladb.com>
test.py used a functional programming cookie pattern to
carry tabular console output state, convert this cookie
to an object.
In order to make console output more pretty we'll need to
add more state to it, and keeping this state in a tuple
would be too messy.
Message-Id: <20200212204047.22398-3-kostja@scylladb.com>
In order to pimplify token_metadata_impl::tokens_iterator, we must make it
a non-nested class, since eventually token_metadata_impl will be an incomplete
class for users and nested classes cannot be forward declared. So this patch
makes it a non-nested class. Two inline functions that referred to it were
moved out of class scope so they can see the definition.
No functional changes.
token_metadata is a heavyweight class, with heavyweight include
dependencies (icl, which has tens of thousands of lines in headers),
heavyweight methods, but it rarely used. So it is a classic candidate
for pimmplication.
This patch splits off the implementation into token_metadata_impl
and leaves token_metadata as a forwarding class. Actual movement
of the code is left to a later patch to ease review.
Notes:
- some constructors were made public due to limitations of std::make_unique
- a few token_metadata methods pass *this along to external functions, so we
now pass the holder object as "unpimplified_this" to support this.
Now that we have the necessary infrastructure to do actual scrubbing,
don't rely on `upgrade_sstables()` anymore behind the scenes, instead do
an actual scrub.
Also, use the skip-corrupted flag.
A specialized compaction subclass for executing a scrub compaction.
`scrub_compaction` supplies a specialized reader which will validate its
input and stop on the first error. If it is configured with
`skip_corrupted`, it will instead skip bad data, logging it.
The low-level validator allows fine-grained validation of different
aspects of monotonicity of a fragment stream. It doesn't do any error
handling. Since different aspects can be validated with different
functions, this allows callers to understand what exactly is invalid.
The high-level API is the previous fragment filter one. This is now
built on the low-level API.
This division allows for advanced use cases where the user of the
validator wants to do all error handling and wants to decide exactly
what monotonicity to validate. The motivating use-case is scrubbing
compaction, added in the next patches.
Write isolation tags now accept only a small set of valid values.
The test case ensures that all valid values are accepted
and that invalid values return an error.
In order to prevent users from using incorrect write isolation
configuration, a set of allowed values is introduced.
When tagging a resource (which is considered rare), a tag
will only be allowed if it belongs to the allowed set.
rmw_operation is a class with a public interface, including
a write_isolation enum and a fixed tag name for its configuration.
For convenience, it's moved to a header file, so that code
from executor.cc can use the definitions regardless of their
position in the source file - it prevents reordering functions
just to make sure that rmw_operation is defined before a function
that uses its attributes.
The first iteration of keyspace-per-table approach for alternator
revealed an issue with recreating a table after deleting it.
This test case was used as a regression check.
Imstead of SimpleStrategy, NetworkTopologyStrategy is used
for setting up the replication configuration for alternator tables.
Replication factor 3 is used along with a local datacenter,
unless alternator discovers that it's running on a test cluster with
less than 3 nodes - then, RF is reduced accordingly and emits a warning,
which was also the case for SimpleStrategy.
Instead of a monolith alternator keyspace, each table creates its own
keyspace, named in the following pattern: `a#TABLE_NAME`.
The `a#` prefix contains an illegal CQL character in order to ensure
that these keyspaces are never created via CQL.
raw_cql_statement is a member of prepared_statement which
is not set in its constructor because prepared_statement
constructor has too many call sites inside cql_statement
hierarchy.
cql_statement and prepared_statement dependency form a
cycle and long term it obviously should be fixed.
As a quick fix to query processor tracing, consistently
assign raw_cql_statement in all prepared_statement
usage sites.
The update generation path must track and apply all tombstones,
both from the existing base row (if read-before-write was needed)
and for the new row. One such path contained an error, because
it assumed that if the existing row is empty, then the update
can be simply generated from the new row. However, lack of the
existing row can also be the result of a partition/range tombstone.
If that's the case, it needs to be applied, because it's entirely
possible that this partition row also hides the new row.
Without taking the partition tombstone into account, creating
a future tombstone and inserting an out-of-order write before it
in the base table can result in ghost rows in the view table.
This patch comes with a test which was proven to fail before the
changes.
Branches 3.1,3.2,3.3
Fixes#5793
Tests: unit(dev)
Message-Id: <8d3b2abad31572668693ab585f37f4af5bb7577a.1581525398.git.sarna@scylladb.com>
These series were born when working on debugging (missing)
query processor trace-level logging, and trying to identify
all entry points into parsed_statement::prepare().
Unfortunately I was unable to easily merge prepared_statement
and cql_statement objects.
Rationale for individual patches is given in commit comments.
While Alternator doesn't yet support creating a table with streams
(i.e., CDC) turned on, we should only failed the creation if streams
were really turned on. If the StreamSpecification option exists, but
does *not* ask to turn on streams, we should not fail the creation -
and this patch fixes this.
This patch also adds two tests - one where StreamSpecification is
passed but does not ask to turn on streams (so table creation should
succeed), and another test which explicitly requests to turn on
streams. The second test still xfails on Alternator, and should continue
to do so until we implement streams (we do *not* want to silently
ignore a request to turn on streams).
Fixes#5796
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200212100546.16337-1-nyh@scylladb.com>
boost/regex has huge header dependencies amounting to tens of thousands of
lines. This are now replicated in 167 translation units.
This patch converts like_matcher to use the pointer-to-implementation
idiom, which reduces the number of translations including boost/regex
to 28.
Since regular expressions are relatively expensive, and like_matcher is
relatively rare, the extra memory usage and run time will be
negligible.
Message-Id: <20200211170152.809554-1-avi@scylladb.com>
cql3 has cql_statement, parsed_statement and prepared_statement
classes, which, largely, stand for the same thing. prepared was
an alias for prepared_statement which only required an extra
tag jump in IDE and carried no meaning.
All internal execution always uses query text as a key in the
cache of internal prepared statements. There is no need
to publish API for executing an internal prepared statement object.
The folded execute_internal() calls an internal prepare() and then
internal execute().
execute_internal(cache=true) does exactly that.
Rename an overloaded function process() to execute_direct().
Execute direct is a common term for executing a statement
that was not previously prepared. See, for example
SQLExecuteDirect in ODBC/SQL CLI specification,
mysql_stmt_execute_direct() in MySQL C API or EXECUTE DIRECT
in Postgres XC.
The list_snapshot API, uses http stream to stream the result to the
caller.
It needs to keep all objects and stream alive until the stream is closed.
This patch adds do_with to hold these objects during the lifetime of the
function.
Fixes#5752
Refs #4924
truncate_exception should, like its origin counterpart, set
error code to TRUNCATE_ERROR, not PROTOCOL_ERROR.
tests: unit + partial dtest
Message-Id: <20200212100920.14478-1-calle@scylladb.com>
* seastar 1c7bccc500...6d2ed8cdc6 (11):
> connect_test: keep socket alive until the end.
> Merge "Add timeout to smp::submit_to() and friends" from Botond
> reactor: use reference to addrlen in accept
> tests: stall_detector_test: use same clock as in test as in the detector
> reactor: fallback to epoll backend when fs.aio-max-nr is too small
> util: move read_sys_file_as() from iotune to seastar header, rename read_first_line_as()
> core/resources: fix cpuset error
> distributed_tests: increase sleep time further
> core: thread: Fix compilation error in comment
> reactor: specialize the pollable_fd_state
> build: Use with -fstack-clash-protection when using guard pages
"
Lots of code needs storage_service just to get token_metadata from.
This creates unwanted dependency loops and increases the use of
global storage_service instance.
This set keeps the sharded<locator::token_metadata> on main's stack
and carries the references where needed. This removes the dependency
on storage_service from:
- storage_proxy
- gossiper
- redis
- batchlog manager
and makes the database only need it for sstables_format (will fix
in one of the next sets).
Also, this set is the prerequisite for controlling the copying of
token_metadata instances (spotted two occurrences in bootstrap
code).
Tests: unit(dev), manual start-stop
"
* 'br-token-metadata-standalone-2' of https://github.com/xemul/scylla:
api: Keep and use reference on token_metadata
redis: Use proxy token_metadata
gossiper: Keep needed for failure_detection values on board
database: Use own token_metadata
batchlog: Use token_metadata from proxy
proxy: Use own token_metadata
gossiper: Use own token_metadata
tokens: Switch into standalone sharded instance
batchlog: Use in-config ring-delay
database: Have it in size_estimate_virtual_reader
storage_proxy: Pass token_metadata in some static helpers
storage_service: Move get_local_tokens wrapper
size_estimates_virtual_reader: Make get_local_ranges static
migration_manager: Refactor validation of new/updating ksm
storage_service: Tiny cleanup of excessive self-reference
Allow the thrower to communicate that it doesn't want the compaction to
be retried later. I know, using exceptions for control flow is *very*
bad, but this is the existing mechanism to stop a compaction and I don't
want to invent a new one for this.
Also massage the error messages a bit to take the value of this flag
into consideration.
"
client_state is used simultaneously by many requests running in parallel
while tracing state pointer is per request. Both those facts do not sit
well together and as a result sometimes tracing state is being overwritten
while still been used by active request which may cause incorrect trace
or even a crash.
"
Fixes#5700.
* 'gleb/tracing_fix_v1' of github.com:scylladb/seastar-dev:
client_state: drop the pointer to a tracing state from client_state
transport: pass tracing state explicitly instead of relying on it been in the client_state
alternator: pass tracing state explicitly instead of relying on it been in the client_state
Currently the call chain for a cleanup collection looks like this:
compaction_manager::perform_cleanup()
compaction_manager::rewrite_sstables()
table::cleanup_sstables()
...
`perform_cleanup()` is essentially empty, immediately deferring to
`rewrite_sstables()`. Cleanup related logic is scattered between the
latter two methods on the call chain. These methods however recently
started serving as generic methods for compactions that want to
rewrite each sstable one-by-one, collecting cleanup related ifs in
various places.
The reason is historic, we first had cleanup, then bolted others on top,
trying to share the underlying code as much as possible.
It is time this is cleaned up (pun intended). Make `perform_cleanup()`
the place where all cleanup related logic is, with the rest of the stack
made truly generic.
Instead of the restrictive `cleanup` boolean flag, which allows for choosing
between only two compaction types, use `compaction_options`, which in
addition to allowing any number of compaction types to be selected,
also allows seamlessly passing specific options to them.
Currently the compaction API is quite restrictive. It offers a generic
`compact_sstables()` and `reshard_sstables()` methods. The former is the
one used by all but resharding, however it only really supports two
modes: regular and cleanup. The latter is supported by a semi-hidden
`cleanup` flag in `compaction_description`. Actually there are two more
compaction types already which are piggy-backed on cleanup: upgrade and
scrub. The upper layers distinguish between actual cleanup and "fake"
cleanup by a `is_actual_cleanup` flag. The latter two "fake" cleanup
compactions cannot be distinguished even by the upper layers.
This is terribly confusing and hard to follow, in addition to being
restrictive.
This worked so far, because upgrade is served quite well by the cleanup
compaction type, turning off certain preparations by the above mentioned
`is_actual_cleanup` flag. Scrub is barely implemented and just an
upgrade behind the scenes.
This situation is however preventing really specializing each
compaction. Enter `compaction_options`. This variant in disguise is
designed to allow passing specific option to each compaction type, and
doubles as an enum allowing more than two low level compaction type.
This patch only adds the option class itself, propagating and handling
it will be done by the next patches.
Although we currently do support upgrade compaction, it is piggy-backed
on top of cleanup compaction. This is soon going to change, so in
preparation to that, add an `Upgrade` member to the `compaction_type`
enum.
Currently the cleanup call is short circuited if it is determined that
cleanup is not needed for the sstable to-be-cleaned-up. This is
undesired because actually not just cleanup uses this routine to rewrite
sstables, sstable-upgrade and sstable-scrub also uses it, and they want
to go on with the cleanup compaction sstables even if all data in it
belongs to the current node.
Fix: #5699
Merged pull request https://github.com/scylladb/scylla/pull/5755 from
Avi Kivity:
This series removes some #include dependencies around cql3. It results in
30k line (6.6%) reduction in the preprocessed size of database.i, mainly
due to elimination of boost::regex (which was brought in in turn by
like_matcher). This should result in fewer and faster recompiles.
commits:
tracing: remove #include of modification_statement.hh from table_helper
cql3: selection: remove now-unneeded include of statement_restrictions.hh
cql3: deinline result_set_builder::restrictions_filter constructor
view_info: remove include of select_statement.hh
cql3: selection: remove unnecessary include of selector_factories
cql3: query_processor: reduce #includes
Cells in CDC logs used to be created while completely neglecting
TTLs (the TTLs from `cdc = {...'ttl':600}`). This patch adds TTLs
to all cells; there are no row markers, so wee need not set TTL
there.
Fixes#5688
One of the logging options for Scylla is syslog, this method,
until today wasn't supported in the docker images that are
created with the Dockerfile in the repo.
This commit add rsyslog installation, configuration and
setup for Docker.
Tests: built and ran the docker and validated the existance
of the /dev/log socket.
Signed-off-by: Eliran Sinvani <eliransin@scylladb.com>
Message-Id: <20200210112448.210169-1-eliransin@scylladb.com>
This reverts commit ca55c6c15f.
Triggers the broken promise exception on aborted stop.
If the feature service is stopped without enabling some features,
the later may end up with "broken promise" exception on futures
attached to the _pr promise.
This assert, added by 060e3f8 is supposed to make sure the invariant of
the append() is respected, in order to prevent building an invalid row.
The assert however proved to be too harsh, as it converts any bug
causing out-of-order clustering rows into cluster unavailability.
Downgrade it to on_internal_error(). This will still prevent corrupt
data from spreading in the cluster, without the unavailability caused by
the assert.
Fixes: #5786
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200211083829.915031-1-bdenes@scylladb.com>
Setting TTL = -1 in cdc_options prevents any writes to CDC log.
But enabling CDC and having unwritable log table makes no sense.
Notably, normal writes USING TTL -1 are forbidden. This patch does
the same to TTLs in CDC options.
Fixes#5747
* jul-stas/5747-cdc-disallow-negative-ttl:
tests/cdc: added test for exception when TTL < 0
cdc: disallow negative TTL values in CDC
When registering callbacks for row-level repair verbs the
sched groups is assigned automatically with the help of
messaging_service::scheduling_group_for_verb. Thus the
the lambda will be called in the needed sched group, no
need for manual switch.
This removes the last occurence of global storage_service
usage from row-level repair.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The do_repair_start() emulates db.invoke_on_all and can
re-use the db.local() inside without the need to call for
global storage_service instance.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The caller of repair_writer.create_writer al ready
have the needed reference on database, no need to
get it from global storage_service instance.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
This kills the second global reference on storage_service from
batchlog code and breaks the dependency loop between these two.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Way too many places in code needs storage_service just for token_metadata.
These references increase the amount of get(_local)?_storage_service()
calls and create loops in components dependencies. Keep the token_metadata
separately from storage_service and pass instances' references where
needed (for now -- only into the storage_service itself).
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
This wrapper just makes sure the system_keyspace::get_saved_tokens
reports non empty result. Move them close together.
As a side effect -- get rid of penultimate global storage_service
reference from size_estimates_virtual_reader (the last one will
be removed soon).
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
This patch introduces following modifications:
Disallows enabling cdc for table X when X_scylla_cdc_log already exists,
Restricts DROP permissions for X_scylla_cdc_log tables,
Restricts ALTER and DROP permissions for cdc_description and cdc_topology_description,
Disallows cdc option when creating materialized views.
Refs #4991.
Tests: unit(dev).
* piodul/4991-permissions-for-cdc-tables:
cdc: disallow CDC options for materialized views
cdc: restrict permissions on cdc_(topology_)description
cdc: restrict permissions on _scylla_cdc_log tables
cdc: refuse to enable cdc when table _scylla_cdc_log exists
we have compaction_manager.compactions metric for the number of active tasks,
but they don't account for tasks blocked waiting for an opportunity to run,
and they're the problematic ones.
Fixes#5254.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20200210131929.30981-1-raphaelsc@scylladb.com>
There's the call of the same name in storage_service, so
make this one explicitly static for better readability.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The goal is to have token_metadata reference intide the
keyspace_metadata.validate method. This can be acheived
by doing the validation through the database reference
which is "at hands" in migration_manager.
While at it, merge the validation with exists/not-exists
checks done in the same places.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
client_state is shared between requests and tracing state is per
request. It is not safe to use the former as a container for the later
since a state can be overwritten prematurely by subsequent requests.
Since dpkg does not re-install conffiles when it removed by user,
currently we are missing dependencies.conf and sysconfdir.conf on rollback.
To prevent this, we need to stop running
'rm -rf /etc/systemd/system/scylla-server.service.d/' on 'remove'.
Fixes#5734
Multiple requests can use the same client_state simultaneously, so it is
not safe to use it as a container for a tracing state which is per request.
Currently next request may overwrite tracing state for previous one
causing, in a best case, wrong trace to be taken or crash if overwritten
pointer is freed prematurely.
Fixes#5700
Multiple requests can use the same client_state simultaneously, so it is
not safe to use it as a container for a tracing state which is per
request. This is not yet an issue for the alternator since it creates
new client_state object for each request, but first of all it should not
and second trace state will be dropped from the client_state, by later
patch.
In bash, 'A || B && C' will be problem because when A is true, then it will be
evaluates C, since && and || have the same precedence.
To avoid the issue we need make B && C in one statement.
Fixes#5764
"
There's a lot of code around that needs storage service purely to
get the specific feature value (cluster_supports_<something> calls).
This creates several circular dependencies, e.g. storage_service <->
migration_manager one and database <-> storage_servuce. Also features
sit on storage_service, but register themselfs on the feature_service
and the former subscribes on them back which also looks strange.
I propose to keep all the features on feature_service, this keeps the
latter intependent from other components, makes it possible to break
one of the mentioned circle dependencyand heavily relax the other.
Also the set helps us fighting the globals and, after it, the
feature_service can be safely stopped at the very last moment.
Tests: unit(dev), manual debug build start-stop
"
* 'br-features-to-service-5' of https://github.com/xemul/scylla:
gossiper: Avoid string merge-split for nothing
features: Stop on shutdown
storage_service: Remove helpers
storage_service: Prepare to switch from on-board feature helpers
cql3: Check feature in .validate
database: Use feature service
storage_proxy: Use feature service
migration_manager: Use feature service
start: Pass needed feature as argument into migrate_truncation_records
features: Unfriend storage_service
features: Simplify feature registration
features: Introduce known_feature_set
features: Move disabled features set from storage_service
features: Move schema_features helper
features: Move all features from storage_service to feature_service
storage_service: Use feature_config from _feature_service
features: Add feature_config
storage_service: Kill set_disabled_features
gms: Move features stuff into own .cc file
migration_manager: Move some fns into class
Allows caller to check/wait for a given user keyspace to finish
populating on boot.
Can be called at any time, though if called before population
starts, it will wait until it either starts and we can determine
that the keyspace does not need populating, or population finishes.
tests: unit
Message-Id: <20200203151712.10003-1-calle@scylladb.com>
The disk-error-handler is purely auxiliary thing that helps
propagating IO errors to the rest of the code. It well
deserves not sitting in the root namespace.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20200207112443.18475-1-xemul@scylladb.com>
This patch affects the LWT queries with IF conditions of the
following form: `IF col in :value`, i.e. if the parameter
marker is used.
When executing a prepared query with a bound value
of `(None,)` (tuple with null, example for Python driver), it is
serialized not as NULL but as "empty" value (serialization
format differs in each case).
Therefore, Scylla deserializes the parameters in the request as
empty `data_value` instances, which are, in turn, translated
to non-empty `bytes_opt` with empty byte-string value later.
Account for this case too in the CAS condition evaluation code.
Example of a problem this patch aims to fix:
Suppose we have a table `tbl` with a boolean field `test` and
INSERT a row with NULL value for the `test` column.
Then the following update query fails to apply due to the
error in IF condition evaluation code (assume `v=(null)`):
`UPDATE tbl SET test=false WHERE key=0 IF test IN :v`
returns false in `[applied]` column, but is expected to succeed.
Tests: unit(debug, dev), dtest(prepared stmt LWT tests at https://github.com/scylladb/scylla-dtest/pull/1286)
Fixes: #5710
Signed-off-by: Pavel Solodovnikov <pa.solodovnikov@scylladb.com>
Message-Id: <20200205102039.35851-1-pa.solodovnikov@scylladb.com>
query_processor is a central class, so reducing its includes
can reduce dependencies treewite. This patch removes includes
for parsed_statement, cf_statement, and untyped_result_set and
fixes up the rest of the tree to include what it lacks as a result
of these removals.
This patch adds comprehensive tests for KeyConditionExpression, the newer
DynamoDB API syntax for specifying the item range which is requested from
a Query (this syntax replaced the older KeyConditions syntax, which
Alternator already supports).
Before this patch, we had only a small test for KeyConditionExpression
in test_query.py. This patch replaces it by a large number of small
tests, testing the many sub-features of KeyConditionExpression -
its different operators, sort-key types, different failure modes, etc.
As usual, because we haven't yet implemented this feature in Alternator
(see issue #5037), all these tests pass on AWS, but xfail on Alternator.
Despite the new test file containing about 40 small tests, it finishes
very quickly because we use pytest's fixture feature to allow small
read-only tests to perform a query to a partition that is only written
once for many tests. So these small tests become extremely fast, and
there is no downside to having many small tests instead of lumping them
into fewer large tests checking many things.
Refs #5037.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200207134159.3283-1-nyh@scylladb.com>
This patch changes the way TTL is stored in the CDC log table. Instead
of including TTL of cell `X` in the third element of the tuple in column
`_X`, TTL is written to the previously unused column `ttl`. This is done
for cosmetic purposes.
This implementation works under assumption that there will be only one
TTL included in a mutation coming from a CQL write. This might not be
the case when writing a batch that modifies the same row twice, e.g.:
```
BATCH
INSERT INTO ks.t (pk, ck, v1) VALUES (1,2,3) USING TTL 10;
INSERT INTO ks.t (pk, ck, v2) VALUES (1,2,3) USING TTL 20;
END BATCH
```
In this case, this implementation will choose only one TTL value to be
written in the CDC log:
```
... | batch_seq_no | _ck | _pk | _v1 | _v2 | operation | ttl
...-+--------------+-----+-----+--------+--------+-----------+-----
... | 0 | 2 | 1 | (0, 3) | (0, 3) | 1 | 20
```
This behavior might be changed as a part of issue #5719, which considers
splitting a batch write mutation when it contains multiple writes to the
same row.
Refs #5689
Tests: unit(dev)
The view_update_generator acceps (and keeps) database and storage_proxy,
the latter is only needed to initialize the view_updating_consumer which,
in turn, only needs it to get database from (to find column family).
This can be relaxed by providing the database from _generator to _consumer
directly, without using the storage_proxy in between.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20200207112427.18419-1-xemul@scylladb.com>
Two places in dht code have token_metadata _value_ arguments, but only read
tokens from them. Optimize it a bit by turning values into const references.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20200207112408.18352-1-xemul@scylladb.com>
Truncation time is used on each LWT request now, so reading it from
the table is too heave operation to be on a fast path. It also requires
jumping to a shard that contains corresponding data. This patch caches
the data on the table object of each shard for easy access. The cache is
initialized during boot from system.truncated table and updated on each
truncation operation.
Message-Id: <20200206163838.5220-2-gleb@scylladb.com>
We made --localrpm option to automatically build rpms from sourcecode,
but we actually use the option to produce AMI using prebuilt rpm on our
CI.
To simplified the script, and to prevent accsidently start rpm build
in the script, drop rpm build part.
get_snapshot should use http stream to reduce memory allocation and
stalls.
This patch change the implementation so it would stream each of the
snapshot object instead of creating a single response and return it.
Fixes#5468
Depends on scylladb/seastar#723
On our build system we tries to build relocatable package multiple times on
same revision of the repository, it executes ./SCYLLA-VERSION-GEN for each time.
When the build job invoked at midnight and it did not finished until 12:00AM,
first build and last build has different SCYLLA-RELEASE-FILE, since it contains
current date.
To prevent it, skip updating SCYLLA-*-FILE when git hash unchanged.
Fixesscylladb/scylla-pkg#826
Currently reader_concurrency_semaphore::signal() can fail. This is
dangerous in two ways:
* It is called from constructors, so the exception can bring down the
node. This will convert an `std::bad_alloc` to a crash.
* Reads in the queue will be blocked until they either time-out, or
another `signal()` succeeds.
To solve this, wrap the `reader_permit` constructor, the only code that
can throw, with try-catch and forward the exception to the reader
admission promise. In practice this will result in the flushing of the
reader queue, when we fail to admit a read.
Fixes#5741
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200206154238.707031-1-bdenes@scylladb.com>
This patch is a bag of fixes/cleanups that were omitted from the reader
memory tracking series due to contributor error. It contains the
following changes:
* Get rid of unused `increase()` and `decrease()` methods.
* Make all constructors and assignment operators `noexcept`.
* Make move assignment operator safe w.r.t. self assignment.
* `reset()`: consume the new amount before releasing the old amount,
to prevent a transient window where new readers might be admitted.
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200206143007.633069-1-bdenes@scylladb.com>
Most of the time schema does not have to be copied and sometimes it's not even used.
tests: unit(dev)
Closes#5739
* hawk/remove_schema_copies:
multishard_mutation_query_test: stop capturing unused schema
index_reader: avoid copying schema to lambda
Merged patch series from Piotr Sarna:
This series adds a way to confgure alternator write isolation policy
per-table with the use of tags.
Instead of hardcoded LWT_ALWAYS policy, it can now be set by tagging
a table with a tag of the following form:
{
'Key': 'system:write_isolation',
'Value': X
},
where X is one of the following implemented levels:
* 'f' - forbid RMW
* 'a' - always enforce RMW
* 'o' - only RMW writes will go through LWT
* 'u' - unsafe RMW (to be deprecated/eradicated)
By default, if no tag is found, alternator falls back to always applying
LWT to writes.
This series also contains fixes to the tagging interface - some minor
issue came up while implementing write isolation config on top of tags.
test: alternator-test(local,remote)
Piotr Sarna (6):
alternator: return tags for a table via const reference
alternator: fix overwriting tags
alternator: make _write_isolation a protected attribute
alternator: add configuring write isolation policy via tags
alternator-test: add testing different write isolation policies
docs: update alternator on write isolation
alternator-test/test_condition_expression.py | 63 ++++++++++++++
alternator-test/test_tag.py | 25 ++++++
alternator/executor.cc | 89 +++++++++++++-------
docs/alternator/alternator.md | 21 +++--
4 files changed, 162 insertions(+), 36 deletions(-)
Merged pull request https://github.com/scylladb/scylla/pull/5733 from
Piotr Jastrzębski:
In many places we use global_partitioner() to obtain parameters that are
available in config. This PR replaces number of global_partitioner() calls
with equivalent non-global ways.
tests: unit(dev)
* 'reduce_global_usage' of github.com:haaawk/scylla:
storage_service: reduce number of global_partitioner calls
cdc: remove partitioner from db_context
gossiper: stop calling global_partitioner()
system_keyspace: stop calling global_partitioner()
transport/server: stop calling global_partitioner()
thrift: stop calling global_partitioner()
partitioner: move cpu_sharding_algorithm_name to token-sharding.hh
Until now, write isolation policy was hardcoded to always enforcing LWT.
From now on, setting a tag via UpdateTags request or during table
creation will associate a policy with given table.
The tag key is 'system:write_isolation' and its value can be one of:
* 'f' - forbid RMW
* 'a' - always enforce RMW
* 'o' - only RMW writes will go through LWT
* 'u' - unsafe RMW (to be deprecated/eradicated)
Tagging a resource with a tag key that already exists should result
in overwriting the old value. It wasn't the case, so it's now fixed
and an appropriate test is added.
The signature of the helper function is changed, so that it's possible
to acquire a const reference of the tags, instead of being forced
to get a copy of the whole map (potentially large).
Here is a rebase of some of my already-reviewed Alternator patches -
the final piece of the fix to LWT timestamps (in BatchWriteItems),
The "/localnodes" request, and a couple of patches reducing the number
of times that the global storage_proxy is needed.
Also available in a github branch, git@github.com:nyh/scylla.git series1
* nyh/series1:
redis: remove redundant code
storage_proxy: make it into a peering sharded service
alternator: use simpler API for registering Alternator's HTTP URLs
alternator-test: test "/localnodes" feature
alternator: add public API for list of nodes in current DC
alternator: use LWT timestamp - in BatchWriteItems too
Replace global_partitioner().sharding_ignore_msb() call
with config::murmur3_partitioner_ignore_msb_bits()
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Sharding logic has been moved to token-sharding.hh some time ago.
This logic does not depend on partitioner any more so cpu_sharding_algorithm_name
can be safely moved to the header where rest of sharding logic lives.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
We consider globals like service::get_storage_proxy() a bad idea,
and would like to reduce their use as much as possible - and eventually,
eliminate it completely.
One easy case to fix case is when we already have a shard-local proxy,
but now we need the sharded object, to invoke_on() something on it.
In this patch, we turn storage_proxy into a peering_sharded_service.
This means that if you already have a storage_proxy, you can call
its container() function to get the sharded<storage_proxy>, without
needing to call the global service::get_storage_proxy().
We found a few such cases in storage_proxy itself, and in Alternator,
and fixed them to use container() instead of the global function.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
We used the Seastar HTTP server's add() method to register URLs to
serve (so-called "routes"), but as suggested by Amnon, when we have
fixed URLs without parameters being path components, it's simpler
to use the put() method to do the same thing - and also results in
slightly less work at run-time to look up these routes.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
This is a partial test for the "/localnodes" request, which is supposed to
return the list of live nodes in this DC. Because of the limitations of our
current alternator-test framework (which should work on any pre-existing
cluster), we don't know what to expects as a reply, but we just verify the
minimum: The request is understood, returns a JSON list, which contains
at least one item.
As "/localnodes" is a Scylla-only feature, this test is skipped when
running with "--aws".
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
If we want to balance the Alternator request load among the different nodes
(Refs #5030), the load balancer - whether it uses HTTP load balancing or
DNS - needs to be able to get an up-to-date list of live nodes to which it
can direct Alternator traffic. This list should include only the live nodes
in the same data center (geographical region) - it is expected that a
separate load balancer will be installed in each data center, and clients
from within this data center will reach this data center's load balancer.
There are multiple APIs in current Scylla to do something similar to what
we need, but as far as I know, none of them is exactly what we need or
convenient for Alternator installations: We don't want the load balancer
to use CQL, and the REST API http://localhost:10000/gossiper/endpoint/live/
doesn't do what we need (it doesn't restrict the list to one data center)
plus it's not open to connections outside the machine.
So in this patch, we implement a new HTTP request on the Alternator port -
"/localnodes", returning a JSON-formatted list of all live nodes in the
contacted node's data center:
$ curl http://localhost:8000/localnodes
["127.0.0.2","127.0.0.1","127.0.0.3"]
Like the existing health check HTTP request, this operation is public and
unauthenticated. We consider the security risk low - it allows an attacker
to enquire the list of Scylla nodes in this DC, but an attacker can achieve
the same thing by just scanning the addresses in this subnet using the health
check request (or even with ordinary DynamoDB API requests).
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
A previous patch fixed Alternator's writes to use the timestamp provided
by LWT instead of the current timestamp. That patch fixed the PutItem,
DeleteItem and UpdateItem operations - and this patch fixes the remaining
write operation: BatchWriteItems. So,
Fixes#5653.
Unfortunatly, the requirements of both BatchWriteItems and LWT make the
resulting code - and this patch - somewhat inelegant. BatchWriteItems
requires that we prepare all the operations first - failing if any of them
has an error. Before this patch, the result of this preparation was an
array of mutations, which in a second step we wrote to the database.
But we can no longer use mutations for the result of the first step,
because creating a mutation requires knowing the timestamp, which we don't
know during the preparate phase - we will only know it during the later
LWT operation. So now we need to invent a new intermediate format between
the request and the mutation. This intermediate format is further
complicated by the need to be send it between shards (for LWT's shard
forwarding) so it cannot, for example, contain a reference to a schema.
The fact that different sub-operations need to be sent to different shards,
and that different sub-operations may write to different tables, further
complicate the book-keeping and gives us a bunch of funky-typed maps.
But eventually it all fits together.
After this patch, as before this patch, the same code (now called
put_or_delete_item), is used to implement both the PutItem and DeleteItem
stand-alone operation, and the BachWriteItems operation which includes a
whole list of these PutItem and DeleteItem operation.
This patch also includes two more tests in test_batch.py, which test two
more corner tests we haven't tested before: One tests the capability of
BatchWriteItems to write to more than one table. The other tests that
BatchWriteItems can write an empty item (it is not surprising that it does,
but we do have special code for this case, so we should test it).
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
"
This series fixes an assertion when initialization fails after
creating a database. I don't know of a case where that currently
happens, but it is easy to cause that when writing a patch and the
produced assert is just confusing.
"
* 'espindola/dont-assert-on-init-error' of https://github.com/espindola/scylla:
db: Replace large_data_handler::_stopped with _running
db: Move nop_large_data_handler constructor out-of-line
db: Move large_data_handler::stop out-of-line
This tests fails when run on more than 1 core.
Tests: unit(dev)
* hawk/fix_memory_footprint:
memory_footprint_test: Make it clear it has to run with -c1
tests: move memory_footprint_test to perf/
"
After deprecating partitioners other than Murmur3 we can change the representation of tokens to int64_t. This will allow setting custom partitioner on each table. With this change partitioners become just converters from partition keys to tokens (int64_t). Following operations are no longer dependant on partitioner implementation:
- Tokens comparison
- Tokens serialization/deserialization to strings
- Tokens serialization/deserialization to bytes
- Sharding logic
- Random token generation
This change will be followed by a PR that enables per table partitioner and then another PR that introduces a special partitioner for CDC tables.
Tests: unit(dev)
Results of memory footprint test:
Differences:
in cache: 992 vs 984
in memtable: 750 vs 742
sizeof(cache_entry) = 112 vs 104
-- sizeof(decorated_key) = 36 vs 32
MASTER:
mutation footprint:
in cache: 992
in memtable: 750
in sstable: 351
frozen: 540
canonical: 827
query result: 342
sizeof(cache_entry) = 112
-- sizeof(decorated_key) = 36
-- sizeof(cache_link_type) = 32
-- sizeof(mutation_partition) = 96
-- -- sizeof(_static_row) = 8
-- -- sizeof(_rows) = 24
-- -- sizeof(_row_tombstones) = 40
sizeof(rows_entry) = 232
sizeof(lru_link_type) = 16
sizeof(deletable_row) = 168
sizeof(row) = 112
sizeof(atomic_cell_or_collection) = 8
THIS PATCHSET:
mutation footprint:
in cache: 984
in memtable: 742
in sstable: 351
frozen: 540
canonical: 827
query result: 342
sizeof(cache_entry) = 104
-- sizeof(decorated_key) = 32
-- sizeof(cache_link_type) = 32
-- sizeof(mutation_partition) = 96
-- -- sizeof(_static_row) = 8
-- -- sizeof(_rows) = 24
-- -- sizeof(_row_tombstones) = 40
sizeof(rows_entry) = 232
sizeof(lru_link_type) = 16
sizeof(deletable_row) = 168
sizeof(row) = 112
sizeof(atomic_cell_or_collection) = 8
"
* 'fixed_token_representation' of https://github.com/haaawk/scylla: (21 commits)
token: cast to int64_t not long in long_token
murmur3: move sharding logic to token and i_partitioner
partitioner: move shard_of_minimum_token to token
partitioner: remove token_to_bytes
partitioner: move get_token_validator to token
partitioner: merge tri_compare into dht::tri_compare
partitioner: move describe_ownership to token
partitioner: move from_bytes to token
partitioner: move from_string to token
partitioner: move to_sstring to token
partitioner: move get_random_token to token
partitioner: move midpoint function to token
token: remove token_view
sstables: use copy constructor for tokens
token: change _data to int64_t
partitioner: remove hash_large_token
token: change data to array<uint8_t, 8>
partitioner: Extract token to separate .hh and .cc files
partitioner: remove unused functions
Revert "dht/murmur3_partitioner: take private methods out of the class"
...
Since token representation is fixed now, all the partitioners
will share the sharding logic. It makes sense now to keep
the logic in common super class and separate header that's
included only in i_partitioner.cc.
shard_of and token_for_next_shard are now implemented in
i_partitioner. They would be non-virtual but we have to
keep them virtual because one test is overriding them
to enforce some specific sharding.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
i_partitioner::token_to_bytes is just a call to
token::data and does not depend on partitioner
at all. It is possible to convert token to bytes
without having access to partitioner.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Previously _data was stored as array of 8 bytes in
network byte order.
After this change it stores the same value in int64_t
in host byte order.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Now that token representation is always array<uint8_t, 8>,
hash<dht::token> will always pick
read_le<size_t>(reinterpret_cast<const char*>(b.data()))
and never call hash_large_token because the check
is always true b.size() == sizeof(size_t).
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
It is save to do such change because we support only
Murmur3Partitioner which uses only tokens that are
8 bytes long.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
This patch conflicts with the following patches.
The final effect is equivalent and it's easier to revert this patch
and cleanly apply already reviewed patches.
This reverts commit f4f8593bac.
This is not just a direct flip to a variable with the negated Boolean
value. When created, a large_data_handler is not considered to be
running, the user has to call start() before it can be used.
The advantaged of doing this is that if initialization fails and a
database is destructed before the large_data_handler is started, the
assert
database::stop() {
assert(!_large_data_handler->running());
is not triggered.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
"
This patch series fixes#5711, enables UDF support in CQL tests and
and includes a few extra cleanups.
"
* 'espindola/lua-fixes' of https://github.com/espindola/scylla:
lua: Use a negative index for consistency
lua: Fix returning list<decimal>
lua: Fix returning list<varint>
lua: Use a lua_slice_state instead of a from_lua_visitor
test: Enable UDF in the cql repl
The DynamoDB API doesn't have the notion of client-supplied timestamps,
so the server is supposed to use its own current timestamp for write
operations.
However, for LWT writes, we should not use this node's current time:
Different nodes may slightly differ in their clocks, and LWT needs
a monotonically-increasing notion of time for the consistent operations.
LWT provides to the operation's apply() method the specific timestamp that
it should use in its returned mutation - and we should use this timestamp,
not the current timestamp.
In the optional write modes where LWT is not used, we continue to use
the current timestamp (api::new_timestamp()) as before.
This patch fixes the PutItem, UpdateItem and DeleteItem operations.
The BatchWriteItem operation is not yet fixed by this patch - fixing
it will require more elaborate code changes so will be done in a
separate patch.
Refs #5653.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200130122853.7658-1-nyh@scylladb.com>
Today, we use LWT for all PutItem, UpdateItem and GetItem operations.
We do this even for pure writes - writes which do not involve a read
before the write).
But BatchWriteItem also does pure writes - and it doesn't use LWT yet.
So this patch changes it so it does. As before we keep in the code -
not yet configurable by a user - also the option to do these unconditional
writes without LWT.
A BatchWriteItem may change multiple partitions (but a fairly low number -
DynamoDB allows each BatchWriteItem to only do 25 updates) and we start the
different LWT operations in parallel.
This patch collects multiple mutations to the same partition together to
be done with a single LWT operation, so we also add a test for this case,
were we have a batch of writes involving several items in each of several
partitions.
Fixes#5637
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200128160538.11775-1-nyh@scylladb.com>
This produce code that is just as fast as the previous implementation
and is quite a bit easier to read IMHO.
I benchmarked it by temporally adding:
BOOST_AUTO_TEST_CASE(bench_maybe_quote) {
std::string val(1 << 20, 'x');
using clk = std::chrono::steady_clock;
cql3::util::maybe_quote(val);
auto start = clk::now();
for (int i = 0; i < 1000; ++i) {
cql3::util::maybe_quote(val);
}
auto end = clk::now();
std::chrono::duration<double> duration = end - start;
std::cout << "delta = " << duration.count() << '\n';
}
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200203225140.180262-1-espindola@scylladb.com>
* seastar 65980a9b30...30185fd901 (12):
> sstring: resize: NulTerminate when downsizing
> reactor: make open_flags::dsync respect --unsafe-bypass-fsync
> json/json_elements: Use double quotes around element name
> Revert "reactor: make open_flags::dsync respect --unsafe-bypass-fsync"
> Merge "smp: reduce allocations in work_item::process" from Avi
> task: optimize destruction by making destructor non-virtual
> reactor: make open_flags::dsync respect --unsafe-bypass-fsync
> Revert "sstring: resize: NulTerminate when downsizing"
> sstring: resize: NulTerminate when downsizing
> tests: Rename unix domain socket test for consistency
> resource: downgrade cgroupsv2 message.
> Merge "Simplify the stream/subscription implementation" from Rafael
Merged pull request https://github.com/scylladb/scylla/pull/5485
by Kamil Braun:
This series introduces the notion of CDC generations: sets of CDC streams
used by the cluster to choose partition keys for CDC log writes.
Each CDC generation begins operating at a specific time point, called the
generation's timestamp (cdc_streams_timestamp in the code).
It continues being used by all nodes in the cluster to generate log writes
until superseded by a new generation.
Generations are chosen so that CDC log writes are colocated with their
corresponding base table writes, i.e. their partition keys (which are CDC
stream identifiers picked from the generation operating at time of making
the write) fall into the same vnode and shard as the corresponding base
table write partition keys. Currently this is probabilistic and not 100%
of log writes will be colocated - this will change in future commits,
after per-table partitioners are implemented.
CDC generations are a global property of the cluster -- they don't depend
on any particular table's configuration. Therefore the old "CDC stream
description tables", which were specific to each CDC-enabled table,
were removed and replaced by a new, global description table inside the
system_distributed keyspace.
A new generation is introduced and supersedes the previous one whenever
we insert new tokens into the token ring, which breaks the colocation
property of the previous generation. The new generation is chosen to
account for the new tokens and restore colocation. This happens when a
new node joins the cluster.
The joining node is responsible for creating and informing other nodes
about the new CDC generation. It does that by serializing it and inserting
into an internal distributed table ("CDC topology description table").
If it fails the insert, it fails the joining process. It then announces
the generation to other nodes through gossip using the generation's
timestamp, which is the partition key of the inserted distributed table
entry.
Nodes that learn about the new generation through gossip attempt to
retrieve it from the distributed table. This might fail - for example,
if the node is partitioned away from all replicas that hold this
generation's table entry. In that case the node might stop accepting
writes, since it knows that it should send log entries to a new generation
of streams, but it doesn't know what the generation is. The node will keep
trying to retrieve the data in the background until it succeeds or sees
that it is no longer necessary (e.g., because yet another generation
superseded this one). So we give up some availability to achieve safety.
However, this solution is not completely safe (might break consistency
properties): if a node learns about a new generation too late (if gossip
doesn't reach this node in time), the node might send writes to the wrong
(old) generation. In the future we will introduce a transaction-based
approach where we will always make sure that all nodes receive the new
generation before any of them starts using it (and if it's impossible
e.g. due to a network partition, we will fail the bootstrap attempt).
In practice, if the admin makes sure that the cluster works correctly
before bootstrapping a new node, and a network partition doesn't start
in the few seconds window where a new generation is announced, everything
will work as it should.
After the learning node retrieves the generation, it inserts it into an
in-memory data structure called "CDC metadata". This structure is then
used when performing writes to the CDC log -- given the timestamp of the
written mutation, the data structure will return the CDC generation
operating at this time point. CDC metadata might reject the query for
two reasons: if the timestamp belongs to an earlier generation, which
most probably doesn't have the colocation property anymore, or if it is
picked too far away into the future, where we don't know if the current
generation won't be superseded by a different one (so we don't yet know
the set of streams that this log write should be sent to). If the client
uses server-generated timestamps, the query will never be rejected.
Clients can also use client-generated timestamps, but they must make sure
that their clocks are not too desynchronized with the database --
otherwise some or all of their writes to CDC-enabled tables will be
rejected.
In the case of rolling upgrade, where we restart nodes that were
previously running without CDC, we act a bit differently - there is no
naturally selected joining node which must propose a new generation.
We have to select such a node using other means. For this we use a bully
approach: every node compares its host id with host ids of other nodes
and if it finds that it has the greatest host id, it becomes responsible
for creating the first generation.
This change also fixes the way of choosing values of the "time" column
of CDC log writes: the timeuuid is chosen in a way which preserves
ordering of corresponding base table mutations (the timestamp of this
timeuuid is equal to the base table mutation timestamp).
Warning: if you were running a previous CDC version (without topology
change support), make sure to disable CDC on all tables before performing
the upgrade. This will drop the log data -- backup it if needed.
TODO in future patchset: expire CDC generations. Currently, each inserted
CDC generation will stay in the distributed tables forever (until
manually removed by the administrator). When a generation is superseded,
it should become "expired", and 24 hours after expiration, it should be
removed. The distributed tables (cdc_topology_description and
cdc_description) both have an "expired" column which can be used for
this purpose.
Unit tests: dev, debug, release
dtests (dev): https://jenkins.scylladb.com/job/scylla-master/job/byo/job/byo_build_tests_dtest/907/
Now that we bounce lwt requests to the correct shard before calling into
storage_proxy the cross shard op accounting does not account for bounced
lwt statement. Fix that by increasing corresponding counter when
returning a "bounce" reply.
Message-Id: <20200203122011.GH26048@scylladb.com>
Merged patch series from Benny Halevy:
The function was reimplemented to solve the following issues.
The cutom implementation also improved its performance in
close to 19%
Using regex_match("[a-z][a-z0-9_]*") may cause stack overflow on long input strings
as found with the limits_test.py:TestLimits.max_key_length_test dtest.
std::regex_replace does not replace in-place so no doubling of
quotes was actually done.
Add unit test that reproduces the crash without this fix
and tests various string patterns for correctness.
Note that defining the regex with std::regex::optimize
still ended up with stack overflow.
Fixes#5671
* cql3::util::maybe_quote: avoid stack overflow and fix quote doubling
* cql3::util::maybe_quote: further optimize quote doubling
Merged patch series from Gleb Natapov:
Batch statement can also execute LWT and hence need to handle
bounce_to_shard result.
* transport: handle bounce_to_shard for batch statement
* transport: consolidate bounce_to_shard handling between all three verbs that handle it
Command line options are printed out, so if a user cuts-and-pastes a
command line they will get a run that is more similar to the one that
the test executed.
Message-Id: <20200202133209.209608-1-avi@scylladb.com>
In this case we know the size of the stack and both indexes refer to
the same position. Using a negative index is just more consistent with
the rest of the file and hopefully a bit less brittle to future
changes.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
We were accessing the wrong stack location if a decimal was not at top
of the stack.
Fixes: #5711
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
We were accessing the wrong stack location if a varint was not at the
top of the stack.
Refs: #5711
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
A few places were using a from_lua_visitor only to access the
lua_slice_state member variable.
This is just a simplification. No functionality changed.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
All three verbs that need to handle bounce_to_shard have almost
identical process_*() and process_*_on_shard() functions. Consolidate
them into one to reuse the code.
The caller of check_knows_remote_features merges a set of
features into a string, but the method in question ... splits
them back into the set. Avoid this unneeded step and clean
the respective storage service helpers.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
There are some places that get global storage_service instance
for individual features. In the next patch all these helpers
will be removed, so here's the preparation for it.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
There's no local variable to get features from in the
create_view_statement constructor, but since the .validate
is always called after it, it looks safe to check for
needed feature in it (we have storage_proxy there).
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Keep local feature_service reference on database. This relaxes the
circular storage_service <-> database reference, but not removes it
completely.
This needs some args tossing in apply_to_builder, but it's
rather straightforward, so comes in the same patch.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Keep reference on local feature service from storage_proxy
and use it in places that have (local) storage_proxy at hands.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
This unties migration_manager from storage_service thus breaking
the circular dependency between these two.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The storage service no longer needs to mess with feature
config. It only needs two features to register onself in,
but this can be solved by respective cluster_supports_foo
helpers.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Now features are registered into a map of vectors, but
it looks like the vector is always 1-item long and is
used to keep pointer on feature, instead of the feature
itself.
Switch it into map of reference_wrapper-s.
Before this patch we could register more than one
feature under the same name, now we can't. But this
seems to be OK, as we don't actually do this. To catch
violations of this restriction there's an assert() in the
feature_service::register_feature.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
There are two masks -- supported and known. They differ in
unbounded_range_tombstones one which is set depending on the
sstables format in use.
Since the feature_service doesn't know anything about sstables
format, the logic is reverted -- the feature service reports
back the known mask (all features) and storage_service clears
the unbounded_range_tombstones if the sst format is low -- but
is (hopefully) left intact.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
And leave some temporary storage_service->feature links. The plan
is to make every subsystem that needs storage_service for features
stop doing so and switch on the feature_service.
The feature_service is the service w/o any dependencies, it will be
freed last, thus making the service dependency tree be a tree,
not a graph with loops.
While at it -- make all const-s not have _FEATURE suffix (now there
are both options) and const-qualify cluster_supports_lwt().
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Some features take db::config to find out whether to be enabled
or disabled. This creates unwanted dependency between database and
features, so split the features configuration explicitly. Also
this will make the "this is for testing env only" logic cleaner
and simpler to understand.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The _disabled_features is configured by tests via storage_service
constructor, so the helper in question is effectively useless.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
If we are a seed node (but not the only one) or we set
auto_bootstrap=off, it might happen due to misconfiguration or a network
partition that we don't know other nodes' tokens at the end of the
join_token_ring function, when we go into the NORMAL status, finishing
the joining process.
CDC however requires that we know other nodes' tokens at this point:
we need them to correctly create a new CDC generation.
This commit adds a check which prevents the node from starting if that's
not the case. If the check fails, the node first tries waiting a bit until
it learns about the tokens or timeouts.
"
The fix itself is fairly simple, but looking at the code I found that
our code base was not cleanly distinguishing null and empty values and
was treating null and missing values differently, but that distinction
was dead since a null is represented as a dead cell.
"
* 'espindola/lua-fix-null-v6' of https://github.com/espindola/scylla:
lua: Handle nil returns correctly
types: Return bytes_opt from data_value::serialize
query-result-set: Assert that we don't have null values
types: Fix comparison of empty and null data_values
Revert "tests: Handle null and not present values differently"
query-result-set: Avoid a copy during construction
types: Move operator== for data_value out-of-line
"
In a few places, the only use we had for a subscription was calling
done(). With this series we now call done() early and store the
future<> instead.
"
* 'espindola/stream-cleanup' of https://github.com/espindola/scylla:
sstable_test: Store a future<> instead of a subscription
commitlog: Store a future instead of a subscription in db::commitlog::segment_manager::list_descriptors::helper
lister: Store a future<> instead of a subscription
With this change we always rebuild seastar/libseastar_testing.a for
the same reason we always rebuild seastar/libseastar.a: We have no
idea what its dependencies are, we have to recurse to seastar to find
out.
The other missing dependency is that we have to rebuild build.ninja
when seastar/CMakeLists.txt changes. A change in
seastar/CMakeLists.txt can cause seastar.pc to change which can change
the command lines used.
That is incomplete as change other seastar files can have the same
impact, but it is better than nothing.
It is not sufficient to put a dependency in the seastar.pc file as
that file will be modified when cmake is run and the scylla ninja
process doesn't see the CMakeLists.txt to seastar.pc edge.
Fixes: #5687
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200201001126.458992-1-espindola@scylladb.com>
This unregistration doesn't happen currently, but doesn't seem to
cause any problems in general, as on stop gossiper is stopped and
nothing from it hits the store_service.
However (!) if an exception pops up between the storage_service
is subscribed on gossiper and the drain_on_shutdown defer action
is set up then we _may_ get into the following situation:
- main's stuff gets unrolled back
- gossiper is not stopped (drain_on_shutdown defer is not set up)
- migration manager is stopped (with deferred action in main)
- a nitification comes from gossiper
-> storage_service::on_change might want to pull schema with
the help of local migration manager
-> assert(local_is_initialized) strikes
Fix this by registering storage_service to gossiper a bit earlier
(both are already initialized y that time) and setting up unregister
defer right afterwards.
Test: unit(dev), manual start-stop
Bug: #5628
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20200130190343.25656-1-xemul@scylladb.com>
We use eventually() in tests to wait for eventually consistent data
to become consistent. However, we see spurious failures indicating
that we wait too little.
Increasing the timeout has a negative side effect in that tests that
fail will now take longer to do so. However, this negative side effect
is negligible to false-positive failures, since they throw away large
test efforts and sometimes require a person to investigate the problem,
only to conclude it is a false positive.
This patch therefore makes eventually() more patient, by a factor of
32.
Fixes#4707.
Message-Id: <20200130162745.45569-1-avi@scylladb.com>
No-one seems to invoke this method. Instead, clients invoke
restriction::values (note singular "restriction"). Most subclasses of
restrictions also inherit from restriction, so values() still exists
in their public interface.
Tests: unit (dev)
Signed-off-by: Dejan Mircevski <dejan@scylladb.com>
"
Benny pointed out that we could avoid a branch inside a loop is the
old serialization code. That got me looking at the logic and I found
that it would also produce an unnecessary 0xff prefix for some
negative numbers.
This patch series fixes the serialization and optimizes it. It now
does no extra copies for positives numbers and only one extra copy for
negative numbers, which I think is optimal since cpp_int uses sign
magnitude and we want the 2 complement representation.
"
* 'espindola/serialize_varint-improvements-v2' of https://github.com/espindola/scylla:
types: Use a fancy iterator to avoid a temporary buffer
types: Use export_bits to serialize cpp_int
types: Avoid a branch in a loop
types: Fix encoding of negative varint
types: Replace "num.sign() < 0" with "num < 0"
By using a fancy iterator we can avoid calling export_bits with a
temporary buffer before copying the result to the output.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
We would sometimes produce an unnecessary extra 0xff prefix byte.
The new encoding matches what cassandra does.
This was both a efficiency and correctness issue, as using varint in a
key could produce different tokens.
Fixes#5656
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
The only use we had for the subscription was calling done, may as well
call it early and store the future<>.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
The only use we had for the subscription was calling done, may as well
call it early and store the future<>.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
The only use we had for the subscription was calling done, may as well
call it early and store the future<>.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Treat writes to local.paxos as user memory, as the number of writes is
dependent on the amount of user data written with LWT.
Fixes#5682
Message-Id: <20200130150048.GW26048@scylladb.com>
This set implements support for per scheduling group statistics in
storage proxy and tables view statistics (although tables view per
scheduling group stats are not actively applied in this series).
Having those statistics per scheduling group can help in finding operations
that are performed outside their context, another advantage is that
it lays the land for supporting per service level statistics for the
workload prioritization enterprise feature.
At some point there was a thought to add those stats per role but
for now it is not feasible at the moment:
1. The number of roles/user is unbounded so it is dangerous to
hold stats (in memory) for all of them.
2. We will need a proper design of how to deal with the hierarchical
nature of roles in the stats.
Besides these reasons and regardless, it is beneficial to look on
resource related stats per scheduling group, looking at resources
per user or role will not necessarily give insights since resources
are divided per sg and not role, so it can lead to false conclusions
if more than one role is attached to the same service level.
Tests:
unit tests (Dev, Debug)
validating the stats with monitor
* es/per_sg_stats/v6:
storage proxy: migrate to per scheduling group statistics
internalize storage proxy statistics metric registration
This commit builds on top of the introduced per scheduling group
statistics template and employs it for achieving a per scheduling
group statistics in storage_proxy.
Some of the statistics also had meaning as a global - per
shard one. Those are the ones for determining if to
throttle the write request. This was handled by creating a
global stats struct that will hold those stats and by changing
the stat update to also include the global one.
One point that complicated it is an already existing aggregation
over the per shard stats that now became a per scheduling group
per shard stats, converting the aggregation to a two-dimensional
aggregation.
One thing this commit doesn't handle is validating that an individual
statistic didn't "cross a scheduling group boundary", such validation
is possible but it can easily be added in the future. There is a
subtlety to doing so since if the operation did cross to other
scheduling group two connected statistics can lose balance
for example written bytes and completed write transactions.
Signed-off-by: Eliran Sinvani <eliransin@scylladb.com>
The storage proxy statistics structure did not contain
a method for registering the statistics for metric
groups, instead, each user had to register some
of the metrics by itself. There is no real reason
for separating the metrics registration from
the statistics data. There is even less justification
for doing this only for part of the stats as is
the case for those statistics.
This commit internalize the metrics registration
in the storage_proxy stats structures.
Signed-off-by: Eliran Sinvani <eliransin@scylladb.com>
Avoid string copies when doubling quotes in the string
by counting them when scanning the input string and
reserving the required space when making the result std::string.
This showed a performance improvement of ~1.8% when
running the maybe_quote unit test in tight loop
(w/ the shorter strings only)
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
gcc 10 tightened its C++ includes to no longer provide ssize_t,
so we must get it from a C header instead.
Message-Id: <20200129205912.21139-1-avi@scylladb.com>
gcc 10 requires a semicolon after every compound requirement,
as per the standard. Add missing semicolons where necessary.
Message-Id: <20200129205805.20928-1-avi@scylladb.com>
To increase modularity, making it easier to find what is where and
maintain.
The 'log' module (cdc/log.{hh,cc}) is responsible for updating CDC log
tables when base table writes are performed.
The 'generation' module (cdc/generation.{hh,cc}) handles stream
generation changes in response to topology change events.
cdc/metadata.{hh,cc} contains a helper class which holds the currently
used generation of streams. It is used by both aforementioned modules:
'log' queries it, while 'generation' updates it.
Snitch forms a class hierarchy which get_shard_count and
get_ignore_msb_bits ignore (their returned values only depend on the
gossiper's state).
Besides, these functions just don't belong there.
Snitch has nothing to do with shard_count or ignore_msb_bits.
Change the CDC code to use the global CDC stream generations.
The per-base-table CDC description table was removed. The code instead
uses cdc::metadata which is updated on gossip events.
The per-table description tables were replaced by a global description
table to be used by clients when searching for streams.
When a node learns that another node joins the cluster (or begins
the joining process, i.e. bootstrap), it will read the CDC generation
timestamp proposed by that node, use it to retrieve the generation from the
distributed generations table, and save it in its local generation queue
to be used for writing to the CDC log when its local clock crosses
the generation's timestamp.
The CDC generation is saved in the queue before tokens are saved in
token_metadata. This is important so that when the node becomes
a coordinator of a write, it will already have all the necessary
information required to generate a corresponding CDC log mutation.
After joining, nodes should keep gossiping their proposed stream
generation timestamps forever, until they learn about a newer timestamp,
in which case they'll start gossiping the new timestamp.
There is one case where a node won't gossip such any generation timestamp:
if it's upgrading from a non-CDC version.
In this situation we make one of the nodes begin the first generation.
The class stores a queue of CDC generations to be used for choosing
streams when writing to the CDC log.
This data structure will be updated on some gossip events (when a new node
joins the cluster and proposes a new generation of CDC streams).
In future commits this will be used by nodes learning about other nodes
entering NORMAL status. The joining node proposes a new generation of streams,
whose timestamp is gossiped by the node.
Generate a new generation of streams during bootstrap,
insert it into an internal distributed table for other nodes to read
and save its timestamp in the system.local table.
When restarting, read the generation timestamp from the system.local table.
Gossip the generation timestamp.
This will be used to persist CDC streams generation timestamp
proposed by a joining node in case the node crashes or restarts,
similarly to the way tokens are persisted.
The get_saved_cdc_streams_timestamp method retrieves the generation
timestamp from the system table. It will be used by a restarting
node.
The update_cdc_streams_timestamp method saves CDC stream
generation timestamp of the calling node in the system table.
A joining node will persist the timestamp before it proposes it to other
nodes.
The cdc_topology_description table will be used internally
by nodes to send new CDC stream generations to other nodes.
The cdc_description table is a user-facing table,
used to inform users about new sets of CDC streams.
Regenerate sstables and digests for schema_change_test.
We don't need to protect this change by a schema feature:
when a node creates these tables, it announces them
to all other nodes. If schema agreement happens before
this migration, all nodes will use a digest calculated
without these tables. If it happens after, then all nodes
will eventually know about these tables and use a digest
calculated with these tables.
cdc::topology_description describes a mapping of tokens to CDC streams.
The cdc::generate_topology_description function is given:
1. a set of tokens which split the token ring into token ranges (vnodes),
2. information on how each token range is distributed among its owning
node's shards
and tries to generate a set of CDC stream identifiers such that for each
shard and vnode pair there exists a stream whose token falls into this
vnode and is owned by this shard.
It then builds a cdc::topology_description which maps tokens to these
found stream identifiers, such that if token T is owned by shard S in
vnode V, it gets mapped to the stream identifier generated for (S, V).
This is a class that will be used for storing information
required to perform CDC operations, i.e. assignment of token ranges to
CDC streams.
It is serializable to bytes and will be stored
in such a form in a distributed table accessible
by all nodes.
Allows calculating the shard of the given token using custom values of
shard_count and sharding_ignore_msb (instead of the ones used by the
particular partitioner instance).
The function was reimplemented to solve the following issues.
The cutom implementation also improved its performance in
close to 19%
Using regex_match("[a-z][a-z0-9_]*") may cause stack overflow on long input strings
as found with the limits_test.py:TestLimits.max_key_length_test dtest.
std::regex_replace does not replace in-place so no doubling of
quotes was actually done.
Add unit test that reproduces the crash without this fix
and tests various string patterns for correctness.
Note that defining the regex with std::regex::optimize
still ended up with stack overflow.
Fixes#5671
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
test.py returns -1 on failure; exit() translates that to 255, which git
bisect interprets as a special exit code requiring manual intervention.
Change to return the more traditional 1 on failure, which git bisect
can interpret as a normal failure condition.
Message-Id: <20200130084950.4186598-1-avi@scylladb.com>
With this patch lua nil values are mapped to CQL null values instead
of producing an error.
Fixes#5667
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Since a data_value can contain a null value, returning bytes from
serialize() was losing information as it was mapping null to empty.
This also introduces a serialize_nonnull that still returns bytes, but
results in an internal error if called with a null value.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Null values are represented with dead cells and never included in a
result_set. To enforce that, this adds a non_null_data_value that
wraps a data_value and whose constructor calls on_internal_error if a
null data_value is passed.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Before this patch a null data_value would compare equal to any
data_value that serialized to an empty byte sequence.
With this patch null only compares equal to null.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
This reverts commit 2ebd1463b2.
The test introduced by that commit was wrong, and in fact depended on
a bug in operator== for data_value. A followup patch fixes operator==,
so this reverts the broken commit first.
The reason it was broken was that it created a live cell with a null
data_value. In reality, null values are represented with dead cells.
For example, the sstable produced by
CREATE TABLE my_table (key int PRIMARY KEY, v1 int, v2 int) with compression = {'sstable_compression': ''};
INSERT INTO my_table (key, v1, v2) VALUES (1, 42, null);
Is
00 04 key_length
00 00 00 01 key
7f ff ff ff local_deletion_time
80 00 00 00 00 00 00 00 marked_for_delete_at
24 HAS_ALL_COLUMNS | HAS_TIMESTAMP
09 row_body_size
12 prev_unfiltered_size
00 delta_timestamp
08 USE_ROW_TIMESTAMP_MASK
00 00 00 2a value
0d USE_ROW_TIMESTAMP_MASK | HAS_EMPTY_VALUE_MASK | IS_DELETED_MASK
00 deletion time
01 END_OF_PARTITION
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Merged patch series from Piotr Sarna:
This series adds the following to alternator:
- TagResource request
- UntagResource request
- ListTagsOfResource request
- Honoring "Tags" parameter in CreateTable
It also provides more tests for above features and extended docs.
Tagging is backed by a schema extension, which is in turn backed
by entries in system_schema.tables.extensions map.
Tags are considered part of the schema, and in particular
they are updated via an equivalent of:
ALTER TABLE table WITH scylla_tags = {'key1':'v1', 'key2':'v2'}
Each tag change is therefore a schema change, which also means
that editing tags for the same table on different nodes may be
subject to races, until the schema agreement issues are resolved
in Scylla.
Fixes#5066
Tests: alternator-test(local, remote)
Piotr Sarna (6):
alternator,main: add tags schema extension
alternator: add creating values from string views
alternator: implement tagging
alternator: allow tagging on table creation
docs: add entries for alternator tags and arn
alternator-test: make test tables case sensitive
alternator-test/test_tag.py | 63 ++++++++++-
alternator-test/util.py | 2 +-
alternator/executor.cc | 191 ++++++++++++++++++++++++++++++++--
alternator/executor.hh | 3 +
alternator/rjson.cc | 4 +
alternator/rjson.hh | 1 +
alternator/server.cc | 3 +
alternator/tags_extension.hh | 52 +++++++++
docs/alternator/alternator.md | 14 ++-
main.cc | 5 +
10 files changed, 325 insertions(+), 13 deletions(-)
create mode 100644 alternator/tags_extension.hh
Attempting to apply timed-out writes is a wasted effort. The coordinator
have already given up on the write and reported it as failed to the
client. Any cycles spent on this write is a waste at this point.
We currently only check the timeout if the write is blocked on memory,
otherwise, if the system is not under pressure, we will happily apply
timed out writes. If the system is under pressure we will make it worse
by wasting cycles on processing a timed out write.
Prevent this by checking the timeout as early as possible in
`database::apply()` and `database::apply_counter_update()`.
This patch doesn't solve all our problems related to timed out writes.
They can still sit and accumulate in various queues without expiring, a
prominent example being the smp queues. It is however a good first step
towards reducing wasted effort spent on them.
Refs: #5055
Ref #5251
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200129093007.550250-1-bdenes@scylladb.com>
After 546556b71b we can have mixed writes into commitlog,
some do flush immediately some do not. If non flushing write races with
flushing one and becomes responsible for writing back its buffer into a
file flush will be skipped which will cause assert in batch_cycle() to
trigger since flush position will not be advanced. Fix that by checking
that flush was skipped and in this case flush explicitly our file
position.
Fixes#5670
Message-Id: <20200128145103.GI26048@scylladb.com>
During table creation, it's now possible to provide a 'Tags' parameter,
which will add tags to a newly created table. Note that creating a table
and tagging it is not atomic, so in case of failure it's possible to end
up with a created table, but without appropriate tags.
This commit comes with a test.
Message-Id: <00c2e202e9075d2c61e4ee5ba322ff4d5dbe718c.1579618972.git.sarna@scylladb.com>
A schema extension is introduced for alternator - tags.
This schema extension can be used to store arbitrary tags for a table,
in the form of a map<text, text>.
Updating tags for a table is equivalent to the following CQL query:
ALTER TABLE table WITH scylla_tags = {'key1':'v1', 'key2':'v2'}
The extension, as all other extensions, is backed by the entry
in the system_schema.tables table.
Add "const" attributes to `assignment_testable::test_assignment`
and `term::raw::prepare` methods. These should have been marked as
"const" even before the change but for some reason were missing
these qualifiers.
Mark other supplementary methods with "const" attributes as
necessary.
Tests: unit(dev, debug)
Signed-off-by: Pavel Solodovnikov <pa.solodovnikov@scylladb.com>
Message-Id: <20200127213215.494000-1-pa.solodovnikov@scylladb.com>
"
Throw an error in case we hit an invalid time UUID
rather than hitting an assert.
Fixes#5552
(Ref #5588 that was dequeued and fixed here)
Test: UUID_test, cql_query_test(debug)
"
* 'validate-time-uuid' of https://github.com/bhalevy/scylla:
cql3: abstract_function_selector: provide assignment_testable_source_context
test: cql_query_test: add time uuid validation tests
cql3: time_uuid_fcts: validate timestamp arg
cql3: make_max_timeuuid_fct: delete outdated FIXME comment
cql3: time_uuid_fcts: validate time UUID
test: UUID_test: add tests for time uuid
utils: UUID: create_time assert nanos_since validity
utils/UUID_gen: make_nanos_since
utils: UUID: assert UUID.is_timestamp
"
This PR makes named_value respect allowed_values and then use it to transition away from old deprecated RandomPartitioner and ByteOrderedPartitioner. Then it removes the code that's no longer used.
We want to remove deprecated partitioners because, on one hand, they lead to performance problems and hot nodes. Moreover, we're planning to unify the token representation which would allow per table partitioner support. That, in turn, is a feature helpful in multiple efforts like CDC, materialized views, secondary indexes and multi-tenancy.
tests: unit(dev)
"
* 'remove_deprecated_partitioners' of https://github.com/haaawk/scylla:
partitioners: remove random_partitioner
partitioners: Make it impossible to use RandomPartitioner
partitioners: remove byte_ordered_partitioner
partitioners: Make it impossible to use ByteOrderedPartitioner
partitioners: Remove leftovers of OrderPreservingPartitioner
i_partitioner.cc: stop including byte_ordered_partitioner.hh
i_partitioner.cc: stop including random_partitioner.hh
config: use allowed_values to verify named_value input
config: add operator<< for seed_provider_type
Since we now default to lld if present, and since lld is a faster
linker than either ld or gold, it makes sense to install it
as a dependency and to make it available as part of the frozen
toolchain.
"
Grab the lowest hanging fruits.
This patch-set makes three important changes:
* Consume the memory for I/O operations on tracked files, *before* they
are forwarded to the underlying file.
* Track memory consumed by buffers created for parsing in
`continuous_data_consumer`. As this is the basis for the data, index
and promoted index parsers, all three are covered now in this regard.
* Track the index file.
The remaining, not-so-low handing fruits in order of
gain/cost(performance) ratio:
* Track in-memory index lists.
* Track in-memory promoted index blocks.
* Track reader buffer memory.
Note that this ordering might change based on the workload and other
environmental factors.
Also included in this series is an infrastructure refactoring to make
tracking memory easier and involve including lighter headers, as well as
a manual test designed to allow testing and experimenting with the
effects of changes to the accuracy of the tracking of reader memory
consumption.
Refs: #4176
Refs: #2778
Tests: unit(dev), manual(sstable_scan_footprint_test)
The latter was run as:
build/dev/test/manual/sstable_scan_footprint_test -c1 -m2G --reads=4000
--read-concurrency=1 --logger-log-level test=trace --collect-stats
--stats-period-ms=20
This will trickle reads until the semaphore blocks, then wait until the
wait queue drains before sending new reads. This way we are not testing
the effectiveness of the pre-admission estimation (which is terribly
optimistic) and instead check that with slowly ramping up read load the
semaphore will block on memory preventing OOM.
This now runs to completion without a single `std::bad_alloc`. The read
concurrency semaphore allows between 15-30 reads, and is always blocked
on memory.
"
* 'more-accurate-reader-resource-tracking/v1' of ssh://github.com/denesb/scylla:
test/manual/sstable_scan_footprint_test: improve memory consumption diagnostics
tests/manual/sstable_scan_footprint_test: use the semaphore to determine read rate
tests/manual: Add test measuring memory demand of concurrent sstable reads
index_reader: make the index file tracked
sstables/continuous_data_consumer: track buffers used for parsing
reader_concurrency_semaphore: tracking_file_impl: consume memory speculatively
reader_concurrency_semaphore: bye reader_resource_tracker
treewide: replace reader_resource_tracer with reader_permit
reader_permit: expose make_tracked_temporary_buffer()
reader_permit: introduce make_tracked_file()
reader_permit: introduce memory_units
reader_concurrency_semaphore: mv reader_resources and reader_permit to reader_permit.hh
reader_concurrency_semaphore: reader_permit: make it a value type
reader_concurrency_semaphore: s/resources/reader_resources/
reader_concurrency_semaphore::reader_permit: move methods out-of-line
In Scylla all query processing activity should run under the
"statement" scheduling group. The scheduling group is
important for maintaining the balance between background and
foreground tasks in Scylla.
Testing: In order to test the correctness of the patch.
First, the following assert was inserted before any call
to one of the executor functions in the http route:
assert(current_scheduling_group().name() == "statement"
Then all alternator tests ran and passed.
The second stage was to change the name so the assert
will fail:
assert(current_scheduling_group().name() == "no-statement"
And ran the tests again - validating that Scylla coredumps.
The asserts were then removed.
Fixes#5008
Signed-off-by: Eliran Sinvani <eliransin@scylladb.com>
Message-Id: <20200127154341.10020-1-eliransin@scylladb.com>
"
There seem to be two problems with handling snapshot API -- one
on start and the other one on stop. Here's the set that addresses
both.
The fix moved snapshot API registration later in time that required
Amnon's ACK. Now we have it :) so -- the rebase and resend.
Tests: unit(dev), start-stop
"
* 'br-snapshot-bugs-2' of https://github.com/xemul/scylla:
snapshot: Pass requests through gate
api: Register snapshot API later
api: Unwrap wrap_ks_cf
"
With these changes and a binutils compiled with
--enable-deterministic-archives, the only difference I get in the
build directory if I build scylla twice from scratch are:
* The various CMakeError.log because they have temporary file names.
* The various CMakeOutput.log for the same reason.
* .ninja_log and .ninja_deps. I am not sure what the contents are.
"
* 'espindola/fix-determinism' of https://github.com/espindola/scylla:
build: remove timestamps from then antlr output
build: Make the output of idl-compiler deterministic
This depends on the patch
mk: avoid combining -r and -export-dynamic linker options
being added to dpdk.
I benchmarked this on top of my patches to get a reproducible build. I
first compiled with ccache, deleted the build directory and recompiled
so that all the "gcc -c" invocations were served by ccache. The times
of the second "ninja release" invocations were:
lld:
ninja release 155.68s user 71.89s system 2077% cpu 10.953 total
gold:
ninja release 953.79s user 254.71s system 2533% cpu 47.699 total
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200127171516.26268-1-espindola@scylladb.com>
> memory: add scoped_heap_profiling
> build: add switch to enable heap profiling support
> io_tester: do not abort on end of test
> resource: clean up cgroups version determination.
> prometheus: Silence a bogus gcc warning in http server
> Update dpdk submodule
> resource: Support cgroups v2
> net: Don't use variable length arrays
> core/memory.hh: document set_heap_profiling_enabled()
> Revert "net: Don't use variable length arrays"
> cmake: fix pkgconfig boost deps
> thread: Avoid confusing comment by switching value
> net: posix-stack: fix allocator in ap listening sockets
> net: posix-stack: fix passing allocator to new sockets
> stall_detector: Add a counter for stall detector report
> Merge "Don't use variable length arrays" from Rafael
> treewide: fix minor issues reported by clang
> thread: Call mprotect in make_stack
> thread: Always allocate stack with aligned_alloc
> build: Make SEASTAR_THREAD_STACK_GUARDS private
> thread: Move code out of a header
Merged patch series from Konstantin Osipov:
This series sets cql_repl core count to 1 and adds LWT
unit tests.
test.py: invoke cql_repl with smp=1
lwt: add lightweight transactions unit tests
Merged patch series from Piotr Sarna:
In order to minimize the usage of throws and catches in code paths
that are potentially hot, these paths instead return appropriate errors
directly.
The server layer is still able to catch and translate errors,
but the preferred way is to return api_error directly in places
that may be performance-sensitive.
Tests: alternator-test(local)
Fixes#5472
Piotr Sarna (3):
alternator: change request return type to variant<value, error>
alternator: elide throwing in condition checks
alternator: replace top-level throws with returns in executor
alternator/executor.hh | 28 ++++----
alternator/server.hh | 4 +-
alternator/executor.cc | 141 +++++++++++++++++++++--------------------
alternator/server.cc | 44 ++++++++-----
4 files changed, 117 insertions(+), 100 deletions(-)
Exclude cql_repl from the list of tests, since it's not a test.
Build it as a separate app. Do not strip, so that any CQL test
failure is easy to debug without a rebuild.
All test-related targets are converted from lists to sets to avoid
quadratic lookup cost in the check inside the loop which creates the
ninja file.
Currently --ami does not check instance types, creates invalid
io_properties.yaml on unsupported instance types.
It actually won't occur on AMI startup, since scylla_ami_setup only
invoke scylla_io_setup --ami when the instance is supported, so we don't
get the issue on startup, but we still get when we run scylla_io_setup
manually.
It's better to check instance type on scylla_io_setup, too.
Refs #5438
Conditional updates inform the user that the condition is not met
by returning an error. An initial implementation was based on rethrowing
these errors, but returning them directly is considered better
for performance.
This will prevent contention in case of parallel updates of the same row
by the same coordinator. The patch does it by introducing a new per key
lock map and taking it before running PAXOS protocol (either for write
of for read).
Message-Id: <20200117101228.GA14816@scylladb.com>
In order to minimize the use of exceptions during normal operations,
each request handler is now able to return either a proper JSON value,
or an instance of api_error, which indicates that something went wrong,
but without having to throw, catch and rethrow C++ exceptions.
This is especially important for conditional updates, since it's
expected to be common to return ConditionalCheckFailedException.
Message-Id: <d8996a0a270eb0d9db8fdcfb7046930b96781e69.1579515640.git.sarna@scylladb.com>
Docker restricts the number of processes in a container to some
limit it calculates. This limit turns out to be too low on large
machines, since we run multiple links in parallel, and each link
runs many threads.
Remove the limit by specifying --pids-limit -1. Since dbuild is
meant to provide a build environment, not a security barrier,
this is okay (the container is still restricted by host limits).
I checked that --pids-limit is supported by old versions of
docker and by podman.
Fixes#5651.
Message-Id: <20200127090807.3528561-1-avi@scylladb.com>
"
Final set of changes for full scan metrics.
- allow filtering
- full scan (Note: non-system tables only)
- full scan without BYPASS CACHE option
- tests for all metrics (bypass cache, allow filtering, full scan)
- works with prepared statements (tested, too)
"
* 'as_full_scan_metrics' of https://github.com/alecco/scylla:
Range scan query counter
Counter of queries doing full scan.
ALLOW FILTERING query counter
This test is all about tracking measured memory consumption vs. real
memory consumption. To make this easier add additional diagnostics:
* enable seastar heap profiler for the duration of the reads (seastar
has to be compiled with `-DSEASTAR_HEAPPROF`).
* Add a stats collector, which periodically collects stats such as
non-LSA free/used memory, LSA free/used memory and memory tracked by
the reader concurrency semaphore. These stats are written to a `.csv`
file, allowing importing them into a spreadsheet and processing them.
Currently the test fires the configured amount of reads at once. This is
somewhat restricting in the number of testable scenarios. For example,
it doesn't allow one to see if the semaphore correctly tracks the memory
consumption of existing reads, by firing new reads after a while.
Replace this algorithm by one which fires reads with a configured
concurrency, then waits for the semaphore's queue (if any) to drain,
before firing new reads. The test can now be configured with the total
amount of reads to fire, and with the read-concurrency, i.e. the number
of reads to fire at once in each iteration.
This allows for much greater flexibility in the different test
scenarios. The previous behaviour can still be achieved by configuring
a concurrency of 100.
This patch also adds better error handling. Reads are aborted on the
first error and errors are caught and not allowed to bubble up past the
test's main function and are logged instead.
Extensive logging is also added to be able to monitor the system while
the test is running.
Allow manual experimentation with the effectiveness of the accuracy of
the tracking of the resource consumption of readers, and hence the
system's ability to prevent overload and the dreaded `std::bad_alloc`.
This patch was originally developed by
Tomasz Grabiec <tgrabiec@scylladb.com>, I only adapted it to compile and
link on current master.
Based on heap profiling, buffers used for storing half-parsed fields are
a major contributor to the overall memory consumption of reads. This
memory was completely "under the radar" before. Track it by using
tracked `temporary_buffer` instances everywhere in
`continuous_data_consumer`. As `continuous_data_consumer` is the basis
for parsing all index and data files, adding the tracing here
automatically covers all data, index and promoted index parsing.
I'm almost convinced that there is a better place to store the `permit`
then the three places now, but so far I was unable to completely
decipher the our data/index file parsing class hierarchy.
Consume the memory before even submitting the I/O to the underlying
`file` object. This is in line with the underlying `file` object
allocating the buffer before it forwards the I/O request to the kernel.
This extends the "visibility" over the memory consumed by I/O greatly,
as it turns out buffers spend most time alive waiting for the I/O to
complete and are parsed shortly afterwards.
The former was never really more than a reader_permit with one
additional method. Currently using it doesn't even save one from any
includes. Now that readers will be using reader_permit we would have to
pass down both to mutation_source. Instead get rid of
reader_resource_tracker and just use reader_permit. Instead of making it
a last and optional parameter that is easy to ignore, make it a
first class parameter, right after schema, to signify that permits are
now a prominent part of the reader API.
This -- mostly mechanical -- patch essentially refactors mutation_source
to ask for the reader_permit instead of reader_resource_tracking and
updates all usage sites.
Previously `tracking_file_impl::make_tracked_buf()`. In the next patches
we plan on using this outside `tracking_file_impl`, so make it public
and templatize on the char type.
Similar to `seastar::semaphore_units`, this allows consuming and
releasing memory via an RAII object. In addition to that, it also allows
tracking changing values. This feature was designed to be used for
tracking the ever changing memory consumption of the buffers of
`flat_mutation_reader`:s.
This is now the only supported way of consuming memory from a permit.
In the next patches we will replace `reader_resource_tracker` and have
code use the `reader_permit` directly. In subsequent patches, the
`reader_permit` will get even more usages as we attempt to make the
tracking of reader resource more accurate by tracking more parts of it.
So the grand plan is that the current `reader_concurrency_semaphore.hh`
is split into two headers:
* `reader_concurrency_semaphore.hh` - containing the semaphore proper.
* `reader_permit.hh` - a very lightweight header, to be used by
components which only want to track various parts of the resource
consumption of reads.
Currently `reader_permit` is passed around as
`lw_shared_ptr<reader_permit>`, which is clunky to write and use and is
also an unnecessary leak of details on how permit ownership is managed.
Make `reader_permit` a simple value type, making it a little bit easier
and safer to use.
In the next patches we will get rid of `reader_resource_tracker` and
instead have code use the permit instance directly, so this small
improvement in usability will go a long way towards preventing eye sore.
In preparation for making the reader_permit a top-level class, and
moving it to another file. It is also good practice to define
non-performance critical methods out-of-line to reduce header bloat.
These unit tests cover all CQL aspects of lightweight transactions,
such as grammar, null semantics, batch semantics, result set
format, and so on.
For now, comment out unicode tests: test output depends
on libjsoncpp version in use.
When the scylla process is stopped no code waits for
current snapshot operations to finish. Also, the API
server is not stopped either, so new snapshot requests
can creep into.
In seastar there's a useful abstraction to address both.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
In storage_service's snapshot code there are checks for
_operation_mode being _not_ JOINING to proceed. The intention
is apparently to allow for snapshots only after the cluster
join. However, here's how the start-up code looks like
- _operation_mode = STARTING in storage_service::constructor
- snapshot API registered in api::set_server_storage_service
- _operation_mode = JOINING in storage_service::join_token_ring
So in between steps 2 and 3 snapshots can be taken.
Although there's a quick and simple fix for that (check for the
_operation_mode to be not STARTING either) I think it's better
to register the snapshot API later instead. This will help
greatly to de-bload the storage_service, in particular -- to
incapsulate the _operation_mode properly.
Note, though the check for _operation_mode is made only for
taking snapshot, I move all snapshot ops registration to the
later phase.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
This is preparation for the next patch -- the lambda in
question (and the used type) will be needed in two
functions, so make the lambda a "real" function.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Make sure that the timestamp argument does not overflow
60 bits when converted to units of 100 nanos since
epoch, like with writetime() that returns microseconds since epoch
in contrast to other time functions like
unixtimestampof that return millis since epoch.
Fixes#5552
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Safely convert millis to "nanos_since" (number of 100
nanseconds since START_EPOCH) while type casting to uint64_t
to avoid possible int overflow.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
We need to add '~' to handle rcX version correctly on Debian variants
(merged at ae33e9f), but when we moved to relocated package we mistakenly
dropped the code, so add the code again.
Fixes#5641
The method view_info::partition_ranges() is unused.
Also drop the now-dead _partition_ranges data member.
Signed-off-by: Dejan Mircevski <dejan@scylladb.com>
This is a fourth iteration of the patch series adding LWT usage
(instead of the old naive - and wrong - read before write) to
Alternator, as well as full support for the ConditionExpression
syntax for conditional updates.
Changes in v4:
* Rebased to most recent master
* Replaced 3 booleans which had 2^3 = 8 theoretical combinations,
by just 4 options in enum write_isolation:
FORBID_RMW, LWT_ALWAYS, LWT_RMW_ONLY, UNSAFE_RMW
The four options are described in details comments.
* Fix reversed assertion in FORBID_RMW case.
* Two new metrics: write_using_lwt and shard_bounce_for_lwt.
* Fail boot if alternator is enabled, but LWT isn't.
* Add information about enabling LWT in docs/alternator/alternator.md
* nyh/v4-lwt:
alternator: add support for ConditionExpression
alternator: reimplement read-modify-write operations using LWT
alternator: make "executor" a peering_sharded_service
Implements a counter of executions of SELECT queries with ALLOW FILTERING option.
In scope of #5209
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Previous patch makes it impossible to configure Scylla
with RandomPartitioner so this code is effectively dead now.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
RandomPartitioner has been deprecated for 2.5 year.
Now we drop the support for it. There are two reasons for this.
First, this partitioner can lead to uneven distribution of partitions
among the nodes in the cluster which leads to hot nodes.
Second, we're planning to unify the representation of tokens and
fix it as int64_t. RandomPartitioner does not comply with this.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Previous patch makes it impossible to configure Scylla
with ByteOrderedPartitioner so this code is effectively dead now.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
ByteOrderedPartitioner has been deprecated for 2.5 year.
Now we drop the support for it. There are two reasons for this.
First, this partitioner can lead to uneven distribution of partitions
among the nodes in the cluster which leads to hot nodes.
Second, we're planning to unify the representation of tokens and
fix it as int64_t. ByteOrderPartitioner does not comply with this.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
OrderPreservingPartitioner seems to be long gone and not supported
so remove all the places it's still mentioned.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Even though we configure the set of accepted values for
some config flags, named_value ignore them.
This patch implements the checks that verify flag is
not set to the value that's not on the list.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
This patch adds support for the ConditionExpression parameter of the
item-writing operations in Alternator: PutItem, UpdateItem and DeleteItem.
We already supported conditional updates/put/delete using the "Expected"
parameter. The ConditionExpression parameter implemented here provides a
very similar feature, using a different - and also newer and more powerful -
syntax.
The implementation here reuses much of our existing expression-parsing
infrastructure. Unsurprisingly, ConditionExpression's syntax has much in
common with UpdateExpression which we already support) and also many of the
comparison functions already implemented for "Expected". However, it's still
quite a bit of new code, because of the many different comparisons, functions,
and syntax variations we need to support.
This patch also expands alternator-test/test_condition_expression.py with
a few additional corner cases discovered during the development of this
patch.
Almost all of the tests for this feature (35 out of 39) now pass.
Two tests still fail because we don't yet support nested attributes (this
is a missing feature across Alternator), and two tests fail because of minor
ideosyncracies in DynamoDB's error path that we chose not to duplicate
yet (but still remember the difference in the form of an xfailing test).
Fixes#5035
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
In this patch, we re-implement the three read-modify-write operations -
PutItem, UpdateItem, DeleteItem. All three operations may need to read the
item before writing it to support conditional updates (the "Expected"
parameter) and UpdateItem may also need the previous item's value for
its update expression (e.g., a user may ask to "set a=a+1" or "set a=b").
Before this patch, the implementation of RMW operations simply did a read,
and then a write - without any attempt to protect concurrent operations.
In this patch, Scylla's LWT mechanism (storage_proxy::cas()) is used
instead, to ensure that concurrent update operations are correctly
isolated even if they are conditional. This means that Alternator now
requires the experimental LWT feature to be enabled (and refuses to
boot if it isn't).
The version presented here is configured to always use LWT for *every*
write, regardless of whether it has a condition or not. So it will
will significantly slow down write-only workloads like YCSB. But the code
in this patch actually includes three other modes, which can be chosen by
setting an enum constant in the code. In the future we will want to let the
user configure this mode, globally, per table or per attribute.
Note that read requests are NOT modified, and work exactly as they did
before: i.e., strongly-consistent reads are done using a normal
CL=LOCAL_QUORUM read - not via LWT. I believe this is good enough given
Dynamo's guarantees, and critical for our read performance.
Also note that patch doesn't yet fix the BatchWriteItem operation.
Although BatchWriteItem does not support any RMW operations - just pure
writes - we may still need to do those pure writes using LWT. This
should be fixed in a follow-up patch.
Unfortunately, this patch involves a large amount of code movement and
reorganization, because:
1. The cas operation requires each operation to be made into an object,
with a separate apply() function, forcing a lot of code to move.
2. Moreover, we need to do this for three different operations (PutItem,
UpdateItem, DeleteItem) so to avoid massive code duplication, I had
to move some common code.
3. The cas operation also forced us to change some of the utility functions'
APIs.
The end result is that this patch focuses more on a compact and
understandable *end result* than it does on an easy to understand *patch*,
so reviewers - sorry about that.
All alternator-test/ tests pass with this patch (and also with all of the
different optional modes enabled). However, other than that, I did not yet
do any real isolation tests (are concurrent operations really isolated
correctly? or is LWT just faking it? :-) ), performance tests or stress
tests - and I'll definitely need to do those as well.
Fixes#5054
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Alternator uses a sharded<executor> for handling execution of Alternator
requests on different shards. In this patch we make executor a subclass of
peering_sharded_service, to allow one of these executors to run an exector
method on a different shard: Any one of the shard-local executor instances
can call container() to get the full sharded<executor>.
We will need this capability later, when we need to bounce requests between
shards because of requirements of the storage_proxy::cas (LWT) code.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Following patch will start checking allowed_values
in named_value and print errors for wrong values.
This will require all the types used with named_value
to have operator<< implemented. seed_provider_type
is one such type.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
The output of antrl always has the timestamp of when it was
created. This expands the existing sed hack to remove that too.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
If at any point during the topological sort we had more than one node
with zero dependencies, the order they were printed was not
deterministic.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
"
This series refactors the code used by migration_notifier and gossiper
into an atomic_vector type.
"
* 'espindola/gossiper_atomic_vector' of https://github.com/espindola/scylla:
gossiper: Store subscribers in an atomic_vector
load_broadcaster: Unregister from load_broadcaster::stop_broadcasting
repair: add row_level::stop()
locator: Return future from i_endpoint_snitch::reload_gossiper_state
service: Refactor code into a atomic_vector class
migration_manager: Fix typo
load_meter: Use a shared_ptr to store a load_broadcaster
The new guarantees are a bit better IMHO:
Once a subscriber is removed, it is never notified. This was not true
in the old code since it would iterate over a copy that would still
have that subscriber.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
This templates the code for listener_vector, renames it to
atomic_vector and moves it to the utils directory.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
load_broadcaster::stop_broadcasting uses shared_from_this(). Since
that is the only reference that the produced shared_ptr knows of, it
is deleted immediately. Fix that by also using a shared_ptr in
load_meter.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
* seastar afc46681...147d50b1 (6):
> perftune.py: Use safe_load() for fix arbitrary code execution
Fixes#5630
> clang: current_exception_as_future must be in namespaced
> tests: add an expected failures version of thread fixture
> Enable stack guards in Dev builds
> net: posix: Introduce load_balancing_algorithm::fixed
> stream: Move _next from subscription to stream
`parsed_statement::get_bound_variables` is assumed to always
return a nonnull pointer to `variable_specifications` instance.
In this case using a pointer is superfluous and can be safely
replaced by a plain reference.
Also add a default ctor and a utility method `set_bound_variables`
to the `variable_specifications` class to actually reset the
contents of the class instance.
Tests: unit(dev, debug)
Signed-off-by: Pavel Solodovnikov <pa.solodovnikov@scylladb.com>
Message-Id: <20200120195839.164296-1-pa.solodovnikov@scylladb.com>
"
As a followup to 0bde590
This series implements suggestions from @avikivity and @espindola
It simplifies the template definitions for accumulator_for,
adds some debug logging for the overflow values,
and adds unit tests for float and double sum overflow.
Test: unit(dev),
paging_test:TestPagingWithIndexingAndAggregation.test_filter_{indexed,non_indexed,pk}_column(dev)
"
* 'simplify-sum-overflow' of https://github.com/bhalevy/scylla:
test: cql_query_test: test float/double sum overflow
cql3: aggregate_fcts: simplify accumulator_for template definitions
It is the only place where get_changed_ranges_for_leaving is not running
inside a thread. Preparing patch to futurize get_changed_ranges_for_leaving.
Refs: #5495
A mistake in handling legacy checks for special 'idx_token' column
resulted in not recognizing materialized views backing secondary
indexes properly. The mistake is really a typo, but with bad
consequences - instead of checking the view schema for being an index,
we asked for the base schema, which is definitely not an index of
itself.
Branches 3.1,3.2 (asap)
Fixes#5621Fixes#4744
Before this patch the iterations over migration_notifier::_listeners
could race with listeners being added and removed.
The addition side is not modified, since it is common to add a
listener during construction and it would require a fairly big
refactoring. Instead, the iteration is modified to use indexes instead
of iterators so that it is still valid if another listener is added
concurrently.
For removal we use a rw lock, since removing an element invalidates
indexes too. There are only a few places that needed refactoring to
handle unregister_listener returning a future<>, so this is probably
OK.
Fixes#5541.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200120192819.136305-1-espindola@scylladb.com>
"
This series fixes all abandoned failed futures in cql_query_test and
starts running it with --fail-on-abandoned-failed-futures to avoid
regressions.
"
* 'espindola/fix-abandoned-failed-futures' of https://github.com/espindola/scylla:
cql_query_test: Avoid new abandoned failed futures
cql_query_test: Explicitly ignore a failed future
cql_query_test: Remove duplicated do_with_cql_env_thread
cql_query_test: Fix cql and values in test_int_sum_with_cast
Now that cql_query_test has no abandoned failed futures, run it with
--fail-on-abandoned-failed-futures to avoid regressions.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
This test is not running because of the double
do_with_cql_env_thread. Fix it before we remove the extra
do_with_cql_env_thread.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
* seastar 3f3e117de3...afc46681e5 (7):
> json: add move assignment to json_return_type
> net: do not check if an unsigned variabe is less than 0
> stack: add virtual destructor definition for class w/ virtual functions
> future,json: add ":" at end of concept definition
> Fixing a bug in the handling of abort_accept()
> install-dependencies.sh: improve arch detect
> metrics: Avoid a copy during unregistration
We have numerous tests that rely on the seastar alloc failure injection
infrastructure to test the exception safety of different components.
These tests are essentially useless when the said infrastructure is
not enabled, which is currently the case for all build modes, allowing
bugs to sneak in undetected.
Enable the allocation failure injection infrastructure for the dev and
debug modes. Sanitize is excluded as it produces some (suspected false
positive) failures and is not run in gating either currently.
Tests: unit(dev, debug)
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200117104747.748866-1-bdenes@scylladb.com>
Merged patch series from Piotr Sarna:
This miniseries adds two simple prerequisites for implementing tagging:
1. A table is able to generate its Arn identifier
2. Simple tests for TagResource, UntagResource, ListTagsOfResource
In general, tags should be stored in table metadata - either by
expanding the schema of an existing schema table, e.g. scylla_tables,
or by providing another meta-table - e.g. system_schema.alternator_tables,
which stores alternator-specific metadata, like tags.
Refs #5066
Tests: alternator-test(local, remote)
Piotr Sarna (2):
alternator: add Arn support for tables
alternator-test: add basic tests for tags
alternator-test/test_describe_table.py | 1 -
alternator-test/test_tag.py | 88 ++++++++++++++++++++++++++
alternator/executor.cc | 5 ++
3 files changed, 93 insertions(+), 1 deletion(-)
create mode 100644 alternator-test/test_tag.py
Several API-s, e.g. TagResource, UntagResource and ListTagsOfResource
rely on identifying tables by their "Arn". According to the docs,
an Arn should uniquely identify a resource, so it's implemented as:
arn:KEYSPACE_NAME:TABLE_NAME
which is a minimal set of information that uniquely identifies a table
in Scylla. The `arn:` prefix is needed for compatibility purposes.
This commit adds a simple function for generating the Arn string,
and also includes it in DescribeTable result under the TableArn attribute.
Refs #5066
Add a name parameter to the validator, so that the validator can be
identified in log messages. Schema identity information is added to the
name automatically. This should help pinpoint the problematic place
where validation failed.
Although at the moment we have a single validator, it still benefits
from having a name, as we can now include in it the name of the sstable
being written and hence trace the source of the bad data.
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200117150616.895878-1-bdenes@scylladb.com>
Update Packer to 1.5.1.
Needed to rename clean_ami_name -> clean_resource_name on scylla.json, since
the variable name had been changed.
Also fixed checksum verification code, trimmed unwanted extra strings
from sha256sum output.
Since we need to run relocate_python_scripts.py on install time,
python script may not able to run on various different environment.
So convert the script to bash script, merge it into install.sh.
When a node does not have gossip STATUS application_state, we currently
use an empty string to present such state in get_gossip_status.
It is better to use an explicit "UNKNOWN" to present it. It makes the
log easier to understand when the status is unknown.
Before:
'gossip - InetAddress n2 is now UP, status ='
After:
'gossip - InetAddress n2 is now UP, status = UNKNOWN'
This patch is safe because the STATUS_UNKNOWN is never sent over the
cluster. So the presentation is only internal to the node.
Fixes#5520
The atomic_cell pretty printers use a mix of commas and semicolons.
This change makes them use commas everywhere, for consistency.
Message-Id: <20200116133327.2610280-1-avi@scylladb.com>
Merged pull request https://github.com/scylladb/scylla/pull/5567
from Calle Wilund:
Fixes#5314
Instead of tying CDC handling into cql statement objects, this patch set
moves it to storage proxy, i.e. shared code for mutating stuff. This means
we automatically handle cdc for code paths outside cql (i.e. alternator).
It also adds api handling (though initially inefficient) for batch statements.
CDC is tied into storage proxy by giving the former a ref to the latter (per
shard). Initially this is not a constructor parameter, because right now we
have chicken and egg issues here. Hopefully, Pavels refactoring of migration
manager and notifications will untie these and this relationship can become
nicer.
The actual augmentation can (as stated above) be made much more efficient.
Hopefully, the stream management refactoring will deal with expensive stream
lookup, and eventually, we can maybe coalesce pre-image selects for batches.
However, that is left as an exercise for when deemed needed.
The augmentation API has an optional return value for a "post-image handler"
to be used iff returned after mutation call is finished (and successful).
It is not yet actually invoked from storage_proxy, but it is at least in the
call chain.
The set make dependencies between mm and other services cleaner,
in particular, after the set:
- the query processor no longer needs migration manager
(which doesn't need query processor either)
- the database no longer needs migration manager, thus the mutual
dependency between these two is dropped, only migration manager
-> database is left
- the migration manager -> storage_service dependency is relaxed,
one more patchset will be needed to remove it, thus dropping one
more mutual dependency between them, only the storage_service
-> migration manager will be left
- the migration manager is stopped on drain, but several more
services need it on stop, thus causing use after free problems,
in particular there's a caught bug when view builder crashes
when unregistering from notifier list on stop. Fixed.
Tests: unit(dev)
Fixes: #5404
Enabling asan enables a few cleanup optimizations in gcc. The net
result is that using
-fsanitize=address -fno-sanitize-address-use-after-scope
Produces code that uses a lot less stack than if the file is compiled
with just -O0.
This patch adds -O1 in addition to
-fno-sanitize-address-use-after-scope to protect the unfortunate
developer that decides to build in dev mode with --cflags='-O0 -g'.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200116012318.361732-2-espindola@scylladb.com>
It is sometimes convenient to build with flags that don't match any
existing mode.
Recently I was tracking a bug that would not reproduce with debug, but
reproduced with dev, so I tried debugging the result of
./configure.py --cflags="-O0 -g"
While the binary had debug info, it still had optimizations because
configure.py put the mode flags after the user flags (-O0 -O1). This
patch flips the order (-O1 -O0) so that the flags passed in the
command line win.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200116012318.361732-1-espindola@scylladb.com>
CQL transport code relies on an exception's C++ type to create correct
reply, but in lwt we converted some mutation_timeout exceptions to more
generic request_timeout while forwarding them which broke the protocol.
Do not drop type information.
Fixes#5598.
Message-Id: <20200115180313.GQ9084@scylladb.com>
Murmur3 is the default partitioner.
ByteOrder and Random are the deprecated ones
and should be mentioned in the description.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Merged pull request https://github.com/scylladb/scylla/pull/5294 from
Amnon Heiman:
To use a snapshot we need a schema file that is similar to the result of
running cql DESCRIBE command.
The DESCRIBE is implemented in the cql driver so the functionality needs
to be re-implemented inside scylla.
This series adds a describe method to the schema file and use it when doing
a snapshot.
There are different approach of how to handle materialize views and
secondary indexes.
This implementation creates each schema.cql file in its own relevant
directory, so the schema for materializing view, for example, will be
placed in the snapshot directory of the table of that view.
Fixes#4192
This commit makes most sleeps in gossip.cc abortable. It is now possible
to quickly shut down a node during startup, most notably during the
phase while it waits for gossip to settle.
This reduces network traffic and eliminates time for installation when
building packages from the frozen toolchain, as well as isolating the
build from updates to those dependencies which may cause breakage.
This patch set adds support for CQL tests to test.py,
as well as many other improvements:
* --name is now a positional argument
* test output is preserved in testlog/${mode}
* concise output format
* better color support
* arbitrary number of test suites
* per-suite yaml-based configuration
* options --jenkins and --xunit are removed and xml
files are generated for all runs
A simple driver is written in C++ to read CQL for
standard input, execute in embedded mode and produce output.
The patch is checked with BYO.
Reviewed-by: Dejan Mircevski <dejan@scylladb.com>
* 'test.py' of github.com:/scylladb/scylla-dev: (39 commits)
test.py: introduce BoostTest and virtualize custom boost arguments
test.py: sort tests within a suite, and sort suites
test.py: add a basic CQL test
test.py: add CQL .reject files to gitignore
test.py: print a colored unidiff in case of test failure
test.py: add CqlTestSuite to run CQL tests
test.py: initial import of CQL test driver, cql_repl
test.py: remove custom colors and define a color palette
test.py: split test output per test mode
test.py: remove tests_to_run
test.py: virtualize Test.run(), to introduce CqlTest.Run next
test.py: virtualize test search pattern per TestSuite
test.py: virtualize write_xunit_report()
test.py: ensure print_summary() is agnostic of test type
test.py: tidy up print_summary()
test.py: introduce base class Test for CQL and Unit tests
test.py: move the default arguments handling to UnitTestSuite
test.py: move custom unit test command line arguments to suite.yaml
test.py: move command line argument processing to UnitTestSuite
test.py: introduce add_test(), which is suite-specific
...
The Ubuntu-based Docker image uses Scylla 1.0 and has not been updated
since 2017. Let's remove it as unmaintained.
Message-Id: <20200115102405.23567-1-penberg@scylladb.com>
"
Currently commitlog supports two modes of operation. First is 'periodic'
mode where all commitlog writes are ready the moment they are stored in
a memory buffer and the memory buffer is flushed to a storage periodically.
Second is a 'batch' mode where each write is flushed as soon as possible
(after previous flush completed) and writes are only ready after they
are flushed.
The first option is not very durable, the second is not very efficient.
This series adds an option to mark some writes as "more durable" in
periodic mode meaning that they will be flushed immediately and reported
complete only after the flush is complete (flushing a durable write also
flushes all writes that came before it). It also changes paxos to use
those durable writes to store paxos state.
Note that strictly speaking the last patch is not needed since after
writing to an actual table the code updates paxos table and the later
uses durable writes that make sure all previous writes are flushed. Given
that both writes supposed to run on the same shard this should be enough.
But it feels right to make base table writes durable as well.
"
* 'gleb/commilog_sync_v4' of github.com:scylladb/seastar-dev:
paxos: immediately sync commitlog entries for writes made by paxos learn stage
paxos: mark paxos table schema as "always sync"
schema: allow schema to be marked as 'always sync to commitlog'
commitlog: add test for per entry sync mode
database: pass sync flag from db::apply function to the commitlog
commitlog: add sync method to entry_writer
Before this patch result_set_assertions was handling both null values
and missing values in the same way.
This patch changes the handling of missing values so that now checking
for a null value is not the same as checking for a value not being
present.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200114184116.75546-1-espindola@scylladb.com>
3ec889816 changed cell::make_collection() to take different code paths
depending whether its `data` argument is nothrow copyable/movable or
not. In case it is not, it is wrapped in a view to make it so (see the
above mentioned commit for a full explanation), relying on the methods
pre-existing requirement for callers to keep `data` alive while the
created writer is in use.
On closer look however it turns out that this requirement is neither
respected, nor enforced, at least not on the code level. The real
requirement is that the underlying data represented by `data` is kept
alive. If `data` is a view, it is not expected to be kept alive and
callers don't, it is instead copied into `make_collection()`.
Non-views however *are* expected to be kept alive. This makes the API
error prone.
To avoid any future errors due to this ambiguity, require all `data`
arguments to be nothrow copyable and movable. Callers are now required
to pass views of nonconforming objects.
This patch is a usability improvement and is not fixing a bug. The
current code works as-is because it happens to conform to the underlying
requirements.
Refs: #5575
Refs: #5341
Tests: unit(dev)
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200115084520.206947-1-bdenes@scylladb.com>
This patch adds tests for the describe method.
test_describe_simple_schema tests regular tables.
test_describe_view_schema tests view and index.
Each test, create a table, find the schema, call the describe method and
compare the results to the string that was used to create the table.
The view tests also verify that adding an index or view does not change
the base table.
When comparing results, leading and trailing white spaces are ignored
and all combination of whitespaces and new lines are treated equaly.
Additional tests may be added at a future phase if required.
Signed-off-by: Amnon Heiman <amnon@scylladb.com>
When creating a snapshot we need to add a schema.cql file in the
snapshot directory that describes the table in that snapshot.
This patch adds the file using the schema describe method.
get_snapshot_details and manifest_json_filter were modified to ignore
the schema.cql file.
Fixes#4192
Signed-off-by: Amnon Heiman <amnon@scylladb.com>
This patch adds a describe method to a table schema.
It acts similar to a DESCRIBE cql command that is implemented in a CQL
driver.
The method supports tables, secondary indexes local indexes and
materialize views.
relates to: #4192
Signed-off-by: Amnon Heiman <amnon@scylladb.com>
index_name_from_table_name is a reverse of index_table_name,
it gets a table name that was generated for an index and return the name
of the index that generated that table.
Relates to #4192
Signed-off-by: Amnon Heiman <amnon@scylladb.com>
The factory is purely a state-less thing, there is no difference what
instance of it to use, so we may omit referencing the storage_service
in passive_announce
This is 2nd simple migration_manager -> storage_service link to cut
(more to come later).
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
There are several places where migration_manager needs storage_service
reference to get the database from, thus forming the mutual dependency
between them. This is the simplest case where the migration_manager
link to the storage_service can be cut -- the databse reference can be
obtained from storage_proxy instead.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
This is the last place where database code needs the migration_manager
instance to be alive, so now the mutual dependency between these two
is gone, only the migration_manager needs the database, but not the
vice-versa.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The storage_server needs migration_manager for notifications and
carefully handles the manager's stop process not to demolish the
listeners list from under itself. From now on this dependency is
no longer valid (however the storage_service seems still need the
migration_manager, but this is different story).
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
This patch removes an implicit cql_server -> migration_manager
dependency, as the former's event notifier uses the latter
for notifications.
This dependency also breaks a loop:
storage_service -> cql_server -> migration_manager -> storage_service
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
This patch breaks one (probably harmless but still) dependency
loop. The query_processor -> migration_manager -> storage_proxy
-> tracing -> query_processor.
The first link is not not needed, as the query_processor needs the
migration_manager purely to (ub)subscribe on notifications.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The same as with view builder. The constructor still needs both,
but the life-time reference is now for notifier only.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The migration manager itself is still needed on start to wait
for schema agreement, but there's no longer the need for the
life-time reference on it.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Do not call for local migration manager instance to send notifications,
call for the local migration notifier, it will always be alive.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The storage service will need this guy to initialize sub-services
with. Also it registers itself with notifiers.
That said, it's convenient to have the migration notifier on board.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The _listeners list on migration_manager class and the corresponding
notify_xxx helpers have nothing to do with the its instances, they
are just transport for notification delivery.
At the same time some services need the migration manager to be alive
at their stop time to unregister from it, while the manager itself
may need them for its needs.
The proposal is to move the migration notifier into a complete separate
sharded "service". This service doesn't need anything, so it's started
first and stopped last.
While it's not effectively a "migration" notifier, we inherited the name
from Cassandra and renaming it will "scramble neurons in the old-timers'
brains but will make it easier for newcomers" as Avi says.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The comparator is refreshed to ensure the following:
- null compares less to all other types;
- null, true and false are comparable against each other,
while other types are only comparable against themselves and null.
Comparing mixed types is not currently reachable from the alternator
API, because it's only used for sets, which can only use
strings, binary blobs and numbers - thus, no new pytest cases are added.
Fixes#5454
When counter mutation is about to be sent, a leader is elected, but
if the leader fails after election, we get `rpc::closed_error`. The
exception propagates high up, causing all connections to be dropped.
This patch intercepts `rpc::closed_error` in `storage_proxy::mutate_counters`
and translates it to `mutation_write_failure_exception`.
References #2859
Run the test and compare results. Manage temporary
and .reject files.
Now that there are CQL tests, improve logging.
run_test success no longer means test success.
cql_repl is a simple program which reads CQL from stdin,
executes it, and writes results to stdout.
It support --input, --output and --log options.
--log is directed to cql_test.log by default.
--input is stdin by default
--output is stdout by default.
The result set output is print with a basic
JSON visitor.
Store test temporary files and logs in ${testdir}/${mode}.
Remove --jenkins and --xunit, and always write XML
files at a predefined location: ${testdir}/${mode}/xml/.
Use .xunit.xml extension for tests which XML output is
in xunit format, and junit.xml for an accumulated output
of all non-boost tests in junit format.
Load the command line arguments, if any, from suite.yaml, rather
than keep them hard-coded in test.py.
This is allows operations team to have easier access to these.
Note I had to sacrifice dynamic smp count for mutation_reader_test
(the new smp count is fixed at 3) since this is part
of test configuration now.
This way we can avoid iterating over all tests
to handle --repeat.
Besides, going forward the tests will be stored
in two places: in the global list of all tests,
for the runner, and per suite, for suite-based
reporting, so it's easier if TestSuite
if fully responsible for finding and adding tests.
Scan entire test/ for folders that contain suite.yaml,
and load tests from these folders. Skip the rest.
Each folder with a suite.yaml is expected to have a valid
suite configuration in the yaml file.
A suite is a folder with test of the same type. E.g.
it can be a folder with unit tests, boost tests, or CQL
tests.
The harness will use suite.yaml to create an appropriate
suite test driver, to execute tests in different formats.
It reduces the number of configurations to re-test when test.py is
modified. and simplifies usage of test.py in build tools, since you no
longer need to bother with extra arguments.
Going forward I'd like to make terminal output brief&tabular,
but some test details are necessary to preserve so that a failure
is easy to debug. This information now goes to the log file.
- open and truncate the log file on each harness start
- log options of each invoked test in the log, so that
a failure is easy to reproduce
- log test result in the log
Since tests are run concurrently, having an exact
trace of concurrent execution also helps
debugging flaky tests.
The storage_service struct is a collection of diverse things,
most of them requiring only on start and on stop and/or runing
on shard 0 (but is nonetheless sharded).
As a part of clearing this structure and generated by it inter-
-componenes dependencies, here's the sanitation of load_broadcaster.
Fixes#5582
... but only populate log on shard 0.
Migration manager callbacks are slightly assymetric. Notifications
for pre-create/update mutations are sent only on initiating shard
(neccesary, because we consider the mutations mutable).
But "created" callbacks are sent on all shards (immutable).
We must subscribe on all shards, but still do population of cdc table
only once, otherwise we can either miss table creat or populate
more than once.
v2:
- Add test case
Message-Id: <20200113140524.14890-1-calle@scylladb.com>
* seastar 36cf5c5ff0...3f3e117de3 (16):
> memcached: don't use C++17-only std::optional
> reactor: Comment why _backend is assigned in constructor body
> log: restore --log-to-stdout for backward compatibility
> used_size.hh: Include missing headers
> core: Move some code from reactor.cc to future.cc
> future-util: move parallel_for_each to future-util.cc
> task: stop wrapping tasks with unique_ptr
> Merge "Setup timer signal handler in backend constructor" from Pavel
Fixes#5524
> future: avoid a branch in future's move constructor if type is trivial
> utils: Expose used_size
> stream: Call get_future early
> future-util: Move parallel_for_each_state code to a .cc
> memcached: log exceptions
> stream: Delete dead code
> core: Turn pollable_fd into a simple proxy over pollable_fd_state.
> Merge "log to std::cerr" from Benny
This is the part of de-bloating storage_service.
The field in question is used to temporary keep the _token_metadata
value during shard-wide replication. There's no need to have it as
class member, any "local" copy is enough.
Also, as the size of token_metadata is huge, and invoke_on_all()
copies the function for each shard, keep one local copy of metadata
using do_with() and pass it into the invoke_on_all() by reference.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Reviewed-by: Asias He <asias@scylladb.com>
Message-Id: <20200113171657.10246-1-xemul@scylladb.com>
The query option always_return_static_content was added for lightweight
transations in commits e0b31dd273 (infrastructure) and 65b86d155e
(actual use). However, the flag was added unconditionally to
update_parameters::options. This caused it to be set for list
read-modify-write operations, not just for lightweight transactions.
This is a little wasteful, and worse, it breaks compatibility as old
nodes do not understand the always_return_static_content flag and
complain when they see it.
To fix, remove the always_return_static_content from
update_parameters::options and only set it from compare-and-swap
operations that are used to implement lightweight transactions.
Fixes#5593.
Reviewed-by: Gleb Natapov <gleb@scylladb.com>
Message-Id: <20200114135133.2338238-1-avi@scylladb.com>
The drain_in_progress variable here is the future that's set by the
drain() operation itself. Its promise is set when the drain() finishes.
The check for this future in the beginning of drain() is pointless.
No two drain()-s can run in parallels because of run_with_api_lock()
protection. Doing the 2nd drain after successfull 1st one is also
impossible due to the _operation_mode check. The 2nd drain after
_exceptioned_ (and thus incomplete) 1st one will deadlock, after
this patch will try to drain for the 2nd time, but that should by ok.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20200114094724.23876-1-xemul@scylladb.com>
This change introduces system.clients table, which provides
information about CQL clients connected.
PK is the client's IP address, CK consists of outgoing port number
and client_type (which will be extended in future to thrift/alternator/redis).
Table supplies also shard_id and username. Other columns,
like connection_stage, driver_name, driver_version...,
are currently empty but exist for C* compatibility and future use.
This is an ordinary table (i.e. non-virtual) and it's updated upon
accepting connections. This is also why C*'s column request_count
was not introduced. In case of abrupt DB stop, the table should not persist,
so it's being truncated on startup.
Resolves#4820
"
Most of the code in `cell` and the `imr` infrastructure it is built on
is `noexcept`. This means that extra care must be taken to avoid rouge
exceptions as they will bring down the node. The changes introduced by
0a453e5d3a did just that - introduced rouge `std::bad_alloc` into this
code path by violating an undocumented and unvalidated assumption --
that fragment ranges passed to `cell::make_collection()` are nothrow
copyable and movable.
This series refactors `cell::make_collection()` such that it does not
have this assumption anymore and is safe to use with any range.
Note that the unit test included in this series, that was used to find
all the possible exception sources will not be currently run in any of
our build modes, due to `SEASTAR_ENABLE_ALLOC_FAILURE_INJECTION` not
being set. I plan to address this in a followup because setting this
flags fails other tests using the failure injection mechanism. This is
because these tests are normally run with the failure injection disabled
so failures managed to lurk in without anyone noticing.
Fixes: #5575
Refs: #5341
Tests: unit(dev, debug)
"
* 'data-cell-make-collection-exception-safety/v2' of https://github.com/denesb/scylla:
test: mutation_test: add exception safety test for large collection serialization
data/cell.hh: avoid accidental copies of non-nothrow copiable ranges
utils/fragment_range.hh: introduce fragment_range_view
We do not yet support the ScanIndexForward=false option for reversing
the sort order of a Query operation, as reported in issue #5153.
But even before implementing this feature, it is important that we
produce an error if a user attempts to use it - instead of outright
ignoring this parameter and giving the user wrong results. This is
what this patch does.
Before this patch, the reverse-order query in the xfailing test
test_query.py::test_query_reverse seems to succeed - yet gives
results in the wrong order. With this patch, the query itself fails -
stating that the ScanIndexForward=false argument is not supported.
Refs #5153
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200105113719.26326-1-nyh@scylladb.com>
Here's another theoretical problem, that involves 3 sequential calls
to respectively removenode, force_removenode and some other operation.
Let's walk through them
First goes the removenode:
run_with_api_lock
_operation_in_progress = "removenode"
storage_service::remove_node
sleep in replicating_nodes.empty() loop
Now the force_removenode can run:
run_with_no_api_lock
storage_service::force_removenode
check _operation_in_progress (not empty)
_force_remove_completion = true
sleep in _operation_in_progress.empty loop
Now the 1st call wakes up and:
if _force_remove_completion == true
throw <some exception>
.finally() handler in run_with_api_lock
_operation_in_progress = <empty>
At this point some other operation may start. Say, drain:
run_with_api_lock
_operation_in_progress = "drain"
storage_service::drain
...
go to sleep somewhere
No let's go back to the 1st op that wakes up from its sleep.
The code it executes is
while (!ss._operation_in_progress.empty()) {
sleep_abortable()
}
and while the drain is running it will never exit.
However (! and this is the core of the race) should the drain
operation happen _before_ the force_removenode, another check
for _operation_in_progress would have made the latter exit with
the "Operation drain is in progress, try again" message.
Fix this inconsistency by making the check for current operation
every wake-up from the sleep_abortable.
Fixes#5591
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Here's a theoretical problem, that involves 3 sequential calls
to respectively removenode, force_removenode and removenode (again)
operations. Let's walk through them
First goes the removenode:
run_with_api_lock
_operation_in_progress = "removenode"
storage_service::remove_node
sleep in replicating_nodes.empty() loop
Now the force_removenode can run:
run_with_no_api_lock
storage_service::force_removenode
check _operation_in_progress (not empty)
_force_remove_completion = true
sleep in _operation_in_progress.empty loop
Now the 1st call wakes up and:
if _force_remove_completion == true
_force_remove_completion = false
throw <some exception>
.finally() handler in run_with_api_lock
_operation_in_progress = <empty>
! at this point we have _force_remove_completion = false and
_operation_in_progress = <empty>, which opens the following
opportunity for the 3d removenode:
run_with_api_lock
_operation_in_progress = "removenode"
storage_service::remove_node
sleep in replicating_nodes.empty() loop
Now here's what we have in 2nd and 3rd ops:
1. _operation_in_progress = "removenode" (set by 3rd) prevents the
force_removenode from exiting its loop
2. _force_remove_completion = false (set by 1st on exit) prevents
the removenode from waiting on replicating_nodes list
One can start the 4th call with force_removenode, it will proceed and
wake up the 3rd op, but after it we'll have two force_removenode-s
running in parallel and killing each other.
I propose not to set _force_remove_completion to false in removenode,
but just exit and let the owner of this flag unset it once it gets
the control back.
Fixes#5590
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Other types do not have a wider accumulator at the moment.
And static_cast<accumulator_type>(ret) != _sum evaluates as
false for NaN/Inf floating point values.
Fixes#5586
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20200112183436.77951-1-bhalevy@scylladb.com>
Now that atomic_cell_view and collection_mutation_view have
type-aware printers, we can use them in the type-aware atomic_cell_or_collection
printer.
Message-Id: <20191231142832.594960-1-avi@scylladb.com>
Merged pull request https://github.com/scylladb/scylla/pull/5533
from Avi Kivity:
canonical_mutation objects are used for schema reconciliation, which is a
fragile area and thus deserves some debugging help.
This series makes canonical_mutation objects printable.
Merged patch series from Piotr Sarna:
"Previous assumption was that there can only be one regular base column
in the view key. The assumption is still correct for tables created
via CQL, but it's internally possible to create a view with multiple
such columns - the new assumption is that if there are multiple columns,
they share their liveness.
This series is vital for indexing to work properly on alternator,
so it would be best to solve the issue upstream. I strived to leave
the existing semantics intact as long as only up to one regular
column is part of the materialized view primary key, which is the case
for Scylla's materialized views. For alternator it may not be true,
but all regular columns in alternator share liveness info (since
alternator does not support per-column TTL), which is sufficient
to compute view updates in a consistent way.
Fixes#5006
Tests: unit(dev), alternator(test_gsi_update_second_regular_base_column, tic-tac-toe demo)"
Piotr Sarna (3):
db,view: fix checking if partition key is empty
view: handle multiple regular base columns in view pk
test: add a case for multiple base regular columns in view key
alternator-test/test_gsi.py | 1 -
view_info.hh | 5 +-
cql3/statements/alter_table_statement.cc | 2 +-
db/view/view.cc | 77 ++++++++++++++----------
mutation_partition.cc | 2 +-
test/boost/cql_query_test.cc | 58 ++++++++++++++++++
6 files changed, 109 insertions(+), 36 deletions(-)
Merged patch series from Gleb Natapov:
"LWT is much more efficient if a request is processed on a shard that owns
a token for the request. This is because otherwise the processing will
bounce to an owning shard multiple times. The patch proposes a way to
move request to correct shard before running lwt. It works by returning
an error from lwt code if a shard is incorrect one specifying the shard
the request should be moved to. The error is processed by the transport
code that jumps to a correct shard and re-process incoming message there.
The nicer way to achieve the same would be to jump to a right shard
inside of the storage_proxy::cas(), but unfortunately with current
implementation of the modification statements they are unusable by
a shard different from where it was created, so the jump should happen
before a modification statement for an cas() is created. When we fix our
cql code to be more cross-shard friendly this can be reworked to do the
jump in the storage_proxy."
Gleb Natapov (4):
transport: change make_result to takes a reference to cql result
instead of shared_ptr
storage_service: move start_native_transport into a thread
lwt: Process lwt request on a owning shard
lwt: drop invoke_on in paxos_state prepare and accept
auth/service.hh | 5 +-
message/messaging_service.hh | 2 +-
service/client_state.hh | 30 +++-
service/paxos/paxos_state.hh | 10 +-
service/query_state.hh | 6 +
service/storage_proxy.hh | 2 +
transport/messages/result_message.hh | 20 +++
transport/messages/result_message_base.hh | 4 +
transport/request.hh | 4 +
transport/server.hh | 25 ++-
cql3/statements/batch_statement.cc | 6 +
cql3/statements/modification_statement.cc | 6 +
cql3/statements/select_statement.cc | 8 +
message/messaging_service.cc | 2 +-
service/paxos/paxos_state.cc | 48 ++---
service/storage_proxy.cc | 47 ++++-
service/storage_service.cc | 120 +++++++------
test/boost/cql_query_test.cc | 1 +
thrift/handler.cc | 3 +
transport/messages/result_message.cc | 5 +
transport/server.cc | 203 ++++++++++++++++------
21 files changed, 377 insertions(+), 180 deletions(-)
Use `seastar::memory::local_failure_injector()` to inject al possible
`std::bad_alloc`:s into the collection serialization code path. The test
just checks that there are no `std::abort()`:s caused by any of the
exceptions.
The test will not be run if `SEASTAR_ENABLE_ALLOC_FAILURE_INJECTION` is
not defined.
`cell::make_collection()` assumes that all ranges passed to it are
nothrow copyable and movable views. This is not guaranteed, is not
expressed in the interface and is not mentioned in the comments either.
The changes introduced by 0a453e5d3a to collection serialization, making
it use fragmented buffers, fell into this trap, as it passes
`bytes_ostream` to `cell::make_collection()`. `bytes_ostream`'s copy
constructor allocates and hence can throw, triggering an
`std::terminate()` inside `cell::make_collection()` as the latter is
noexcept.
To solve this issue, non-nothrow copyable and movable ranges are now
wrapped in a `fragment_range_view` to make them so.
`cell::make_collection()` already requires callers to keep alive the
range for the duration of the call, so this does not introduce any new
requirements to the callers. Additionally, to avoid any future
accidents, do not accept temporaries for the `data` parameter. We don't
ever want to move this param anyway, we will either have a trivially
copyable view, or a potentially heavy-weight range that we will create a
trivially copyable view of.
A lightweight, trivially copyable and movable view for fragment ranges.
Allows for uniform treatment of all kinds of ranges, i.e. treating all
of them as a view. Currently `fragment_range.hh` provides lightweight,
view-like adaptors for empty and single-fragment ranges (`bytes_view`). To
allow code to treat owning multi-fragment ranges the shame way as the
former two, we need a view for the latter as well -- this is
`fragment_range_view`.
Resolves#4820. Execution path in main.cc now cleans up system.clients
table if it exists (this is done on startup). Also, server.cc now calls
functions that notify about cql clients connecting/disconnecting.
This simplifies the storage_service API and fixes the
complain about shared_ptr usage instead of unique_ptr.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
There's a lonely get_load_map() call on storage_service that
needs only load broadcaster, always runs on shard 0 and that's it.
Next patch will move this whole stuff into its own helper no-shard
container and this is preparation for this.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Since lwt requests are now running on an owning shard there is no longer
a need to invoke cross shard call on paxos_state level. RPC calls may
still arrive to a wrong shard so we need to make cross shard call there.
LWT is much more efficient if a request is processed on a shard that owns
a token for the request. This is because otherwise the processing will
bounce to an owning shard multiple times. The patch proposes a way to
move request to correct shard before running lwt. It works by returning
an error from lwt code if a shard is incorrect one specifying the shard
the request should be moved to. The error is processed by transport code
that jumps to a correct shard and re-process incoming message there.
"
The original fix (10f6b125c8) didn't
take into account that if there was a failed memtable flush (Refs
flush) but is not a flushable memtable because it's not the latest in
the memtable list. If that happens, it means no other memtable is
flushable as well, cause otherwise it would be picked due to
evictable_occupancy(). Therefore the right action is to not flush
anything in this case.
Suspected to be observed in #4982. I didn't manage to reproduce after
triggering a failed memtable flush.
Fixes#3717
"
* tag 'avoid-ooming-with-flush-continuations-v2' of github.com:tgrabiec/scylla:
database: Avoid OOMing with flush continuations after failed memtable flush
lsa: Introduce operator bool() to occupancy_stats
lsa: Expose region_impl::evictable_occupancy in the region class
"
Fix overflow handling in sum() and avg().
sum:
- aggregated into __int128
- detect overflow when computing result and log a warning if found
avg:
- fix division function to divide the accumulator type _sum (__int128 for integers) by _count
Add unit tests for both cases
Test:
- manual test against Cassandra 3.11.3 to make sure the results in the scylla unit test agree with it.
- unit(dev), cql_query_test(debug)
Fixes#5536
"
* 'cql3-sum-overflow' of https://github.com/bhalevy/scylla:
test: cql_query_test: test avg overflow
cql3: functions: protect against int overflow in avg
test: cql_query_test: test sum overflow
cql3: functions: detect and handle int overflow in sum
exceptions: sort exception_code definitions
exceptions: define additional cassandra CQL exceptions codes
"
We were failing to start a thread when the UDF call was nested in an
aggregate function call like SUM.
"
* 'espindola/fix-sum-of-udf' of https://github.com/espindola/scylla:
cql3: Fix indentation
cql3: Add missing with_thread_if_needed call
cql3: Implement abstract_function_selector::requires_thread
remove make_ready_future call
This was initialized to api::missing_timestamp but
should be set to either a client provided-timestamp or
the server's.
Unlike write operations, this timestamp need not be unique
as the one generated by client_state::get_timestamp.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20200108074021.282339-2-bhalevy@scylladb.com>
exec->_cmd->read_timestamp may be initialized by default to api::min_timestamp,
causing:
service/storage_proxy.cc:3328:116: runtime error: signed integer overflow: 1577983890961976 - -9223372036854775808 cannot be represented in type 'long int'
Aborting on shard 1.
Do not optimize cross-dc repair if read_timestamp is missing (or just negative)
We're interested in reads that happen within write_timeout of a write.
Fixes#5556
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20200108074021.282339-1-bhalevy@scylladb.com>
The cdc service is assigned from outside, post construction, mainly
because of the chickens and eggs in main startup. Would be nice to
have it unconditionally, but this is workable.
It is (and shall) only be called from inside storage proxy,
and we would like this to be reflected in the interface
so our eventual moving of cdc logic into the mutate call
chains become easier to verify and comprehend.
To eventually replace the free function.
Main difference is this is build to both handle batches correctly
and to eventually allow hanging cdc object on storage proxy,
and caches on the cdc object.
The test case checks that having two base regular columns
in the materialized view key (not obtainable via CQL),
still works fine when values are inserted or deleted.
If TTL was involved and these columns would have different expiration
rules, the case would be more complicated, but it's not possible
for a user to reach that case - neither with CQL, nor with alternator.
Previous assumption was that there can only be one regular base column
in the view key. The assumption is still correct for tables created
via CQL, but it's internally possible to create a view with multiple
such columns - the new assumption is that if there are multiple columns,
they share their liveness.
This patch is vital for indexing to work properly on alternator,
so it would be best to solve the issue upstream. I strived to leave
the existing semantics intact as long as only up to one regular
column is part of the materialized view primary key, which is the case
for Scylla's materialized views. For alternator it may not be true,
but all regular columns in alternator share liveness info (since
alternator does not support per-column TTL), which is sufficient
to compute view updates in a consistent way.
Fixes#5006
Tests: unit(dev), alternator(test_gsi_update_second_regular_base_column, tic-tac-toe demo)
Message-Id: <c9dec243ce903d3a922ce077dc274f988bcf5d57.1567604945.git.sarna@scylladb.com>
Now that position_in_partition_view has type-aware printing, use it
to provide a human readable version of clustering keys.
Message-Id: <20191231151315.602559-2-avi@scylladb.com>
If the position_in_partition_view represents a clustering key,
we can now see it with the clustering key decoded according to
the schema.
Message-Id: <20191231151315.602559-1-avi@scylladb.com>
Previous implementation did not take into account that a column
in a partition key might exist in a mutation, but in a DEAD state
- if it's deleted. There are no regressions for CQL, while for
alternator and its capability of having two regular base columns
in a view key, this additional check must be performed.
This reduces code bloat and makes the code friendlier for IDEs, as the
IDE now understands the type of create_schema.
Message-Id: <20191231134803.591190-1-avi@scylladb.com>
sstables::write_simple() has quite a lot of boilerplate
which gets replicated into each template instance. Move
all of that into a non-template do_write_simple(), leaving
only things that truly depend on the component being written
in the template, and encapsulating them with a
noncopyable_function.
An explicit template instantiation was added, since this
is used in a header file. Before, it likely worked by
accident and stopped working when the template became
small enough to inline.
Tests: unit (dev)
Message-Id: <20200106135453.1634311-1-avi@scylladb.com>
mutation_partition_view now supports a compile-time resolved visitor.
This is performant but results in bloat when the performance is not
needed. Furthermore, the template function that applies the object
to the visitor is private and out-of-line, to reduce compile time.
To allow visitation on mutation_partition_view objects, add a virtual
visitor type and a non-template accept function.
Note: mutation_partition_visitor is very similar to the new type,
but different enough to break the template visitor which is used
to implement the new visitor.
The new visitor will be used to implement pretty printing for
canonical_mutation.
Consider this:
1) Write partition_start of p1
2) Write clustering_row of p1
3) Write partition_end of p1
4) Repair is stopped due to error before writing partition_start of p2
5) Repair calls repair_row_level_stop() to tear down which calls
wait_for_writer_done(). A duplicate partition_end is written.
To fix, track the partition_start and partition_end written, avoid
unpaired writes.
Backports: 3.1 and 3.2
Fixes: #5527
commit 21dec3881c introduced
a bug that will cause scylla debian build to fail. This is
because the commit relied on the environment PRODUCT variable
to be exported (and as a result, to propogate to the rename
command that is executed by find in a subshell)
This commit fixes it by explicitly passing the PRODUCT variable
into the rename command.
Signed-off-by: Eliran Sinvani <eliransin@scylladb.com>
Message-Id: <20200106102229.24769-1-eliransin@scylladb.com>
In scylla, the replacing node is set as HIBERNATE status. It is the only
place we use HIBERNATE status. The replacing node is supposed to be
alive and updating its heartbeat, so it is not supposed to be in dead
state.
This patch fixes the following problem in replacing.
1) start n1, n2
2) n2 is down
3) start n3 to replace n2, but kill n3 in the middle of the replace
4) start n4 to replace n2
After step 3 and step 4, the old n3 will stay in gossip forever until a
full cluster shutdown. Note n3 will only stay in gossip but in
system.peers table. User will see the annoying and infinite logs like on
all the nodes
rpc - client $ip_of_n3:7000: fail to connect: Connection refused
Fixes: #5449
Tests: replace_address_test.py + manual test
VERSION_ID of centos7 is "7", but VERSION_ID of oel7.7 is "7.7"
scylla_ntp_setup doesn't work on OEL7.7 for ValueError.
- ValueError: invalid literal for int() with base 10: '7.7'
This patch changed redhat_version() to return version string, and compare
with parse_version().
Fixes#5433
Signed-off-by: Amos Kong <amos@scylladb.com>
When the progress is queried, e.g., query from nodetool netstats
the progress info might not be updated yet.
Fix it by checking before access the map to avoid errors like:
std::out_of_range (_Map_base::at)
Fixes: #5437
Tests: nodetool_additional_test.py:TestNodetool.netstats_test
This depends on the just emailed fixes to undefined behavior in
tests. With this change we should quickly notice if a change
introduces undefined behavior.
Fixes#4054
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20191230222646.89628-1-espindola@scylladb.com>
Merged pull request https://github.com/scylladb/scylla/pull/5538 from
Avi Kivity and Piotr Jastrzębski.
This series prepares CDC for rolling upgrade. This consists of
reducing the footprint of cdc, when disabled, on the schema, adding
a cluster feature, and redacting the cdc column when transferring
it to other nodes. The latter is needed because we'll want to backport
this to 3.2, which doesn't have canonical_mutations yet.
If in memory buffer has not enough space for incoming mutation it is
written into a file, but the code missed updating timestamp of a last
sync, so we may sync to often.
Message-Id: <20200102155049.21291-9-gleb@scylladb.com>
The code that enters the gate never defers before leaving, so the gate
behaves like a flag. Lets use existing flag to prohibit adding data to a
closed segment.
Message-Id: <20200102155049.21291-8-gleb@scylladb.com>
Currently segment closing code is spread over several functions and
activated based on the _closed flag. Make segment closing explicit
by moving all the code into close() function and call it where _closed
flag is set.
Message-Id: <20200102155049.21291-6-gleb@scylladb.com>
Currently sync() does two completely different things based on the
shutdown parameter. Separate code into two different function.
Message-Id: <20200102155049.21291-3-gleb@scylladb.com>
The original "test_schema_digest_does_not_change" test case ensures
that schema digests will match for older nodes that do not support
all the features yet (including computed columns).
The additional case uses sstables generated after CDC was enabled
and a table with CDC enabled is created,
in order to make sure that the digest computed
including CDC column does not change spuriously as well.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Addition of cdc column in scylla_tables changes how schema
digests are calculated, and affect the ABI of schema update
messages (adding a column changes other columns' indexes
in frozen_mutation).
To fix this, extend the schema_tables mechanism with support
for the cdc column, and adjust schemas and mutations to remove
that column when sending schemas during upgrade.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
An empty cdc column in scylla_tables is hashed differently from
a missing column. This causes schema mismatch when a schema is
propagated to another node, because the other node will redact
the schema column completely if the cluster feature isn't enabled,
and an empty value is hashed differently from a missing value.
Store a tombstone instead. Tombstones are removed before
digesting, so they don't affect the outcome.
This change also undoes the changes in 386221da84 ("schema_tables:
handle 'cdc' options") to schema_change_test
test_merging_does_not_alter_tables_which_didnt_change. That change
enshrined the breakage into the test, instead of fixing the root cause,
which was that we added an an extra mutation to the schema (for
cdc options, which were disabled).
Different versions of boost have different rules for what conversions
from cpp_int to smaller intergers are allowed.
We already had a function that worked with all supported versions, but
it was not being use by lua.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200104041028.215153-1-espindola@scylladb.com>
I noticed this while looking at the crashes next is currently
experiencing.
While I have no idea if this fixes the issue, it does avoid broken
future warnings (for no_sharded_instance_exception) in a debug build.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200103201540.65324-1-espindola@scylladb.com>
* seastar 0525bbb08...36cf5c5ff (6):
> memcached: Fix use after free in shutdown
> Revert "task: stop wrapping tasks with unique_ptr"
> task: stop wrapping tasks with unique_ptr
> http: Change exception formating to the generic seastar one
> Merge "Avoid a few calls to ~exception_ptr" from Rafael
> tests: fix core generation with asan
This patch adds a very comprehensive test for the ConditionExpression
feature, i.e., the newer syntax of conditional writes replacing
the old-style "Expected" - for the UpdateItem, PutItem and DeleteItem
operations.
I wrote these tests while closely following the DynamoDB ConditionExpression
documentation, and attempted to cover all conceivable features, subfeatures
and subcases of the ConditionExpression syntax - to serve as a test for a
future support for this feature in Alternator (see issue #5053).
As usual, all these tests pass on AWS DynamoDB, but because we haven't yet
implemented this feature in Alternator, all but one xfail on Alternator.
Refs #5053.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20191229143556.24002-1-nyh@scylladb.com>
If Alternator is requested to be enabled on a specific port but the port is
already taken, the boot fails as expected - but the error log is confusing;
It currently looks something like this:
WARN 2019-12-24 11:22:57,303 [shard 0] alternator-server - Failed to set up Alternator HTTP server on 0.0.0.0 port 8000, TLS port 8043: std::system_error (error system:98, posix_listen failed for address 0.0.0.0:8000: Address already in use)
... (many more messages about the server shutting down)
INFO 2019-12-24 11:22:58,008 [shard 0] init - Startup failed: std::system_error (error system:98, posix_listen failed for address 0.0.0.0:8000: Address already in use)
There are two problems here. First, the "WARN" should really be an "ERROR",
because it causes the server to be shut down and the user must see this error.
Second, the final line in the log, something the user is likely to see first,
contains only the ultimate cause for the exception (an address already in use)
but not the information what this address was needed for.
This patch solves both issues, and the log now looks like:
ERROR 2019-12-24 14:00:54,496 [shard 0] alternator-server - Failed to set up Alterna
tor HTTP server on 0.0.0.0 port 8000, TLS port 8043: std::system_error (error system
:98, posix_listen failed for address 0.0.0.0:8000: Address already in use)
...
INFO 2019-12-24 14:00:55,056 [shard 0] init - Startup failed: std::_Nested_exception<std::runtime_error> (Failed to set up Alternator HTTP server on 0.0.0.0 port 8000, TLS port 8043): std::system_error (error system:98, posix_listen failed for address 0.0.0.0:8000: Address already in use)
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20191224124127.7093-1-nyh@scylladb.com>
We don't support yet the ReturnValues option on PutItem, UpdateItem or
DeleteItem operations (see issue #5053), but if a user tries to use such
an option anyway, we silently ignore this option. It's better to fail,
reporting the unsupported option.
In this patch we check the ReturnValues option and if it is anything but
the supported default ("NONE"), we report an error.
Also added a test to confirm this fix. The test verifies that "NONE" is
allowed, and something which is unsupported (e.g., "DOG") is not ignored
but rather causes an error.
Refs #5053.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20191216193310.20060-1-nyh@scylladb.com>
These are flags we always want to enable. In particular, we want them
to be used by the bots, but the bots run this script with
--configure-flags, so they were being discarded.
We put the user option later so that they can override the common
options.
Fixes#5505
Reviewed-by: Benny Halevy <bhalevy@scylladb.com>
Reviewed-by: Takuya ASADA <syuu@scylladb.com>
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
There is no requirement that all notes be placed in a single
PT_NOTE. It looks like recent lld's actually put each section in its
own PT_NOTE.
This change looks for build-id in all PT_NOTE headers.
Fixes#5525
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Reviewed-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20191227000311.421843-1-espindola@scylladb.com>
rpm compression uses xz, which is painfully slow. Adjust the
compression settings to run on all threads.
The xz utility documentation suggests that 0 threads is
equivalent to all CPUs, but apparently the library interface
(which rpmbuild uses) doesn't think the same way.
Message-Id: <20200101141544.1054176-1-avi@scylladb.com>
In the current code, support for case-sensitive (quoted) user-defined type
names is broken. For example, a test doing:
CREATE TYPE "PHone" (country_code int, number text)
CREATE TABLE cf (pk blob, pn "PHone", PRIMARY KEY (pk))
Fails - the first line creates the type with the case-sensitive name PHone,
but the second line wrongly ends up looking for the lowercased name phone,
and fails with an exception "Unknown type ks.phone".
The problem is in cql3_type_name_impl. This class is used to convert a
type object into its proper CQL syntax - for example frozen<list<int>>.
The problem is that for a user-defined type, we forgot to quote its name
if not lowercase, and the result is wrong CQL; For example, a list of
PHone will be written as list<PHone> - but this is wrong because the CQL
parser, when it sees this expression, lowercases the unquoted type name
PHone and it becomes just phone. It should be list<"PHone">, not list<PHone>.
The solution is for cql3_type_name_impl to use for a user-defined type
its get_name_as_cql_string() method instead of get_name_as_string().
get_name_as_cql_string() is a new method which prints the name of the
user type as it should be in a CQL expression, i.e., quoted if necessary.
The bug in the above test was apparently caused when our code serialized
the type name to disk as the string PHone (without any quoting), and then
later deserialized it using the CQL type parser, which converted it into
a lowercase phone. With this patch, the type's name is serialized as
"PHone", with the quotes, and deserialized properly as the type PHone.
While the extra quotes may seem excessive, they are necessary for the
correct CQL type expression - remember that the type expression may be
significantly more complex, e.g., frozen<list<"PHone">> and all of this,
including the quotes, is necessary for our parser to be able to translate
this string back into a type object.
This patch may cause breakage to existing databases which used case-
sensitive user-defined types, but I argue that these use cases were
already broken (as demonstrated by this test) so we won't break anything
that actually worked before.
Fixes#5544
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200101160805.15847-1-nyh@scylladb.com>
The class in question wants to run its own instances on different
shards, for this sake it keeps reference on sharded self to call
invoke_on() on. There's a handy peering_sharded_service<> in seastar
for the same, using it makes the code nicer and shorter.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20191226112401.23960-1-xemul@scylladb.com>
We had a lot of code in a .hh file, that while using templeates, was
only used from creating functions during startup.
This moves it to a new .cc file.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200101002158.246736-1-espindola@scylladb.com>
I can't quite figure out how we were trying to write a sstable with
the large data handler already stopped, but the backtrace suggests a
good place to add extra checks.
This patch adds two check. One at the start and one at the end of
sstable::write_components. The first one should give us better
backtraces if the large_data_handler is already stopped. The second
one should help catch some race condition.
Refs: #5470
Message-Id: <20191231173237.19040-1-espindola@scylladb.com>
The standard printer for atomic_cell prints the value as hex,
because atomic_cell does not include the type. Add a type-aware
printer that allows the user to provide the type.
When the product name is other than "scylla", the debian
packaging scripts go over all files that starts with "scylla-"
and change the prefix to be the actual product name.
However, if there are no such files in the directory
the script will fail since the renaming command will
get the wildcard string instrad of an actual file name.
This patch replaces the command with a command with
an equivalent desired effect that only operates on files
if there are any.
Signed-off-by: Eliran Sinvani <eliransin@scylladb.com>
Message-Id: <20191230143250.18101-1-eliransin@scylladb.com>
Since we merged /usr/lib/scylla with /opt/scylladb, we removed
/usr/lib/scylla and replace it with the symlink point to /opt/scylladb.
However, RPM does not support replacing a directory with a symlink,
we are doing some dirty hack using RPM scriptlet, but it causes
multiple issues on upgrade/downgrade.
(See: https://docs.fedoraproject.org/en-US/packaging-guidelines/Directory_Replacement/)
To minimize Scylla upgrading/downgrade issues on user side, it's better
to keep /usr/lib/scylla directory.
Instead of creating single symlink /usr/lib/scylla -> /opt/scylladb,
we can create symlinks for each setup scripts like
/usr/lib/scylla/<script> -> /opt/scylladb/scripts/<script>.
Fixes#5522Fixes#4585Fixes#4611
The '--builddir' option value is assigned to the "builddir" variable,
which is wrong. The correct variable is "BUILDDIR" so use that instead
to fix the '--builddir' option.
Also, add logging to the script when executing the "dist/redhat_build.rpm.sh"
script to simplify debugging.
Hit the following ubsan error with bootstrap_test:TestBootstrap.manual_bootstrap_test in debug mode:
service/storage_service.cc:3519:37: runtime error: load of value 190, which is not a valid value for type 'bool'
The use site is:
service::storage_service::is_cleanup_allowed(seastar::basic_sstring<char, unsigned int, 15u, true>)::{lambda(service::storage_service&)#1}::operator()(service::storage_service&) const at /local/home/bhalevy/dev/scylla/service/storage_service.cc:3519
While at it, initialize `_initialized` to false as well, just in case.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Avoid following UBSAN error:
repair/row_level.cc:2141:7: runtime error: load of value 240, which is not a valid value for type 'bool'
Fixes#5531
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
A Linux machine typically has multiple clocksources with distinct
performances. Setting a high-performant clocksource might result in
better performance for ScyllaDB, so this should be considered whenever
starting it up.
This patch introduces the possibility of enforcing optimized Linux
clocksource to Scylla's setup/start-up processes. It does so by adding
an interactive question about enforcing clocksource setting to scylla_setup,
which modifies the parameter "CLOCKSOURCE" in scylla_server configuration
file. This parameter is read by perftune.py which, if set to "yes", proceeds
to (non persistently) setting the clocksource. On x86, TSC clocksource is used.
Fixes#4474Fixes#5474Fixes#5480
Instances of `variable_specifications` are passed around as
shared_ptr's, which are redundant in this case since the class
is marked as `final`. Use `lw_shared_ptr` instead since we know
for sure it's not a polymorphic pointer.
Tests: unit(debug)
Signed-off-by: Pavel Solodovnikov <pa.solodovnikov@scylladb.com>
Message-Id: <20191225232853.45395-1-pa.solodovnikov@scylladb.com>
If any two directories of data/commitlog/hints/view_hints
are the same we still end up running verify_owner_and_mode
and disk_sanity(check_direct_io_support) in parallel
on the same directoriea and hit #5510.
This change uses std::set rather than std::vector to
collect a unique set of directories that need initialization.
Fixes#5510
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20191225160645.2051184-1-bhalevy@scylladb.com>
db::commitlog::segment::batch_cycle() assumes that after a write
for a certain position completes (as reported by
_pending_ops.wait_for_pending()) it will also be flushed, but this is
true only if writing and flushing are atomic wrt _pending_ops lock.
It usually is unless flush_after is set to false when cycle() is
called. In this case only writing is done under the lock. This
is exactly what happens when a segment is closed. Flush is skipped
because zero header is added after the last entry and then flushed, but
this optimization breaks batch_cycle() assumption. Fix it by flushing
after the write atomically even if a segment is being closed.
Fixes#5496
Message-Id: <20191224115814.GA6398@scylladb.com>
The hints and view_hints directory has per-shard sub-dirs,
and the directories code tries to create, check and lock
all of them, including the base one.
The manipulations in question are excessive -- it's enough
to check and lock either the base dir, or all the per-shard
ones, but not everything. Let's take the latter approach for
its simplicity.
Fixes#5510
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Looks-good-to: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20191223142429.28448-1-xemul@scylladb.com>
Pekka Enberg <penberg@scylladb.com> wrote:
> Image might not be present, but the subsequent "docker run" command will automatically pull it.
Just letting "docker run" fail produces kinda confusing error message,
referring to docker help, but the we want to provide the user
with our own help, so still fail early, just also try to pull the image
if "docker image inspect" failed, indicating it's not present locally.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20191223085219.1253342-4-bhalevy@scylladb.com>
Suggested-by: Pekka Enberg <penberg@scylladb.com>
> This will print all the available Docker images,
> many (most?) of them completely unrelated.
> Why not just print an error saying that no image was specified,
> and then perhaps print usage.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20191223085219.1253342-3-bhalevy@scylladb.com>
Add dbuild dependency on python3-colorama,
which will be used in test.py instead of a hand-made palette.
[avi: update tools/toolchain/image]
Message-Id: <20191223125251.92064-2-kostja@scylladb.com>
In this place we only need to know the number of endpoints,
while current code additionally shuffles them before counting.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
There are two _identical_ methods in token_metadata class:
get_all_endpoints_count() and number_of_endpoints().
The former one is used (called) the latter one is not used, so
let's remove it.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
This greatly helps to narrow down the source of schema digest mismatch
between nodes. Intented use is to enable this logger on disagreeing
nodes and trigger schema digest recalculation and observe which
mutations differ in digest and then examine their content.
Message-Id: <1574872791-27634-1-git-send-email-tgrabiec@scylladb.com>
In commit b463d7039c (repair: Introduce
get_combined_row_hash_response), working_row_buf_nr is returned in
REPAIR_GET_COMBINED_ROW_HASH in addition to the combined hash. It is
scheduled to be part of 3.1 release. However it is not backported to 3.1
by accident.
In order to be compatible between 3.1 and 3.2 repair. We need to drop
the working_row_buf_nr in 3.2 release.
Fixes: #5490
Backports: 3.2
Tests: Run repair in a mixed 3.1 and 3.2 cluster
Changes summary:
* make `cql3::result_set` movable-only
* change signature of `cql3::result::result_set` to return by cref
* adjust available call sites to the aforementioned method to accept cref
Motivation behind this change is elimination of dangerous API,
which can easily set a trap for developers who don't expect that
result_set would be returned by value.
There is no point in copying the `result_set` around, so make
`cql3::result::result_set` to cache `result_set` internally in a
`unique_ptr` member variable and return a const reference so to
minimize unnecessary copies here and there.
Tests: unit(debug)
Signed-off-by: Pavel Solodovnikov <pa.solodovnikov@scylladb.com>
Message-Id: <20191220115100.21528-1-pa.solodovnikov@scylladb.com>
So a higher level component using the validator to validate a stream can
catch only validation errors, and let any other incidental exception
through.
This allows building data correctors on top of the
`mutation_fragment_stream_validator`, by filtering a fragment stream
through a validator, catching invalid fragment stream exceptions and
dropping the respective fragments from the stream.
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20191220073443.530750-1-bdenes@scylladb.com>
This reverts commit 237ba74743. While it
works for the scylla executable, it fails for iotune, which is built
by seastar. It should be reinstated after we pass the correct link
parameters to the seastar build system.
"
These series solves an issue with scylla_setup and prevent it from
waiting forever if housekeeping cannot look for the new Scylla version.
Fixes#5302
It should be backported to versions that support offline installations.
"
* 'scylla_setup_timeout' of git://github.com/amnonh/scylla:
scylla_setup: do not wait forever if no reply is return housekeeping
scylla_util.py: Add optional timeout to out function
Having a long path allows patchelf to change the interpreter without
changing the PT_LOAD headers and therefore without moving the
build-id out of the first page.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20191213224803.316783-1-espindola@scylladb.com>
Suppose we have a multi-dc setup (e.g. 9 nodes distributed across
3 datacenters: [dc1, dc2, dc3] -> [3, 3, 3]).
When a query that uses LWT is executed with LOCAL_SERIAL consistency
level, the `storage_proxy::get_paxos_participants` function
incorrectly calculates the number of required participants to serve
the query.
In the example above it's calculated to be 5 (i.e. the number of
nodes needed for a regular QUORUM) instead of 2 (for LOCAL_SERIAL,
which is equivalent to LOCAL_QUORUM cl in this case).
This behavior results in an exception being thrown when executing
the following query with LOCAL_SERIAL cl:
INSERT INTO users (userid, firstname, lastname, age) VALUES (0, 'first0', 'last0', 30) IF NOT EXISTS
Unavailable: Error from server: code=1000 [Unavailable exception] message="Cannot achieve consistency level for cl LOCAL_SERIAL. Requires 5, alive 3" info={'required_replicas': 5, 'alive_replicas': 3, 'consistency': 'LOCAL_SERIAL'}
Tests: unit(dev), dtest(consistency_test.py)
Signed-off-by: Pavel Solodovnikov <pa.solodovnikov@scylladb.com>
Message-Id: <20191216151732.64230-1-pa.solodovnikov@scylladb.com>
The actual buffer is now in a member called 'data'. Leave the old
`dummy.dummy` and `dummy` as fall-back. This seems to change every
Fedora release.
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20191218153544.511421-1-bdenes@scylladb.com>
Schema is node-global, update_schema_version_and_announce() updates
all shards. We don't need to recalculate it from every shard, so
install the listeners only on shard 0. Reduces noise in the logs.
Message-Id: <1574872860-27899-1-git-send-email-tgrabiec@scylladb.com>
The option in question apparently does not work, several sharded objects
are start()-ed (and thus instanciated) in join_roken_ring, while instances
themselves of these objects are used during init of other stuff.
This leads to broken seastar local_is_initialized assertion on sys_dist_ks,
but reading the code shows more examples, e.g. the auth_service is started
on join, but is used for thrift and cql servers initialization.
The suggestion is to remove the option instead of fixing. The is_joined
logic is kept since on-start joining still can take some time and it's safer
to report real status from the API.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20191203140717.14521-1-xemul@scylladb.com>
Merged pull request https://github.com/scylladb/scylla/pull/5366 from Calle Wilund:
Moves schema creation/alter/drop awareness to use new "before" callbacks from
migration manager, and adds/modifies log and streams table as part of the base
table modification.
Makes schema changes semi-atomic per node. While this does not deal with updates
coming in before a schema change has propagated cluster, it now falls into the
same pit as when this happens without CDC.
Added side effect is also that now schemas are transparent across all subsystems,
not just cql.
Patches:
cdc_test: Add small test for altering base schema (add column)
cdc: Handle schema changes via migration manager callbacks
migration_manager: Invoke "before" callbacks for table operations
migration_listener: Add empty base class and "before" callbacks for tables
cql_test_env: Include cdc service in cql tests
cdc: Add sharded service that does nothing.
cdc: Move "options" to separate header to avoid to much header inclusion
cdc: Remove some code from header
* seastar 00da4c8760...0525bbb08f (7):
> future: Simplify future_state_base::any move constructor
> future: don't create temporary tuple on future::get().
> future: don't instantiate new future on future::then_wrapped().
> future: clean-up the Result handling in then_wrapped().
> Merge "Fix core dumps when asan is enabled" from Rafael
> future: Move ignore to the base class
> future: Don't delete in ignore
Currently `SCYLLA-VERSION-GEN` is not a dependency of any target and
hence changes done to it will not be picked up by ninja. To trigger a
rebuild and hence version changes to appear in the `scylla` target
binary, one has to do `touch configure.py`. This is counter intuitive
and frustrating to people who don't know about it and wonder why their
changed version is not appearing as the output of `scylla --version`.
This patch makes `SCYLLA-VERSION-GEN` a dependency of `build.ninja,
making the `build.ninja` target out-of-date whenever
`SCYLLA-VERSION-GEN` is changed and hence will trigger a rerun of
`configure.py` when the next target is built, allowing a build of e.g.
`scylla` to pick up any changes done to the version automatically.
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20191217123955.404172-1-bdenes@scylladb.com>
"
This patch set rearranges the test files so that
it is now possible to search for tests automatically,
and adds this functionality to test.py
"
* 'test.py.requeue' of ssh://github.com/scylladb/scylla-dev:
cmake: update CMakeLists.txt to scan test/ rather than tests/
test.py: automatically lookup all unit and boost tests
tests: move all test source files to their new locations
tests: move a few remaining headers
tests: move another set of headers to the new test layout
tests: move .hh files and resources to new locations
tests: remove executable property from data_listeners_test.cc
When scylla is installed without a network connectivity, the test if a
newer version is available can cause scylla_setup to wait forever.
This patch adds a limit to the time scylla_setup will wait for a reply.
When there is no reply, the relevent error will be shown that it was
unable to check for newer version, but this will not block the setup
script.
Fixes#5302
Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Merged pull request https://github.com/scylladb/scylla/pull/5343 from
Benny Halevy.
Fixes#5340
Hold the sstable_deletion_sem table::move_sstables_from_subdirs to
serialize access to the staging directory. It now synchronizes snapshot,
compaction deletion of sstables, and view_update_generator moving of
sstables from staging.
Tests:
unit (dev) [expect test_user_function_timestamp_return that fails for me locally, but also on master]
snapshot_test.py (dev)
I used the following as a reference:
https://github.com/apache/cassandra/blob/trunk/src/java/org/apache/cassandra/db/virtual/ClientsTable.java
At this moment there is only info about IP, clients outgoing port,
client 'type' (i.e. CQL/thrift/alternator), shard ID and username.
Column `request_count' is NOT present and CK consists of
(`port', `client_type'), contrary to what C*'s has: (`port').
Code that notifies `system.clients` about new connections goes
to top-level files `connection_notifier.*`. Currently only CQL
clients are observed, but enum `client_type` can be used in future
to notify about connections with other protocols.
Hold the _sstable_deletion_sem while moving sstables from the staging directory
so not to move them under the feet of table::snapshot.
Fixes#5340
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Consumer may throw, in this case, break from the loop and retry.
move_sstable_from_staging_in_thread may theoretically throw too,
ignore the error in this case since the sstable was already processed,
individual move failures are already ignored and moving from staging
will be retried upon restart.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
To be used for "batch" move of several sstables from staging
to the base directory, allowing the caller to sync the directories
once when all are moved rather than for each one of them.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
distributed_loader::probe_file needlessly creates a seastar
thread for it and the next patch will use it as part of
a parallel_for_each loop to move a list of sstables
(and sync the directories once at the end).
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
We do not yet support the parallel Scan options (TotalSegments, Segment),
as reported in issue #5059. But even before implementing this feature, it
is important that we produce an error if a user attempts to use it - instead
of outright ignoring this parameter. This is what this patch does.
The patch also adds a full test, test_scan.py::test_scan_parallel, for the
parallel scan feature. The test passes on DynamoDB, and still xfails
on Alternator after this patch - but now the Scan request fails immediately
reporting the unsupported option - instead of what the pre-patch code did:
returning the wrong results and the test failing just when the results
do not match the expectations.
Refs #5059.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20191217084917.26191-1-nyh@scylladb.com>
"
Only the first patch is needed to fix the undefined behavior, but the
followup ones simplify the memory management around user types.
"
* 'espindola/fix-5193-v2' of ssh://github.com/espindola/scylla:
db: Don't use lw_shared_ptr for user_types_metadata
user_types_metadata: don't implement enable_lw_shared_from_this
cql3: pass a const user_types_metadata& to prepare_internal
db: drop special case for top level UDTs
db: simplify db::cql_type_parser::parse
db: Don't create a reference to nullptr
Add test for loading a schema with a non native type
1. Move tests to test (using singular seems to be a convention
in the rest of the code base)
2. Move boost tests to test/boost, other
(non-boost) unit tests to test/unit, tests which are
expected to be run manually to test/manual.
Update configure.py and test.py with new paths to tests.
Move sstable_test.hh, test_table.hh and cql_assertions.hh from tests/ to
test/lib or test/boost and update dependent .cc files.
Move tests/perf_sstable.hh to test/perf/perf_sstable.hh
Move another small subset of headers to test/
with the same goals:
- preserve bisectability
- make the revision history traceable after a move
Update dependent files.
The plan is to move the unstructured content of tests/ directory
into the following directories of test/:
test/lib - shared header and source files for unit tests
test/boost - boost unit tests
test/unit - non-boost unit tests
test/manual - tests intended to be run manually
test/resource - binary test resources and configuration files
In order to not break git bisect and preserve the file history,
first move most of the header files and resources.
Update paths to these files in .cc files, which are not moved.
We're seeing the following error from test from time to time:
fatal error: in "test_allocation_failure": std::runtime_error: Did not get expected exception from writing too large record
This is not reproducible and the error string does not contain
enough information to figure out what happened exactly, therefore
this patch adds an exception if the call succeeded unexpectedly
and also prints the unexpected exception if one was caught.
Refs #4714
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20191215052434.129641-1-bhalevy@scylladb.com>
The is_podman check was depending on `docker -v` printing "podman" in
the output, but that doesn't actually work, since podman prints $0.
Use `docker --help` instead, which will output "podman".
Also return podman's return status, which was previously being
dropped.
Signed-off-by: Dejan Mircevski <dejan@scylladb.com>
"
On start there are two things that scylla does on data/commitlog/etc.
dirs: locks and verifies permissions. Right now these two actions are
managed by different approaches, it's convenient to merge them.
Also the introduced in this set directories class makes a ground for
better --workdir option handling. In particular, right now the db::config
entries are modified after options parse to update directories with
the workdir prefix. With the directories class at hands will be able
to stop doing this.
"
* 'br-directories-cleanup' of https://github.com/xemul/scylla:
directories: Make internals work on fs::path
directories: Cleanup adding dirs to the vector to work on
directories: Drop seastar::async usage
directories: Do touch_and_lock and verify sequentially
directories: Do touch_and_lock in parallel
directories: Move the whole stuff into own .cc file
directories: Move all the dirs code into .init method
file_lock: Work with fs::path, not sstring
This reverts commit 4333b37f9e. It breaks upgrades,
and the user question is not informative enough for the user to make a correct
decision.
Fixes#5478.
Fixes#5480.
The unordered_set is turned into vector since for fs::path
there's no hash() method that's needed for set.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Now the only future-able operation remained is the call to
parallel_for_each(), all the rest is non-blocking preparation,
so we can drop the seastar::async and just return the future
from parallel_for_each.
The indendation is now good, as in previous patch is was prepared
just for that.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The goal is to drop the seastar::async() usage.
Currently we have two places that return futures -- calls to
parallel_for_each-s. We can either chain them together or,
since both are working on the same set of directories, chain
actions inside them.
For code simplicity I propose to chain actions.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The list of paths that should be touch-and-locked is already
at hands, this shortens the code and makes it slightly faster
(in theory).
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
In order not to pollute the root dir place the code in
utils/ directory, "utils" namespace.
While doing this -- move the touch_and_lock from the
class declaration.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The seastar::async usage is tempoarary, added for bisect-safety,
soon it will go away. For this reason the indentation in the
.init method is not "canonical", but is prepared for one-patch
drop of the seastar::async.
The hinted_handoff_enabled arg is there, as it's not just a
parameter on config, it had been parsed in main.cc.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
The main.cc code that converts sstring to fs::path
will be patched soon, the file_desc::open belongs
to seastar and works on sstrings.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
"
LWT is much more efficient if a request is processed on a shard that owns
a token for the request. This is because otherwise the processing will
bounce to an owning shard multiple times. The patch proposes a way to
move request to correct shard before running lwt. It works by returning
an error from lwt code if a shard is incorrect one specifying the shard
the request should be moved to. The error is processed by the transport
code that jumps to a correct shard and re-process incoming message there.
"
* 'gleb/bounce_lwt_request' of github.com:scylladb/seastar-dev:
lwt: take raw lock for entire cas duration
lwt: drop invoke_on in paxos_state prepare and accept
lwt: Process lwt request on a owning shard
storage_service: move start_native_transport into a thread
transport: change make_result to takes a reference to cql result instead of shared_ptr
The implementation of Expected's BEGINS_WITH operator on blobs was
incorrect, naively comparing the base64-encoded strings, which doesn't
work. This patches fixes the code to compare the decoded strings.
The reason why the BEGINS_WITH test missed this bug was that we forgot
to check the blob case and only tested the string case; So this patch
also adds the missing test - which reproduces this bug, and verifies
its fix.
Fixes#5457
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20191211115526.29862-1-nyh@scylladb.com>
The LIKE operator requires filtering, so needs_filtering() must check
is_LIKE(). This already happens for partition columns, but it was
overlooked for clustering columns in the initial implementation of
LIKE.
Fixes#5400.
Tests: unit(dev)
Signed-off-by: Dejan Mircevski <dejan@scylladb.com>
To be used by dtest as an indicator that endpoint's hints
were drained and hints directory is removed.
Refs #5354
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
The user_types_metadata can simply be owned by the keyspace. This
simplifies the code since we never have to worry about nulls and the
ownership is now explicit.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
It looks like this was done just to avoid including
user_types_metadata.hh, which seems a bit much considering that it
requires adding specialization to the seastar namespace.
A followup patch will also stop using lw_shared_ptr for
user_types_metadata.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
We never modify the user_types_metadata via prepare_internal, so we
can pass it a const reference.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
This was originally done in 7f64a6ec4b,
but that commit was reverted in reverted in
8517eecc28.
The revert was done because the original change would call parse_raw
for non UDT types. Unlike the old patch, this one doesn't change the
behavior of non UDT types.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
The variant of db::cql_type_parser::parse that has a
user_types_metadata argument was only used from the variant that
didn't. This inlines one in the other.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
The user_types variable can be null during db startup since we have to
create types before reading the system table defining user types.
This avoids undefined behavior, but is unlikely that it was causing
more serious problems since the variable is only used when creating
user types and we don't create any until after all system tables are
read, in which case the user_types variable is not null.
Fixes#5193
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
LWT is much more efficient if a request is processed on a shard that owns
a token for the request. This is because otherwise the processing will
bounce to an owning shard multiple times. The patch proposes a way to
move request to correct shard before running lwt. It works by returning
an error from lwt code if a shard is incorrect one specifying the shard
the request should be moved to. The error is processed by transport code
that jumps to a correct shard and re-process incoming message there.
This patch adds comprehensive tests for the ReturnValue parameter of
the write operations (PutItem, UpdateItem, DeleteItem), which can return
pre-write or post-write values of the modified item. The tests are in
a new test file, alternator-test/test_returnvalues.py.
This feature is not yet implemented in Alternator, so all the new
tests xfail on Alternator (and all pass on AWS).
Refs #5053
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20191127163735.19499-1-nyh@scylladb.com>
This patch adds tests for Query's "ScanIndexForward" parameter, which
can be used to return items in reversed sort order.
We test that a Limit works and returns the given number of *last* items
in the sort order, and also that such reverse queries can be resumed,
i.e., paging works in the reverse order.
These tests pass against AWS DynamoDB, but fail against Alternator (which
doesn't support ScanIndexForward yet), so it is marked xfail.
Refs #5153.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20191127114657.14953-1-nyh@scylladb.com>
The JMX interface is implemented by the scylla-jmx project, not scylla.
Therefore, let's remove this historical reference to MBeans from
storage_proxy.
Message-Id: <20191211121652.22461-1-penberg@scylladb.com>
"
Add --experimental-features -- a vector of features to unlock. Make corresponding changes in the YAML parser.
Fixes#5338
"
* 'vecexper' of https://github.com/dekimir/scylla:
config: Add `experimental_features` option
utils: Add enum_option
* seastar e440e831c8...00da4c8760 (7):
> Merge "reactor: fix iocb pool underflow due to unaccounted aio fsync" from Avi
Fixes#5443.
> install-dependencies.sh: fix arch dependencies
> Merge " rpc: fix use-after-free during rpc teardown vs. rpc server message handling" from Benny
> Merge "testing: improve the observability of abandoned failed futures" from Botond
> rework the fair_queue tester
> directory_test: Update to use run instead of run_deprecated
> log: support fmt 6.0 branch with chrono.h for log
In the calculate_delay() code for view-backlog flow control, we calculate
a delay and cap it at a "budget" - the remaining timeout. This timeout is
measured in milliseconds, but the capping calculation converted it into
microseconds, which overflowed if the timeout is very large. This causes
some tests which enable the UB sanitizer to fail.
We fix this problem by comparing the delay to the budget in millisecond
resolution, not in microsecond resolution. Then, if the calculated delay
is short enough, we return it using its full microsecond resolution.
Fixes#5412
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20191205131130.16793-1-nyh@scylladb.com>
Merged pull request https://github.com/scylladb/scylla/pull/5453
from Piotr Sarna:
Checking the EQ relation for alternator attributes is usually performed
simply by comparing underlying JSON objects, but sets (SS, BS, NS types)
need a special routine, as we need to make sure that sets stored in
a different order underneath are still equal, e.g:
[1, 3, 2] == [1, 2, 3]
Fixes#5021
Checking the EQ relation for alternator attributes is usually performed
simply by comparing underlying JSON objects, but sets (SS, BS, NS types)
need a special routine, as we need to make sure that sets stored in
a different order underneath are still equal, e.g:
[1, 3, 2] == [1, 2, 3]
Fixes#5021
If a set of mutations contains both an entry that deletes a table
and an entry that adds a table with the same name, it's expected
to be a replacement operation (delete old + create new),
rather than a useless "try to create a table even though it exists
already and then immediately delete the original one" operation.
As such, notifications about the deletions should be performed
before notifications about the creations. The place that originally
suffered from this wrong order is view building - which in this case
created an incorrect duplicated entry in the view building bookkeeping,
and then immediately deleted it, resulting in having old, deprecated
entries with stale UUIDS lying in the build queue and never proceeding,
because the underlying table is long gone.
The issue is fixed by ensuring the order of notifications:
- drops are announced first, view drops are announced before table drops;
- creations follow, table creations are announced before views;
- finally, changes to tables and views are announced;
Fixes#4382
Tests: unit(dev), mv_populating_from_existing_data_during_node_stop_test
Iterate over an array holding all rpm names to see if any
of them is missing from `dist/ami/files`. If they are missing,
look them up in build/redhat/RPMS/x86_64 so that if reloc/build_rpm.sh
was run manually before dist/ami/build_ami.sh we can just collect
the built rpms from its output dir.
If we're still missing any rpms, then run reloc/build_rpm.sh
and copy the required rpms from build/redhat/RPMS/x86_64.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Reviewed-by: Glauber Costa <glauber@scylladb.com>
In swagger 1.2 int is defined as int32.
We originally used int following the jmx definition, in practice
internally we use uint and int64 in many places.
While the API format the type correctly, an external system that uses
swagger-based code generator can face a type issue problem.
This patch replace all use of int in a return type with long that is defined as int64.
Changing the return type, have no impact on the system, but it does help
external systems that use code generator from swagger.
Fixes#5347
Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Provide some explanation on prio strings + direction to gnutls manual.
Document client auth option.
Remove confusing/misleading statement on "custom options"
Message-Id: <20191210123714.12278-1-calle@scylladb.com>
Enable existing NOT_CONTAINS test, add NOT_CONTAINS to the list of
recognized operators, implement check_NOT_CONTAINS, and hook it up to
verify_expected_one().
Signed-off-by: Dejan Mircevski <dejan@scylladb.com>
When the user wants to turn on only some experimental features, they
can use this new option. The existing `experimental` option is
preserved for backwards compatibility.
Signed-off-by: Dejan Mircevski <dejan@scylladb.com>
Almost all commands provided by `scylla-gdb.py` are safe to use. The
worst that could happen if they fail is that you won't get the desired
information. There is one notable exception: `scylla thread`. If
anything goes wrong while this command is executed - gdb crashes, a bug
in the command, etc. - there is a good change the process under
examination will crash. Sometimes this is fine, but other times e.g.
when live debugging a production node, this is unacceptable.
To avoid any accidents add documentation to all commands working with
`seastar::thread`. And since most people don't read documentation,
especially when debugging under pressure, add a safety net to the
`scylla thread` command. When run, this command will now warn of the
dangers and will ask for explicit acknowledgment of the risk of crash,
by means of passing an `--iamsure` flag. When this flag is missing, it
will refuse to run. I am sure this will be very annoying but I am also
sure that the avoided crashes are worth it.
As part of making `scylla thread` safe, its argument parsing code is
migrated to `argparse`. This changes the usage but this should be fine
because it is well documented.
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20191129092838.390878-1-bdenes@scylladb.com>
The ami description attribute is only allowed to be 255
characters long. When build_ami.sh generates an ami, it
generates an ami description which is a concatenation
of all of the componnents version strings. It can
happen that the description string is too long which
eventually causes the ami build to fail. This patch
trims the description string to 255 characters.
It is ok since the individual versions of the components
are also saved in tags attached to the image.
Tests:
1. Reproduced with a long description and
validated that it doesn't fail after the fix.
Fixes#5435
Signed-off-by: Eliran Sinvani <eliransin@scylladb.com>
Message-Id: <20191209141143.28893-1-eliransin@scylladb.com>
A Linux machine typically has multiple clocksources with distinct
performances. Setting a high-performant clocksource might result in
better performance for ScyllaDB, so this should be considered whenever
starting it up.
This patch introduces the possibility of enforcing optimized Linux
clocksource to Scylla's setup/start-up processes. It does so by adding
an interactive question about enforcing clocksource setting to scylla_setup,
which modifies the parameter "CLOCKSOURCE" in scylla_server configuration
file. This parameter is read by perftune.py which, if set to "yes", proceeds
to (non persistently) setting the clocksource. On x86, TSC clocksource is
used.
Fixes#4474
This allows us to create/alter/drop log and desc tables "atomically"
with the base, by including these mutations in the original mutation
set, i.e. batch create/alter tables.
Note that population does not happen until types are actually
already put into database (duh), thus there _is_ still a gap
between creating cdc and it being truly usable. This may or may
not need handling later.
A general build system knows about 3 machines:
* build: where the building is running
* host: where the built software will run
* target: the machine the software will produce code for
The target machine is only relevant for compilers, so we can ignore
it.
Until now we could ignore the build and host distinction too. This
patch adds the first difference: don't use host ld_flags when linking
build tools (gen_crc_combine_table).
The reason for this change is to make it possible to build with
-Wl,--dynamic-linker pointing to a path that will exist on the host
machine, but may not exist on the build machine.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20191207030408.987508-1-espindola@scylladb.com>
Empty base type makes for less boiler plate in implementations.
The "before" callbacks are for listeners who need to potentially
react/augment type creation/alteration _before_ actually
committing type to schema tables (and holding the semaphore for this).
I.e. it is for cdc to add/modify log/desc tables "atomically" with base.
The vm.swappiness sysctl controls the kernel's prefernce for swapping
anonymous memory vs page cache. Since Scylla uses very large amounts
of anonymous memory, and tiny amounts of page cache, the correct setting
is to prefer swapping page cache. If the kernel swaps anonymous memory
the reactor will stall until the page fault is satisfied. On the other
hand, page cache pages usually belong to other applications, usually
backup processes that read Scylla files.
This setting has been used in production in Scylla Cloud for a while
with good results.
Users can opt out by not installing the scylla-kernel-conf package
(same as with the other kernel tunables).
* seastar 166061da3...e440e831c (8):
> Fail tests on ubsan errors
> future: make a couple of asserts more strict
> future: Move make_ready out of line
> config: Do not allow zero rates
Fixes#5360
> future: add new state to avoid temporaries in get_available_state().
> future: avoid temporary future_state on get_available_state().
> future: inline future::abandoned
> noncopyable_function: Avoid uninitialized warning on empty types
Previously, scylla used min/max(blob)->blob overload for collections,
tuples and UDTs; effectively making the results being printed as blobs.
This PR adds "dynamically"-typed min()/max() functions for compound types.
These types can be complicated, like map<int,set<tuple<..., and created
in runtime, so functions for them are created on-demand,
similarly to tojson(). The comparison remains unchanged - underneath
this is still byte-by-byte weak lex ordering.
Fixes#5139
* jul-stas/5139-minmax-bad-printing-collections:
cql_query_tests: Added tests for min/max/count on collections
cql3: min()/max() for collections/tuples/UDTs do not cast to blobs
This tests new min/max function for collections and tuples. CFs
in test suite were named according to types being tested, e.g.
`cf_map<int,text>' what is not a valid CF name. Therefore, these
names required "escaping" of invalid characters, here: simply
replacing with '_'.
Before:
cqlsh> insert into ks.list_types (id, val) values (1, [3,4,5]);
cqlsh> select max(val) from ks.list_types;
system.max(val)
------------------------------------------------------------
0x00000003000000040000000300000004000000040000000400000005
After:
cqlsh> select max(val) from ks.list_types;
system.max(val)
--------------------
[3, 4, 5]
This is accomplished similarly to `tojson()`/`fromjson()`: functions
are generated on demand from within `cql3::functions::get()`.
Because collections can have a variety of types, including UDTs
and tuples, it would be impossible to statically define max(T t)->T
for every T. Until now, max(blob)->blob overload was used.
Because `impl_max/min_function_for` is templated with the
input/output type, which can be defined in runtime, we need type-erased
("dynamic") versions of these functors. They work identically, i.e.
they compare byte representations of lhs and rhs with
`bytes::operator<`.
Resolves#5139
If you merge a pull request that contains multiple patches via
the github interface, it will document itself as the committer.
Work around this brain damage by using the command line.
Introduce a new verb dedicated for receiving and sending hints: HINT_MUTATION. It is handled on the streaming connection, which is separate from the one used for handling mutations sent by coordinator during a write.
The intent of using a separate connection is to increase fairness while handling hints and user requests - this way, a situation can be avoided in which one type of requests saturate the connection, negatively impacting the other one.
Information about new RPC support is propagated through new gossip feature HINTED_HANDOFF_SEPARATE_CONNECTION.
Fixes#4974.
Tests: unit(release)
To make gms::inet_address::to_string() similar in output to origin.
The sole purpose being quick and easy fix of API/JMX ipv6
formatting of endpoints etc, where strings are used as lexical
comparisons instead of textual representation.
A better, but more work, solution is to fix the scylla-jmx
bridge to do explicit parse + re-format of addresses, but there
are many such callpoints.
An even better solution would be to fix nodetool to not make this
mistake of doing lexical comparisons, but then we risk breaking
merge compatibility. But could be an option for a separate
nodeprobe impl.
Message-Id: <20191204135319.1142-1-calle@scylladb.com>
Currently query_options objects is passed to a trace stopping function
which makes it mandatory to make them alive until the end of the
query. The reason for that is to add prepared statement parameters to
the trace. All other query options that we want to put in the trace are
copied into trace_state::params_values, so lets copy prepared statement
parameters there too. Trace enabled case will become a little bit more
expensive but on the other hand we can drop a continuation that holds
query_options object alive from a fast path. It is safe to drop the call
to stop_foreground_prepared() here since The tracing will be stopped
in process_request_one().
Message-Id: <20191205102026.GJ9084@scylladb.com>
Merged pull request https://github.com/scylladb/scylla/pull/5381 by
Peng Jian, fixing multiple small issues with Redis:
* Rename the options related to Redis API, and describe them clearly.
* Rename redis_transport_port to redis_port
* Rename redis_transport_port_ssl to redis_ssl_port
* Rename redis_default_database_count to redis_database_count
* Remove unnecessary option enable_redis_protocol
* Modify the default value of opition redis_read_consistency_level and redis_write_consistency_level to LOCAL_QUORUM
* Fix the DEL command: support to delete mutilple keys in one command.
* Fix the GET command: return the empty string when the required key is not exists.
* Fix the redis-test/test_del_non_existent_key: mark xfail.
On aarch64, asan detected a use-after-move. It doesn't happen on x86_64,
likely due to different argument evaluation order.
Fix by evaluating full_slice before moving the schema.
Note: I used "auto&&" and "std::move()" even though full_slice()
returns a reference. I think this is safer in case full_slice()
changes, and works just as well with a reference.
Fixes#5419.
This commit makes sure that single-partition readers for
read-before-write do not have fast-forwarding enabled,
as it may lead to huge read amplification. The observed case was:
1. Creating an index.
CREATE INDEX index1 ON myks2.standard1 ("C1");
2. Running cassandra-stress in order to generate view updates.
cassandra-stress write no-warmup n=1000000 cl=ONE -schema \
'replication(factor=2) compaction(strategy=LeveledCompactionStrategy)' \
keyspace=myks2 -pop seq=4000000..8000000 -rate threads=100 -errors
skip-read-validation -node 127.0.0.1;
Without disabling fast-forwarding, single-partition readers
were turned into scanning readers in cache, which resulted
in reading 36GB (sic!) on a workload which generates less
than 1GB of view updates. After applying the fix, the number
dropped down to less than 1GB, as expected.
Refs #5409Fixes#4615Fixes#5418
This test execution time dominates by a serious margin
test execution time in dev/release mode: reducing its
execution time improves the test.py turnaround by over 70%.
Message-Id: <20191204135315.86374-2-kostja@scylladb.com>
Currently, 'scylla thread' uses arch_prctl() to extract the value of
fsbase, used to reference thread local variables. gdb 8 added support
for directly accessing the value as $fs_base, so use that instead. This
works from core dumps as well as live processes, as you don't need to
execute inferior functions.
The patch is required for debugging threads in core dumps, but not
sufficient, as we still need to set $rip and $rsp, and gdb still[1]
doesn't allow this.
[1] https://sourceware.org/bugzilla/show_bug.cgi?id=9370
The feature introduced by this commit declares that hints can be sent
using the new dedicated RPC verb. Before using the new verb, nodes need
to know if other nodes in the cluster will be able to handle the new
RPC verb.
Introduce a new verb dedicated for receiving and sending hints:
HINT_MUTATION. It is handled on the streaming connection, which is
separate from the one used for handling mutations sent by coordinator
during a write.
The intent of using a separate connection is to increase fariness while
handling hints and user requests - this way, a situation can be avoided
in which one type of requests saturate the connection, negatively
impacting the other one.
"
In several cases in distributed testing (dtest) we trigger compaction using nodetool compact assuming that when it is done, it is indeed really done.
However, the way compaction is currently implemented in scylla, it may leave behind some background tasks to delete the old sstables that were compacted.
This commit changes major compaction (triggered via the ss::force_keyspace_compaction api) so it would wait on the background deletes and will return only when they finish.
Fixes#4909
Tests: unit(dev), nodetool_refresh_with_data_perms_test, test_nodetool_snapshot_during_major_compaction
"
We may able to use chrony setup script on future version of RHEL/CentOS,
it better to run chrony setup when RHEL version >= 8, not only 8.
Note that on Fedora it still provides ntp/ntpdate package, so we run
ntp setup on it for now. (same on debian variants)
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20191203192812.5861-1-syuu@scylladb.com>
`segment_manager' now uses a decorated version of `timed_out_error'
with hardcoded name. On the other hand `region_group' uses named
`on_request_expiry' within its `expiring_fifo'.
Fixes#5211
In 79935df959 replay apply-call was
changed from one with no continuation to one with. But the frozen
mutation arg was still just lambda local.
Change to use do_with for this case as well.
Message-Id: <20191203162606.1664-1-calle@scylladb.com>
Exception messages contain semaphore's name (provided in ctor).
This affects the queue overflow exception as well as timeout
exception. Also, custom throwing function in ctor was changed
to `prethrow_action', i.e. metrics can still be updated there but
now callers have no control over the type of the exception being
thrown. This affected `restricted_reader_max_queue_length' test.
`reader_concurrency_semaphore'-s docs are updated accordingly.
In a build configured with --debuginfo 0 the scylla binary still ends
up with some debug info from the libraries that are statically linked
in.
We should avoid compiling subprojects (including seastar) with debug
info when none is needed, but this at least avoids it showing up in
the binary.
The main motivation for this is that it is confusing to get a binary
with *some* debug info in it.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20191127215843.44992-1-espindola@scylladb.com>
This series refactors the collection de/serialization code to use
fragmented buffers, avoiding the large allocations and the associated
pains when working with large collections. Currently all operations that
involve collections require deserializing them, executing the operation,
then serializing them again to their internal storage format. The
de/serialization operations happen in linearized buffers, which means
that we have to allocate a buffer large enough to hold the *entire*
collection. This can cause immense pressure on the memory allocator,
which, in the face of memory fragmentation, might be unable to serve the
allocation at all. We've seen this causing all sorts of nasty problems,
including but not limited to: failing compactions, failing memtable
flush, OOM crash and etc.
Users are strongly discouraged from using large collections, yet they
are still a fact of life and have been haunting us since forever.
The proper solution for these problems would be to come up with an
in-memory format for collections, however that is a major effort, with a
lot of unknowns. This is something we plan on doing at some point but
until it happens we should make life less painful for those with large
collections.
The goal of this series is to avoid the need of allocating these large
buffers. Serialization now happens into a `bytes_ostream` which
automatically fragments the values internally. Deserialization happens
with `utils::linearizing_input_stream` (introduced by this series), which
linearizes only the individual collection cells, but not the entire
collection.
An important goal of this series was to introduce the least amount of
risk, and hence the least amount of code. This series does not try to
make a revolution and completely revamp and optimize the
de/serialization codepaths. These codepaths have their days numbered so
investing a lot of effort into them is in vain. We can apply incremental
optimizations where we deem it necessary.
Fixes: #5341
Support to delete multiple keys in one DEL command.
The feature of returning number of the really deleted keys is still not supported.
Return empty string to client for GET command when the required key is not exists.
Fixes: #5334
Signed-off-by: Peng Jian <pengjian.uestc@gmail.com>
Rename option redis_transport_port to redis_port, which the redis transport listens on for clients.
Rename option redis_transport_port_ssl to redis_ssl_port, which the redis TLS transport listens on for clients.
Rename option redis_database_count. Set the redis dabase count.
Rename option redis_keyspace_opitons to redis_keyspace_replication_strategy_options. Set the replication strategy for redis keyspace.
Remove option enable_redis_protocol, which is unnecessary.
Fixes: #5335
Signed-off-by: Peng Jian <pengjian.uestc@gmail.com>
Commit 96009881d8 added diffutils to the dependencies via
Seastar's install-dependencies.sh, after it was inadvertantly
dropped in 1164ff5329 (update to Fedora 31; diffutils is no
longer brought in as a side effect of something else).
Regenerate the image to include diffutils.
Ref #5401.
This patch added subtests for EOF process, it reads and writes the socket
directly by using protocol cmds.
We can add more tests in future, tests with Redis module will hide some
protocol error.
Signed-off-by: Amos Kong <amos@scylladb.com>
podman needs to relabel directories in exactly the same cases docker
does. The difference is that podman cannot relabel /tmp.
The reason it was working before is that in practice anyone using
dbuild has already relabeled any directories that need relabeling,
with the exception of /tmp, since it is recreated on every boot.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20191201235614.10511-2-espindola@scylladb.com>
Use `utils::linearizing_input_stream` for the deserizalization of the
collection. Allows for avoiding the linearization of the entire cell
value, instead only linearizing individual values as they are
deserialized from the buffer.
`linearizing_input_stream` allows transparently reading linearized
values from a fragmented buffer. This is done by linearizing on-the-fly
only those read values that happen to be split across multiple
fragments. This reduces the size of the largest allocation from the size
of the entire buffer (when the entire buffer is linearized) to the size
of the largest read value. This is a huge gain when the buffer contains
loads of small objects, and modest gains when the buffer contains few
large objects. But the even in the worst case the size of the largest
allocation will be less or equal compared to the case where the entire
buffer is linearized.
This stream is planned to be used as glue code between the fragmented
cell value and the collection deserialization code which expects to be
reading linearized values.
Currently the loop which writes the data from the fragmented origin to
the destination, moves to the next chunk eagerly after writing the value
of the current chunk, if the current chunk is exhausted.
This presents a problem when we are writing the last piece of data from
the last chunk, as the chunk will be exhausted and we eagerly attempt to
move to the next chunk, which doesn't exist and dereferencing it will
fail. The solution is to not be eager about moving to the next chunk and
only attempt it if we actually have more data to write and hence expect
more chunks.
The presence of `const_iterator` seems to be a requirement as well
although it is not part of the concept. But perhaps it is just an
assumption made by code using it.
Not just bytes::output_iterator. Allow writing into streams other than
just `bytes`. In fact we should be very careful with writing into
`bytes` as they require potentially large contiguous allocations.
The `write()` method is now templatized also on the type of its first
argument, which now accepts any CharOutputIterator. Due to our poor
usage of namespace this now collides with `write` defined inside
`db/commitlog/commitlog.cc`. Luckily, the latter doesn't really have to
be templatized on the data type it reads from, and de-templatizing it
resolves the clash.
Currently interactive RAID setup prompt does not list virtio-blk devices due to
following reasons:
- We fail matching '-p' option on 'lsblk --help' output since misusage of
regex functon, list_block_devices() always skipping to use lsblk output.
- We don't check existance of /dev/vd* when we skipping to use lsblk.
- We mistakenly excluded virtio-blk devices on 'lsblk -pnr' output using '-e'
option, but we actually needed them.
To fix the problem we need to use re.search() instead of re.match() to match
'-p' option on 'lsblk --help', need to add '/dev/vd*' on block device list,
then need to stop '-e 252' option on lsblk which excludes virtio-blk.
Additionally, it better to parse 'TYPE' field of lsblk output, we should skip
'loop' devices and 'rom' devices since these are not disk devices.
Fixes#4066
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20191201160143.219456-1-syuu@scylladb.com>
Merged pull request https://github.com/scylladb/scylla/pull/5392 from
Dejan Mircevski.
Refs #5034
The patches:
alternator: Implement LE operator in Expected
alternator: Implement GE operator in Expected
alternator: Make cmp diagnostic a value, not funct
utils: Add operator<< for big_decimal
alternator: Implement BETWEEN operator in Expected
All check_compare diagnostics are static strings, so there's no need
to call functions to get them. Instead of a function, make diagnostic
a simple value.
Signed-off-by: Dejan Mircevski <dejan@scylladb.com>
"Fix two problem in scylla_io_setup:
- Problem 1: paths of default directories is invalid, introduced by
commit 5ec1915 ("scylla_io_setup: assume default directories under
/var/lib/scylla").
- Problem 2: wrong path join, introduced by commit 31ddb21
("dist/common/scripts: support nonroot mode on setup scripts").
Fix a problem in scylla_io_setup, scylla_fstrim and scylla_blocktune.py:
- Fixed default scylla directories when they aren't assigned in
scylla.yaml"
Fixes#5370
Reviewed-by: Pavel Emelyanov <xemul@scylladb.com>
* 'scylla_io_setup' of git://github.com/amoskong/scylla:
use parse_scylla_dirs_with_default to get scylla directories
scylla_io_setup: fix data_file_directories check
scylla_util: introduce helper to process the default scylla directories
scylla_util: get workdir by datadir() if it's not assigned in scylla.yaml
scylla_io_setup: fix path join of default scylla directories
Use asyncio as a more modern way to work with concurrency,
Process signals in an event loop, terminate all outstanding
tests before exiting.
Breaking change: this commit requires Python 3.7 or
newer to run this script. The patch adds a version
check and a message to enforce it.
Similar to trace_state keep shared_ptr<tracing> _local_tracing_ptr
in one_session_records when constructed so it can be used
during shutdown.
Fixes#5243
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
_user cannot outlive client_state class instance, so there is no point
in holding it in shared_ptr.
Tested: debug test.py and dtest auth_test.py
Message-Id: <20191128131217.26294-5-gleb@scylladb.com>
Only do_stop_rpc_server uses the shared_ptr to prolong server's
lifetime until stop() completes, but do_with() can be used to achieve the
same.
Message-Id: <20191128131217.26294-3-gleb@scylladb.com>
Only do_stop_native_transport() uses the shared_ptr to prolong server's
lifetime until stop() completes, but do_with() can be used to achieve the
same.
Message-Id: <20191128131217.26294-2-gleb@scylladb.com>
* seastar 6f0ef32514...5c25de907a (7):
> shared_future: Fix crash when all returned futures time out
Fixes#5322.
> future: don't create temporaries on get_value().
> reactor: lower the default stall threshold to 200ms
> reactor: Simplify network initialization
> reactor: Replace most std::function with noncopyable_function
> futures: Avoid extra moves in SEASTAR_TYPE_ERASE_MORE mode
> inet_address: Make inet_address == operator ignore scope (again)
Merged pull request https://github.com/scylladb/scylla/pull/5311 from
Juliusz Stasiewicz:
This is a partial solution to #5139 (only for two types) because of the
above and because collections are much harder to do. They are coming in
a separate PR.
References #5139. Aggregate functions, like max(), when invoked
on `inet_address' and `time_native_type' used to choose
max(blob)->blob overload, with casting of argument and result to
bytes. This is because appropriate calls to
`aggregate_fcts::make_XXX_function()' were missing. This commit
adds them. Functioning remains the same but now clients see
user-friendly representations of aggregate result, not binary.
Comparing inet addresses without inet::operator< is performed by
trick, where ADL is bypassed by wrapping the name of std::min/max
and providing an overload of wrapper on inet type.
Currently we support to assign workdir from scylla.yaml, and we use many
hardcode '/var/lib/scylla' in setup scripts.
Some setup scripts get scylla directories by parsing scylla.yaml, introduced
parse_scylla_dirs_with_default() that adds default values if scylla directories
aren't assigned in scylla.yaml
Signed-off-by: Amos Kong <amos@scylladb.com>
Currently we are checking an invalid path of some default scylla directories,
the directories don't exist, so the tune will always be skipped. It caused by
two problem.
Problem 1: paths of default directories is invalid
Introduced by commit 5ec191536e, we try to tune some scylla default directories
if they exist. But the directory paths we try are wrong.
For example:
- What we check: /var/lib/scylla/commitlog_directory
- Correct one: /var/lib/scylla/commitlog
Problem 2: wrong path join
Introduced by commit 31ddb2145a, default_path might be replaced from
'/var/lib/scylla/' to '/var/lib/scylla'.
Our code tries to check an invalid path that is wrongly join, eg:
'/var/lib/scyllacommitlog'
Signed-off-by: Amos Kong <amos@scylladb.com>
The default values of data_file_directories and commitlog_directory were
commented by commit e0f40ed16a. It causes scylla_util.py:get_scylla_dirs() to
fail in checking the values.
This patch changed get_scylla_dirs() to return default data/commitlog
directories if they aren't set.
Fixes#5358
Reviewed-by: Pavel Emelyanov <xemul@scylladb.com>
Signed-off-by: Amos Kong <amos@scylladb.com>
Add a test, test_query.py::test_query_limit, to verify that the Limit
parameter correctly limits the number of rows returned by the Query.
This was supposed to already work correctly - but we never had a test for
it. As we hoped, the test passes (on both Alternator and DynamoDB).
Another test, test_query.py::test_query_limit_paging, verifies that
paging can be done with any setting of Limit. We already had tests
for paging of the Scan operation, but not for the Query operation.
Refs #5153
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
This is a comprehensive test for the "Select" parameter of Query and Scan
operations, but only for the base-table case, not index, so another future
patch should add similar tests in test_gsi.py and test_lsi.py as well.
The main use of the Select parameter is to allow returning just the count
of items, instead of their content, but it also has other esoteric options,
all of which we test here.
The test currently succeeds on AWS DynamoDB, demonstrating that the test
is correct, but fails on Alternator because the "Select" parameter is not
yet supported. So the test is marked xfail.
Refs #5058
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Currently the command tries to read all seastar smp queues in its
initialization code in the constructor. This constructor is run each
time `scylla-gdb.py` is sourced in `gdb` which leads to slowdowns and
sometimes also annoying errors because the sourcing happens in the wrong
context and seastar symbols are not available.
Avoid this by running this initializing code lazily, on the first
invocation.
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20191127095408.112101-1-bdenes@scylladb.com>
This patchset adds missing "const" function qualifiers throughout
the Scylla code base, which would make code less error-prone.
The changeset incorporates Kostja's work regarding const qualifiers
in the cql code hierarchy along with a follow-up patch addressing the
review comment of the corresponding patch set (the patch subject is
"cql: propagate const property through prepared statement tree.").
The boost 1.67 release notes says
Changed maximum supported year from 10000 to 9999 to resolve various issues
So change the test to use a larger number so that we get an exception
with both boost 1.66 and boost 1.67.
Fixes#5344
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20191126180327.93545-1-espindola@scylladb.com>
Docker on Fedora 31 is flakey, and is not supported at all on RHEL 8.
Podman is a drop-in replacement for docker; this series adds support
for using podman in dbuild.
Apart from actually working on Fedora 31 hosts,
podman is nicer in being more secure and not requiring a daemon.
Fixes#5332
As suggested in issue #4586 here is the helper that prints
"shutting down foo" message, then shuts the foo down, then
prints the "[it] was successull" one. In between it catches
the exception (if any) and warns this in logs.
By "then" I mean literally then, not the seastar's then() :)
Fixes: #4586
By default, semaphore exceptions bring along very little context:
either that a semaphore was broken or that it timed out.
In order to make debugging easier without introducing significant
runtime costs, a notion of named semaphore is added.
A named semaphore is simply a semaphore with statically defined
name, which is present in its errors, bringing valuable context.
A semaphore defined as:
auto sem = semaphore(0);
will present the following message when it breaks:
"Semaphore broken"
However, a named semaphore:
auto named_sem = named_semaphore(0, named_semaphore_exception_factory{"io_concurrency_sem"});
will present a message with at least some debugging context:
"Semaphore broken: io_concurrency_sem"
It's not much, but it would really help in pinpointing bugs
without having to inspect core dumps.
At the same time, it does not incur any costs for normal
semaphore operations (except for its creation), but instead
only uses more CPU in case an error is actually thrown,
which is considered rare and not to be on the hot path.
Refs #4999
Tests: unit(dev), manual: hardcoding a failure in view building code
Currently scylla_io_setup will skip in scylla_setup, because we didn't support
those new instance types.
I manually executed scylla_io_setup, and the scylla-server started and worked
well.
Let's apply this patch first, then check if there is some new problem in
ami-test.
Signed-off-by: Amos Kong <amos@scylladb.com>
cql_statement is a class representing a prepared statement in Scylla.
It is used concurrently during execution, so it is important that its
change is not changed by execution.
Add const qualifier to the execution methods family, throghout the
cql hierarchy.
Mark a few places which do mutate prepared statement state during
execution as mutable. While these are not affecting production today,
as code ages, they may become a source of latent bugs and should be
moved out of the prepared state or evaluated at prepare eventually:
cf_property_defs::_compaction_strategy_class
list_permissions_statement::_resource
permission_altering_statement::_resource
property_definitions::_properties
select_statement::_opts
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
---
v2:
- Have stop easrlier so that exception in start/listen do
not prevent prometheu.stop from calling
As suggested in issue #4586 here is the helper that prints
"shutting down foo" message, then shuts the foo down, then
prints the "shutting down foo was successfull". In between
it catches the exception (if any) and warns this in logs.
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Merged pull request https://github.com/scylladb/scylla/pull/5218
from Piotr Jastrzębski:
Users should be able to decide whether they need preimage or not. There is
already an option for that but it's not respected by the implementation.
This PR adds support for this functionality.
Tests: unit(dev).
Individual patches:
cdc: Don't take storage_proxy as transformer::pre_image_select param
cdc::append_log_mutations: use do_with instead of shared_ptr
cdc::append_log_mutations: fix undefined behavior
cdc: enable preimage in test_pre_image_logging test
cdc: Return preimage only when it's requested
cdc: test both enabled and disabled preimage in test_pre_image_logging
Before stopping the db itself, stop the migration service.
It must be stopped before RPC, but RPC is not stopped yet
itself, so we should be safe here.
Here's the tail of the resulting logs:
INFO 2019-11-20 11:22:35,193 [shard 0] init - shutdown migration manager
INFO 2019-11-20 11:22:35,193 [shard 0] migration_manager - stopping migration service
INFO 2019-11-20 11:22:35,193 [shard 1] migration_manager - stopping migration service
INFO 2019-11-20 11:22:35,193 [shard 0] init - Shutdown database started
INFO 2019-11-20 11:22:35,193 [shard 0] init - Shutdown database finished
INFO 2019-11-20 11:22:35,193 [shard 0] init - stopping prometheus API server
INFO 2019-11-20 11:22:35,193 [shard 0] init - Scylla version 666.development-0.20191120.25820980f shutdown complete.
Also -- stop the mm on drain before the commitlog it stopped.
[Tomasz: mm needs the cl because pulling schema changes from other nodes
involves applying them into the database. So cl/db needs to be
stopped after mm is stopped.]
The drain logs would look like
...
INFO 2019-11-25 11:00:40,562 [shard 0] migration_manager - stopping migration service
INFO 2019-11-25 11:00:40,562 [shard 1] migration_manager - stopping migration service
INFO 2019-11-25 11:00:40,563 [shard 0] storage_service - DRAINED:
and then on stop
...
INFO 2019-11-25 11:00:46,427 [shard 0] init - shutdown migration manager
INFO 2019-11-25 11:00:46,427 [shard 0] init - Shutdown database started
INFO 2019-11-25 11:00:46,427 [shard 0] init - Shutdown database finished
INFO 2019-11-25 11:00:46,427 [shard 0] init - stopping prometheus API server
INFO 2019-11-25 11:00:46,427 [shard 0] init - Scylla version 666.development-0.20191125.3eab6cd54 shutdown complete.
Fixes#5300
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20191125080605.7661-1-xemul@scylladb.com>
In get_full_row_hashes_with_rpc_stream and
repair_get_row_diff_with_rpc_stream_process_op which were introduced in
the "Repair switch to rpc stream" series, rx_hashes_nr metrics are not
updated correctly.
In the test we have 3 nodes and run repair on node3, we makes sure the
following metrics are correct.
assertEqual(node1_metrics['scylla_repair_tx_hashes_nr'] + node2_metrics['scylla_repair_tx_hashes_nr'],
node3_metrics['scylla_repair_rx_hashes_nr'])
assertEqual(node1_metrics['scylla_repair_rx_hashes_nr'] + node2_metrics['scylla_repair_rx_hashes_nr'],
node3_metrics['scylla_repair_tx_hashes_nr'])
assertEqual(node1_metrics['scylla_repair_tx_row_nr'] + node2_metrics['scylla_repair_tx_row_nr'],
node3_metrics['scylla_repair_rx_row_nr'])
assertEqual(node1_metrics['scylla_repair_rx_row_nr'] + node2_metrics['scylla_repair_rx_row_nr'],
node3_metrics['scylla_repair_tx_row_nr'])
assertEqual(node1_metrics['scylla_repair_tx_row_bytes'] + node2_metrics['scylla_repair_tx_row_bytes'],
node3_metrics['scylla_repair_rx_row_bytes'])
assertEqual(node1_metrics['scylla_repair_rx_row_bytes'] + node2_metrics['scylla_repair_rx_row_bytes'],
node3_metrics['scylla_repair_tx_row_bytes'])
Tests: repair_additional_test.py:RepairAdditionalTest.repair_almost_synced_3nodes_test
Fixes: #5339
Backports: 3.2
The code was iterating over a collection that was modified
at the same time. Iterators were used for that and collection
modification can invalidate all iterators.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Merged pull request https://github.com/scylladb/scylla/pull/5310 from
Avi Kivity:
This is a minor update as gcc and boost versions did not change. A noteable
update is patchelf 0.10, which adds support to large binaries.
A few minor issues exposed by the update are fixed in preparatory patches.
Patches:
dist: rpm: correct systemd post-uninstall scriptlet
build: force xz compression on rpm binary payload
tools: toolchain: update to Fedora 31
Since 90d6c0b, cache will abort when trying to detach partition
entries while they're updated. This should never happen. It can happen
though, when the update fails on bad_alloc, because the cleanup guard
invalidates the cache before it releases partition snapshots (held by
"update" coroutine).
Fix by destroying the coroutine first.
Fixes#5327.
Tests:
- row_cache_test (dev)
Message-Id: <1574360259-10132-1-git-send-email-tgrabiec@scylladb.com>
By default rpm uses dwz to merge the debug info from various
binaries. Unfortunately, it looks like addr2line has not been updated
to handle this:
// This works
$ addr2line -e build/release/scylla 0x1234567
$ dwz -m build/release/common.debug build/release/scylla.debug build/release/iotune.debug
// now this fails
$ addr2line -e build/release/scylla 0x1234567
I think the issue is
https://sourceware.org/bugzilla/show_bug.cgi?id=23652Fixes#5289
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20191123015734.89331-1-espindola@scylladb.com>
By default we were compressing debug info only in release
executables. The idea, if I understand it correctly, is that those are
the ones we ship, so we want a more compact binary.
I don't think that was doing anything useful. The compression is just
gzip, so when we ship a .tar.xz, having the debug info compressed
inside the scylla binary probably reduces the overall compression a
bit.
When building a rpm the situation in amusing. As part of the rpm
build process the debug info is decompressed and extracted to an
external file.
Given that most of the link time goes to compressing debug info, it is
probably a good idea to just skip that.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20191123022825.102837-1-espindola@scylladb.com>
Structure the code to be able to introduce futures.
Apply trivial cleanups.
Switch to asyncio and use it to work with processes and
handle signals. Cleanup all processes upon signal.
This patch implements a simple optimization for LWT: it makes PAXOS
prepare phase query locally and return the current value of the modified
key so that a separate query is not necessary. For more details see
patch 6. Patch 1 fixes a bug in next. Patches 2-5 contain trivial
preparatory refactoring.
Current LWT implementation uses at least three network round trips:
- first, execute PAXOS prepare phase
- second, query the current value of the updated key
- third, propose the change to participating replicas
(there's also learn phase, but we don't wait for it to complete).
The idea behind the optimization implemented by this patch is simple:
piggyback the current value of the updated key on the prepare response
to eliminate one round trip.
To generate less network traffic, only the closest to the coordinator
replica sends data while other participating replicas send digests which
are used to check data consistency.
Note, this patch changes the API of some RPC calls used by PAXOS, but
this should be okay as long as the feature in the early development
stage and marked experimental.
To assess the impact of this optimization on LWT performance, I ran a
simple benchmark that starts a number of concurrent clients each of
which updates its own key (uncontended case) stored in a cluster of
three AWS i3.2xlarge nodes located in the same region (us-west-1) and
measures the aggregate bandwidth and latency. The test uses shard-aware
gocql driver. Here are the results:
latency 99% (ms) bandwidth (rq/s) timeouts (rq/s)
clients before after before after before after
1 2 2 626 637 0 0
5 4 3 2616 2843 0 0
10 3 3 4493 4767 0 0
50 7 7 10567 10833 0 0
100 15 15 12265 12934 0 0
200 48 30 13593 14317 0 0
400 185 60 14796 15549 0 0
600 290 94 14416 15669 0 0
800 568 118 14077 15820 2 0
1000 710 118 13088 15830 9 0
2000 1388 232 13342 15658 85 0
3000 1110 363 13282 15422 233 0
4000 1735 454 13387 15385 329 0
That is, this optimization improves max LWT bandwidth by about 15%
and allows to run 3-4x more clients while maintaining the same level
of system responsiveness.
invoke_on() guarantees that captures object won't be destroyed until the
future returned by the invoked function is resolved so there's no need
to move key, token, proposal for calling paxos_state::*_impl helpers.
The test_health_only_works_for_root_path test checks that while Alternator's
HTTP server responds to a "GET /" request with success ("health check"), it
should respond to different URLs with failures (page not found).
One of the URLs it tested was "/..", but unfortunately some versions of
Python's HTTP client canonize this request to just a "/", causing the
request to unexpectedly succeed - and the test to fail.
So this patch just drops the "/.." check. A few other nonsense URLs are
attempted by the test - e.g., "/abc".
Fixes#5321
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
One of the fields still missing in DescribeTable's response (Refs #5026)
was the table's schema - KeySchema and AttributeDefinitions.
This patch adds this missing feature, and enables the previously-xfailing
test test_describe_table_schema.
A complication of this patch is that in a table with secondary indexes,
we need to return not just the base table's schema, but also the indexes'
schema. The existing tests did not cover that feature, so we add here
two more tests in test_gsi.py for that.
One of these secondary-index schema tests, test_gsi_2_describe_table_schema,
still fails, because it outputs a range-key which Scylla added to a view
because of its own implementation needs, but wasn't in the user's
definition of the GSI. I opened a separate issue #5320 for that.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Serialize reference_wrapper<T> as T and make sure is_equivalent<> treats
reference_wrapper<T> wrapped in std::optional<> or std::variant<>, or
std::tuple<> as T.
We need it to avoid copying query::result while serializing
paxos::promise.
Currently even if `-a` or `-s 0` is provided, `scylla task_histogram`
will scan a limited amount of pages due to a bug in the scan loop's stop
condition, which will be trigger a stop once the default sample limit is
reached. Fix the loop by skipping this check when the user wants to scan
all tasks.
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20191121141706.29476-1-bdenes@scylladb.com>
At least some versions of 'podman logs --follow' hang when the
container eventually exits (also happens with docker on recent
versions). Fortunately, we don't need to use 'podman logs --follow'
and can use the more natural non-detached 'podman run', because
podman does not proxy SIGTERM and instead shuts down the container
when it receives it.
So, to work around the problem, use the same code path in interactive
and non-interactive runs, when podman is in use instead of docker.
With docker, we went to considerable lengths to ensure that
access to mounted volume was done using the calling user, including
supplementary groups. This avoids root-owned files being left around
after a build, and ensures that access to group-shared files (like
/var/cache/ccache) works as expected.
All of this is unnecessary and broken when using podman. Podman
uses a proxy to access files on behalf of the container, so naturally
all access is done using the calling user's identity. Since it remaps
user and group IDs, assigning the host uid/gid is meaningless. Using
--userns host also breaks, because sudo no longer works.
Fix this by making all the uid/gid/selinux games specific to docker and
ignore them when using podman. To preserve the functionality of tools
that depend on $HOME, set that according to the host setting.
The original fix (10f6b125c8) didn't
take into account that if there was a failed memtable flush (Refs
flush) but is not a flushable memtable because it's not the latest in
the memtable list. If that happens, it means no other memtable is
flushable as well, cause otherwise it would be picked due to
evictable_occupancy(). Therefore the right action is to not flush
anything in this case.
Suspected to be observed in #4982. I didn't manage to reproduce after
triggering a failed memtable flush.
Fixes#3717
podman refuses to start with duplicate volumes, which routinely
happen if the toplevel directory is the working directory. Detect
this and avoid the duplicate.
UnitTest class uses juggles with the name 'args' quite a bit to
construct the command line for a unit test, so let's spread
the harness command line arguments from the unit test command line
arguments a bit apart by consistently calling the harness command line
arguments 'options', and unit test command line arguments 'args'.
Rename usage() to parse_cmd_line().
Create unique UnitTest objects in find_tests() for each found match,
including repeat, to ensure each test has its own unique id.
This will also be used to store execution state in the test.
It somewhat stands in the way of using asyncio
This patch also implements a more comprehensive
fix for #5303, since we not only have --repeat, but
run some tests in different configurations, in which
case xml output is also overwritten.
When starting scylla daemon as non-root the initialization fails
because standard /var/lib/scylla is not accessible by regular users.
Making the default dir accessible for user is not very convenient
either, as it will cause conflicts if two or more instances of scylla
are in use.
This problem can be resolved by specifying --commitlog-directory,
--data-file-directories, etc on start, but it's too much typing. I
propose to revive Nadav's --home option that allows to move all the
directories under the same prefix in one go.
Unlike Nadav's approach the --workdir option doesn't do any tricky
manipulations with existing directories. Insead, as Pekka suggested,
the individual directories are placed under the workir if and only
if the respective option is NOT provided. Otherwise the directory
configuration is taken as is regardless of whether its absolute or
relative path.
The values substutution is done early on start. Avi suggested that
this is unsafe wrt HUP config re-read and proper paths must be
resolved on the fly, but this patch doesn't address that yet, here's
why.
First of all, the respective options are MustRestart now and the
substitution is done before HUP handler is installed.
Next, commitlog and data_file values are copied on start, so marking
the options as LiveUpdate won't make any effect.
Finally, the existing named_value::operator() returns a reference,
so returning a calculated (and thus temporary) value is not possible
(from my current understanding, correct me if I'm wrong). Thus if we
want the *_directory() to return calculated value all callers of them
must be patched to call something different (e.g. *_directory.get() ?)
which will lead to more confusion and errors.
Changes v3:
- the option is --workdir back again
- the existing *directory are only affected if unset
- default config doesn't have any of these set
- added the short -W alias
Changes v2:
- the option is --home now
- all other paths are changed to be relative
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20191119130059.18066-1-xemul@scylladb.com>
I found these mismatched types while converting some member functions
to standalone functions, since they have to use the public API that
has more type checks.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20191120181213.111758-4-espindola@scylladb.com>
Use pkg-config to search for Lua dependencies rather
than hard-code include and link paths.
Avoid using boost internals, not present in earlier
versions of boost.
Reviewed-by: Rafael Avila de Espindola <espindola@scylladb.com>
Message-Id: <20191120170005.49649-1-kostja@scylladb.com>
Use `-Wl,--threads` flag to enable multi-threaded linking when
using `ld.gold` linker.
Additional compilation test is required because it depends on whether
or not the `gold` linker has been compiled with `--enable-threads` option.
This patch introduces a substantial improvement to the link times of
`scylla` binary in release and debug modes (around 30 percent).
Local setup reports the following numbers with release build for
linking only build/release/scylla:
Single-threaded mode:
Elapsed (wall clock) time (h:mm:ss or m:ss): 1:09.30
Multi-threaded mode:
Elapsed (wall clock) time (h:mm:ss or m:ss): 0:51.57
Signed-off-by: Pavel Solodovnikov <pa.solodovnikov@scylladb.com>
Message-Id: <20191120163922.21462-1-pa.solodovnikov@scylladb.com>
Merged patch series from Peng Jian, adding optionally-enabled Redis API
support to Scylla. This feature is experimental, and partial - the extent
of this support is detailed in docs/redis/redis.md.
Patches:
Document: add docs/redis/redis.md
redis: Redis API in Scylla
Redis API: graft redis module to Scylla
redis-test: add test cases for Redis API
This is a minor update as gcc and boost versions do not change.
glibc-langpack-en no longer gets pulled in by default. As it is required
by some locale use somewhere, it is added to the explicit dependencies.
Fedora 31 switched the default compression to zstd, which isn't readable
by some older rpm distributions (CentOS 7 in particular). Tell it to use
the older xz compression instead, so packages produced on Fedora 31 can
be installed on older distributions.
The post-uninstall scriptlet requires a parameter, but older versions
of rpm survived without it. Fedora 31's rpm is more strict, so supply
this parameter.
In this document, the detailed design and implementation of Redis API in
Scylla is provided.
v2: build: work around ragel 7 generated code bug (suggested by Avi)
Ragel 7 incorrectly emits some unused variables that don't compile.
As a workaround, sed them away.
Signed-off-by: Peng Jian <pengjian.uestc@gmail.com>
Signed-off-by: Amos Kong <amos@scylladb.com>
Scylla has advantage and amazing features. If Redis build on the top of Scylla,
it has the above features automatically. It's achived great progress
in cluster master managment, data persistence, failover and replication.
The benefits to the users are easy to use and develop in their production
environment, and taking avantages of Scylla.
Using the Ragel to parse the Redis request, server abtains the command name
and the parameters from the request, invokes the Scylla's internal API to
read and write the data, then replies to client.
Signed-off-by: Peng Jian, <pengjian.uestc@gmail.com>
Merged patch set by Piotr Dulikowski:
This change corrects condition on which a row was considered expired by its
TTL.
The logic that decides when a row becomes expired was inconsistent with the
logic that decides if a single cell is expired. A single cell becomes expired
when expiry_timestamp <= now, while a row became expired when
expiry_timestamp < now (notice the strict inequality). For rows inserted
with TTL, this caused non-key cells to expire (change their values to null)
one second before the row disappeared. Now, row expiry logic uses non-strict
inequality.
Fixes#4263,
Fixes#5290.
Tests:
unit(dev)
python test described in issue #5290
It is useful to have an option to limit the execution time of a shell
script.
This patch adds an optional timeout parameter, if a parameter will be
provided a command will return and failure if the duration is passed.
Signed-off-by: Amnon Heiman <amnon@scylladb.com>
Merged patch series from Juliusz Stasiewicz:
Welcome to my first PR to Scylla!
The task was intended as a warm-up ("noob") exercise; its description is
here: #4182 Sorry, I also couldn't help it and did some scouting: edited
descriptions of some metrics and shortened few annoyingly long LoC.
Those are typically symptoms of use-after-free or memory corruption in
the program. It's better to catch such error sooner than later.
That situation is also dangerous since if a valid descriptor would
land under the invalid access, not the one which was intended for the
operation, then the operation may be performed on the wrong file and
result in corruption.
Message-Id: <1565206788-31254-1-git-send-email-tgrabiec@scylladb.com>
This change corrects condition on which a row was considered expired by
its TTL.
The logic that decides when a row becomes expired was inconsistent with
the logic that decides if a single cell is expired. A single cell
becomes expired when `expiry_timestamp <= now`, while a row became
expired when `expiry_timestamp < now` (notice the strict inequality).
For rows inserted with TTL, this caused non-key cells to expire (change
their values to null) one second before the row disappeared. Now, row
expiry logic uses non-strict inequality.
Fixes: #4263, #5290.
Tests:
- unit(dev)
- python test described in issue #5290
Currently, we overwrite the same XML output file for each test repeat
cycle. This can cause invalid XML to be generated if the XML contents
don't match exactly for every iteration.
Fix the problem by appending the test repeat cycle in the XML filename
as follows:
$ ./test.py --repeat 3 --name vint_serialization_test --mode dev --jenkins jenkins_test
$ ls -1 *.xml
jenkins_test.release.vint_serialization_test.0.boost.xml
jenkins_test.release.vint_serialization_test.1.boost.xml
jenkins_test.release.vint_serialization_test.2.boost.xml
Fixes#5303.
Message-Id: <20191119092048.16419-1-penberg@scylladb.com>
In a cross-dc large cluster, the receiver node of the gossip SYN message
might be slow to send the gossip ACK message. The ack messages can be
large if the payload of the application state is big, e.g.,
CACHE_HITRATES with a lot of tables. As a result, the unlimited ACK
message can consume unlimited amount of memory which causes OOM
eventually.
To fix, this patch queues the SYN message and handles it later if the
previous ACK message is still being sent. However, we only store the
latest SYN message. Since the latest SYN message from peer has the
latest information, so it is safe to drop the previous SYN message and
keep the latest one only. After this patch, there can be at most 1
pending SYN message and 1 pending ACK message per peer node.
"
This patch series adds only UDF support, UDA will be in the next patch series.
With this all CQL types are mapped to Lua. Right now we setup a new
lua state and copy the values for each argument and return. This will
be optimized once profiled.
We require --experimental to enable UDF in case there is some change
to the table format.
"
* 'espindola/udf-only-v4' of https://github.com/espindola/scylla: (65 commits)
Lua: Document the conversions between Lua and CQL
Lua: Implement decimal subtraction
Lua: Implement decimal addition
Lua: Implement support for returning decimal
Lua: Implement decimal to string conversion
Lua: Implement decimal to floating point conversion
Lua: Implement support for decimal arguments
Lua: Implement support for returning varint
Lua: Implement support for returning duration
Lua: Implement support for duration arguments
Lua: Implement support for returning inet
Lua: Implement support for inet arguments
Lua: Implement support for returning time
Lua: Implement support for time arguments
Lua: Implement support for returning timeuuid
Lua: Implement support for returning uuid
Lua: Implement support for uuid and timeuuid arguments
Lua: Implement support for returning date
Lua: Implement support for date arguments
Lua: Implement support for returning timestamp
...
Add mode_list rule to ninja build and use it by default when searching
for tests in test.py.
Now it is no longer necessary to explicitly specify the test mode when
invoking test.py.
(cherry picked from commit a211ff30c7f2de12166d8f6f10d259207b462d4b)
The goal of this patch is to fix issue #5280, a rather serious Alternator
bug, where Scylla fails to restart when an Alternator table has secondary
indexes (LSI or GSI).
Traditionally, Cassandra allows table names to contain only alphanumeric
characters and underscores. However, most of our internal implementation
doesn't actually have this restriction. So Alternator uses the characters
':' and '!' in the table names to mark global and local secondary indexes,
respectively. And this actually works. Or almost...
This patch fixes a problem of listing, during boot, the sstables stored
for tables with such non-traditional names. The sstable listing code
needlessly assumes that the *directory* name, i.e., the CF names, matches
the "\w+" regular expression. When an sstable is found in a directory not
matching such regular expression, the boot fails. But there is no real
reason to require such a strict regular expression. So this patch relaxes
this requirement, and allows Scylla to boot with Alternator's GSI and LSI
tables and their names which include the ":" and "!" characters, and in
fact any other name allowed as a directory name.
Fixes#5280.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20191114153811.17386-1-nyh@scylladb.com>
This document adds information about how fixes are tracked to be
backported into releases and what is the procedure that is followed to
backport those fixes.
Signed-off-by: Shlomi Livne <shlomi@scylladb.com>
Allow filtering the resolved addresses by a startswith string.
The common use case if for resolving vtable ptrs, when resolving
the output of `find_vptrs` that may be too long for the host
(running gdb) memory size. In this case the number of vtable
ptrs is considerably smaller than the total number of objects
returned by find_ptrs (e.g. 462 vs. 69625 in a OOM core I
examined from scylla --smp=2 --memory=1024M)
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
CQL tracing would only report file I/O involving one sstable, even if
multiple sstables were read from during the query.
Steps to reproduce:
create a table with NullCompactionStrategy
insert row, flush memtables
insert row, flush memtables
restart Scylla
tracing on
select * from table
The trace would only report DMA reads from one of the two sstables.
Kudos to @denesb for catching this.
Related issue: #4908
There are ... signs of massive start/stop code rework in the
main() function. While fixing the sub-modules interdependencies
during start/stop I've polished these signs too, so here's the
simplest ones.
Serialize provided partition_key in such a way that the serialized value
will hash to the same token as the original key. This way when system.paxos
table is updated the update is shard local.
Message-Id: <20191114135449.GU10922@scylladb.com>
"
When using INSERT JSON with frozen collection/UDT columns, if the columns were left unspecified or set to null, the statement would create an empty non-null value for these columns instead of using null values as it should have. For example:
cqlsh:b> create table t (k text primary key, l frozen<list<int>>, m frozen<map<int, int>>, s frozen<set<int>>, u frozen<ut>);
cqlsh:b> insert into t JSON '{"k": "insert_json"}';
cqlsh:b> select * from t;
k | l | m | s | u
-------------------+------+------+------+------
insert_json | [] | {} | {} |
This PR fixes this.
Resolves#5246 and closes#5270.
"
* 'frozen-json' of https://github.com/kbr-/scylla:
tests: add null/unset frozen collection/UDT INSERT JSON test
cql3: correctly handle frozen null/unset collection/UDT columns in INSERT JSON
cql3: decouple execute from term binding in user_type::setter
* seastar 75e189c6ba...6f0ef32514 (6):
> Merge "Add named semaphores" from Piotr
> parallel_for_each_state: pass rvalue reference to add_future
> future: Pass rvalue to uninitialized_wrapper::uninitialized_set.
> dependencies: Add libfmt-dev to debian
> log: Fix logger behavior when logging both to stdout and syslog.
> README.md: list Scylla among the projects using Seastar
If CONSISTENCY is set to SERIAL or LOCAL SERIAL, all write requests must
fail according to Cassandra's documentation. However, batched writes
bypass this check. Fix this.
This patch resurrects Cassandra's code validating a consistency level
for CAS requests. Basically, it makes CAS requests use a special
function instead of validate_for_write to make error messages more
coherent.
Note, we don't need to resurrect requireNetworkTopologyStrategy as
EACH_QUORUM should work just fine for both CAS and non-CAS writes.
Looks like it is just an artefact of a rebase in the Cassandra
repository.
The dependencies are provided by the frozen toolchain. If a dependency
is missing, we must update the toolchain rather than rely on build-time
installation, which is not reproducible (as different package versions
are available at different times).
Luckily "dnf install" does not update an already-installed package. Had
that been a case, none of our builds would have been reproducible, since
packages would be updated to the latest version as of the build time rather
than the version selected by the frozen toolchain.
So, to prevent missing packages in the frozen toolchain translating to
an unreproducible build, remove the support for installing dependencies
from reloc/build_reloc.sh. We still parse the --nodeps option in case some
script uses it.
Fixes#5222.
Tests: reloc/build_reloc.sh.
MV backpressure code frees mutation for delayed client replies earlier
to save memory. The commit 2d7c026d6e that
introduced the logic claimed to do it only when all replies are received,
but this is not the case. Fix the code to free only when all replies
are received for real.
Fixes#5242
Message-Id: <20191113142117.GA14484@scylladb.com>
Resharding is responsible for the scheduling the deletion of sstables
resharded, but it was not refreshing the cache of the shards those
sstables belong to, which means cache was incorrectly holding reference
to them even after they were deleted. The consequence is sstables
deleted by resharding not having their disk space freed until cache
is refreshed by a subsequent procedure that triggers it.
Fixes#5261.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20191107193550.7860-1-raphaelsc@scylladb.com>
* 'cql-trivial-cleanup' of ssh://github.com/scylladb/scylla-dev:
cql: rename modification_statement::_sets_a_collection to _selects_a_collection
cql: rename _column_conditions to _regular_conditions
cql: remove unnecessary optional around prefetch_data
"
Use a fixed-size, rather than a dynamically growing
bitset for column mask. This avoids unnecessary memory
reallocation in the most common case.
"
* 'column_set' of ssh://github.com/scylladb/scylla-dev:
schema: pre-allocate the bitset of column_set
schema: introduce schema::all_columns_count()
schema: rename column_mask to column_set
Adds per-table metrics for counting partition and row reuse
in memtables. New metrics are as follows:
- memtable_partition_writes - number of write operations performed
on partitions in memtables,
- memtable_partition_hits - number of write operations performed
on partitions that previously existed in a memtable,
- memtable_row_writes - number of row write operations performed
in memtables,
- memtable_row_hits - number of row write operations that ovewrote
rows previously present in a memtable.
Tests: unit(release)
Merged patch series from Dejan Mircevski. Implements the "LT" and "GT"
operators of the Expected update option (i.e., conditional updates),
and enables the pre-existing tests for them.
Since it contains a precise set of columns, it's more
accurate to call it a set, not a mask. Besides, the name
column_mask is already used for column options on storage
level.
This is merely to avoid confusion: we use _sets prefix to indicate that
there are operations over static/regular columns (_sets_static_columns,
_sets_regular_columns), but _sets_a_collection is set for both operations
and conditions. So let's rename it to _selects_a_collection and add some
comments.
It's weird that modification_statement has _static_conditions for
conditions on static columns and _column_conditions for conditions on
regular columns, as if conditions on static columns are not column
conditions. Let's rename _column_conditions to _regular_conditions to
avoid confusion.
Before this commit, an empty non-null value was created for
frozen collection/UDT columns when an INSERT JSON statement was executed
with the value left unspecified or set to null.
This was incompatible with Cassandra which inserted a null (dead cell).
Fixes#5270.
--pkg option on install.sh is introduced for .deb packaging since it requires
different install directory for each subpackage.
But we actually able to use "debian/tmp" for shared install directory,
then we can specify file owner of the package using .install files.
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Message-Id: <20191030203142.31743-1-syuu@scylladb.com>
Adds per-table metrics for counting partition and row reuse
in memtables. New metrics are as follows:
- memtable_partition_writes - number of write operations performed
on partitions in memtables,
- memtable_partition_hits - number of write operations performed
on partitions that previously existed in a memtable,
- memtable_row_writes - number of row write operations performed
in memtables,
- memtable_row_hits - number of row write operations that ovewrote
rows previously present in a memtable.
Tests: unit(release)
This is just the minimum to pass a value to Lua. Right now you can't
actually do anything with it.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
This adds support for all integer types. Followup commits will
implement the missing types.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
This makes it substantially simpler to support both varint and
decimal, which will be implemented in a followup patch.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
With this we support all simple integer types. Followup patches will
implement the missing types.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
This add a wrapper around the lua interpreter so that function
executions are interruptible and return futures.
With this patch it is possible to write and use simple UDFs that take
and return integer values.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
This adds a requires_thread predicate to functions and propagates that
up until we get to code that already returns futures.
We can then use the predicate to decide if we need to use
seastar::async.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
This refactors test_schema_digest_does_not_change to also test a
schema with user defined functions and user defined aggregates.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
With this it is possible to create user defined functions and
aggregates and they are saved to disk and the schema change is
propagated.
It is just not possible to call them yet.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
The parser now rejects having both OR REPLACE and IF NOT EXISTS in the
same statement.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
This updates UDF syntax to the current specification.
In particular, this removes DETERMINISTIC and adds "CALLED ON NULL
INPUT" and "RETURNS NULL ON NULL INPUT".
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
At some point we should make the function list non static, but this
allows us to write tests for now.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
This avoids allocating a std::vector and is more flexible since the
iterator can be passed to erase.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
This is a simple wrapper that allows code that is not in the types
hierarchy to visit a data_value.
Will be used by UDF.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
This change adds a SCYLLA_REPO_URL argument to Dockerfile, which defines
the RPM repository used to install Scylla from.
When building a new Docker image, users can specify the argument by
passing the --build-arg SCYLLA_REPO_URL=<url> option to the docker build
command. If the argument is not specified, the same RPM repository is
used as before, retaining the old default behavior.
We intend to use this in release engineering infrastructure to specify
RPM repositories for nightly builds of release branches (for example,
3.1.x), which are currently only using the stable RPMs.
Code for check_LT(), check_GT(), etc. will be nearly identical, so
factor it out into a single function that takes a comparator object.
Signed-off-by: Dejan Mircevski <dejan@scylladb.com>
In 1ca9dc5d47, it was established that the correct way to
base64-decode a JSON value is via string_view, rather than directly
from GetString().
This patch adds a base64_decode(rjson::value) overload, which
automatically uses the correct procedure. It saves typing, ensures
correctness (fixing one incorrect call found), and will come in handy
for future EXPECTED comparisons.
Signed-off-by: Dejan Mircevski <dejan@scylladb.com>
unwrap_number() is now a public function in serialization.hh instead
of a static function visible only in executor.cc.
Signed-off-by: Dejan Mircevski <dejan@scylladb.com>
Merged patch series from Piotr Sarna:
An otherwise empty partition can still have a valid static column.
Filtering didn't take that fact into account and only filtered
full-fledged rows, which may result in non-matching rows being returned
to the client.
Fixes#5248
"type" label is already in use for the counter type ("derive", "gauge",
etc). Using the same label for "cas" / "non-cas" overwrites it. Let's
instead call the new label "conditional" and use "yes" / "no" for its
value, as suggested by Kostja.
Message-Id: <3082b16e4d6797f064d58da95fb4e50b59ab795c.1572451480.git.vdavydov@scylladb.com>
"
In case when a single reader contributes a stream of fragments and keeps winning over other readers, mutation_reader_merger will enter gallop mode, in which it is assumed that the reader will keep winning over other readers. Currently, a reader needs to contribute 3 fragments to enter that mode.
In gallop mode, fragments returned by the galloping reader will be compared with the best fragment from _fragment_heap. If it wins, the fragment is directly returned. Otherwise, gallop mode ends and merging performed as in general case, which involves heap operations.
In current implementation, when the end of partition is encountered while in gallop mode, the gallop mode is ended unconditionally.
A microbenchmark was added in order to test performance of the galloping reader optimization. A combining reader that merges results from four other readers is created. Each sub-reader provides a range of 32 clustering rows that is disjoint from others. All sub-readers return rows from the same partition. An improvement can be observed after introducing the galloping reader optimization.
As for other benchmarks from the "combined" group, results are pretty close to the old ones. The only one that seems to have suffered slightly is combined.many_overlapping.
Median times from a single run of perf_mutation_readers.combined: (1s run duration, 5 runs per benchmark, release mode)
test name before after improvement
one_row 49.070ns 48.287ns 1.60%
single_active 61.574us 61.235us 0.55%
many_overlapping 488.193us 514.977us -5.49%
disjoint_interleaved 57.462us 57.111us 0.61%
disjoint_ranges 56.545us 56.006us 0.95%
overlapping_partitions_disjoint_rows 127.039us 80.849us 36.36%
Same results, normalized per mutation fragment:
test name before after improvement
one_row 16.36ns 16.10ns 1.60%
single_active 109.46ns 108.86ns 0.55%
many_overlapping 216.97ns 228.88ns -5.49%
disjoint_interleaved 102.15ns 101.53ns 0.61%
disjoint_ranges 100.52ns 99.57ns 0.95%
overlapping_partitions_disjoint_rows 246.38ns 156.80ns 36.36%
Tested on AMD Ryzen Threadripper 2950X @ 3.5GHz.
Tests: unit(release)
Fixes#3593.
"
* '3593-combined_reader-gallop-mode' of https://github.com/piodul/scylla:
mutation_reader: gallop mode microbenchmark
mutation_reader: combined reader gallop tests
mutation_reader: gallop mode for combined reader
mutation_reader: refactor prepare_next
An otherwise empty partition can still have a valid static column.
Filtering didn't take that fact into account and only filtered
full-fledged rows, which may result in non-matching rows being returned
to the client.
Fixes#5248
Update previous results dictionary using the update_metrics method.
It calls metric_source.query_list to get a list of results (similar to discover()) then for each line in the response it updates results dictionary.
New results may be appeneded depending on the do_append parameter (True by default).
Previously, with prometheous, each metric.update called query_list resulting in O(n^2) when all metric were updated, like in the scylla_top dtest - causing test timeout when testing debug build.
(E.g. dtest-debug/216/testReport/scyllatop_test/TestScyllaTop/default_start_test/)
This patch adds "type" label to the following CQL metrics:
inserts
updates
deletes
batches
statements_in_batches
The label is set to "cas" for conditional statements and "non-cas" for
unconditional statements.
Note, for a batch to be accounted as CAS, it is enough to have just one
conditional statement. In this case all statements within the batch are
accounted as CAS as well.
This microbenchmark tests performance of the galloping reader
optimization. A combining reader that merges results from four other
readers is created. Each sub-reader provides a range of 32 clustering
rows that is disjoint from others. All sub-readers return rows from
the same partition. An improvement can be observed after introducing the
galloping reader optimization.
As for other benchmarks from the "combined" group, results are pretty
close to the old ones. The only one that seems to have suffered slightly
is combined.many_overlapping.
Median times from a single run of perf_mutation_readers.combined:
(1s run duration, 5 runs per benchmark, release mode)
test name before after improvement
one_row 49.070ns 48.287ns 1.60%
single_active 61.574us 61.235us 0.55%
many_overlapping 488.193us 514.977us -5.49%
disjoint_interleaved 57.462us 57.111us 0.61%
disjoint_ranges 56.545us 56.006us 0.95%
overlapping_partitions_disjoint_rows 127.039us 80.849us 36.36%
Same results, normalized per mutation fragment:
test name before after improvement
one_row 16.36ns 16.10ns 1.60%
single_active 109.46ns 108.86ns 0.55%
many_overlapping 216.97ns 228.88ns -5.49%
disjoint_interleaved 102.15ns 101.53ns 0.61%
disjoint_ranges 100.52ns 99.57ns 0.95%
overlapping_partitions_disjoint_rows 246.38ns 156.80ns 36.36%
Tested on AMD Ryzen Threadripper 2950X @ 3.5GHz.
In case when a single reader contributes a stream of fragments
and keeps winning over other readers, mutation_reader_merger will
enter gallop mode, in which it is assumed that the reader will keep
winning over other readers. Currently, a reader needs to contribute
3 fragments to enter that mode.
In gallop mode, fragments returned by the galloping reader will be
compared with the best fragment from _fragment_heap. If it wins, the
fragment is directly returned. Otherwise, gallop mode ends and
merging performed as in general case, which involves heap operations.
In current implementation, when the end of partition is encountered
while in gallop mode, the gallop mode is ended unconditionally.
Fixes#3593.
Move out logic responsible for adding readers at partition boundary
into `maybe_add_readers_at_partition_boundary`, and advancing one reader
into `prepare_one`. This will allow to reuse this logic outside
`prepare_next`.
Since seastar::streams are based on future/promise, variadic streams
suffer the same fate as variadic futures - deprecation and eventual
removal.
This patch therefore replaces a variadic stream in commitlog::read_log_file()
with a non-variadic stream, via a helper struct.
Tests: unit (dev)
Recently, scylla memory started to go beyond just providing raw stats
about the occupancy of the various memory pools, to additionally also
provide an overview of the "usual suspects" that cause memory pressure.
As part of this, recently 46341bd63f
added a section of the coordinator stats. This patch continues this
trend and adds a replica section, with the "usual suspects":
* read concurrency semaphores
* execution stages
* read/write operations
Example:
Replica:
Read Concurrency Semaphores:
user sstable reads: 0/100, remaining mem: 84347453 B, queued: 0
streaming sstable reads: 0/ 10, remaining mem: 84347453 B, queued: 0
system sstable reads: 0/ 10, remaining mem: 84347453 B, queued: 0
Execution Stages:
data query stage:
03 "service_level_sg_0" 4967
Total 4967
mutation query stage:
Total 0
apply stage:
03 "service_level_sg_0" 12608
06 "statement" 3509
Total 16117
Tables - Ongoing Operations:
pending writes phaser (top 10):
2 ks.table1
2 Total (all)
pending reads phaser (top 10):
3380 ks.table2
898 ks.table1
410 ks.table3
262 ks.table4
17 ks.table8
2 system_auth.roles
4969 Total (all)
pending streams phaser (top 10):
0 Total (all)
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20191029164817.99865-1-bdenes@scylladb.com>
This patch adds the following per table stats:
cas_prepare_latency
cas_propose_latency
cas_commit_latency
They are equivalent to CasPropose, CasPrepare, CasCommit metrics exposed
by Cassandra.
This patch implements accounting of Cassandra's metrics related to
lightweight transactions, namely:
cas_read_latency transactional read latency (histogram)
cas_write_latency transactional write latency (histogram)
cas_read_timeouts number of transactional read timeouts
cas_write_timeouts number of transactional write timeouts
cas_read_unavailable number of transactional read
unavailable errors
cas_write_unavailable number of transactional write
unavailable errors
cas_read_unfinished_commit number of transaction commit attempts
that occurred on read
cas_write_unfinished_commit number of transaction commit attempts
that occurred on write
cas_write_condition_not_met number of transaction preconditions
that did not match current values
cas_read_contention how many contended reads were
encountered (histogram)
cas_write_contention how many contended writes were
encountered (histogram)
Pass contention by reference to begin_and_repair_paxos(), where it is
incremented on every sleep. Rationale: we want to account the total
number of times query() / cas() had to sleep, either directly or within
begin_and_repair_paxos(), no matter if the function failed or succeeded.
Even though every Scylla version has its own scylla-gdb.py, because we
don't backport any fixes or improvements, practically we end up always
using master's version when debugging older versions of Scylla too. This
is made harder by the fact that both Scylla's and its dependencies'
(most notably that of libstdc++ and boost) code is constantly changing
between releases, requiring edits to scylla-gdb.py to make it usable
with past releases.
This patch attempts to make it easier to use scylla-gdb.py with past
releases, more specifically Scylla 3.0. This is achieved by wrapping
problematic lines in a `try: except:` and putting the backward
compatible version in the `except:` clause. These lines have comments
with the version they provide support for, so they can be removed when
said version is not supported anymore.
I did not attempt to provide full coverage, I only fixed up problems
that surfaced when using my favourite commands with 3.0.
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20191029155737.94456-1-bdenes@scylladb.com>
The loop that collects the result of the checksum calculations and logs
any errors. The error logging includes `checksums[0]` which corresponds
to the checksum calculation on the local node. This violates the
assumption of the code following the loop, which assumes that the future
of `checksums[0]` is intact after the loop terminates. However this is
only true when the checksum calculation is successful and is false when
it fails, as in this case the loop extracts the error and logs it. When
the code after the loop checks again whether said calculation failed, it
will get a false negative and will go ahead and attempt to extract the
value, triggering an assert failure.
Fix by making sure that even in the case of failed checksum calculation,
the result of `checksum[0]` is extracted only once.
Fixes: #5238
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20191029151709.90986-1-bdenes@scylladb.com>
* seastar 2963970f6b...75e189c6ba (7):
> posix-stack: Do auto-resolve of ipv6 scope iff not set for link-local dests
> README.md: Add redpanda and smf to 'Projects using Seastar'
> unix_domain_test: don't assume that at temporary_buffer is null terminated
> socket_address: Use offsetof instead of null pointer
> README: add projects using seastar section to readme
> Adjustments for glibc 2.30 and hwloc 2.0
> Mark future::failed() as const
We may want to change paxos tables format and change internode protocol,
so hide lwt behind experimental flag for now.
Message-Id: <20191029102725.GM2866@scylladb.com>
Currently end of stream validation is done in the destructor,
but the validator may be destructed prematurely, e.g. on
exception, as seen in https://github.com/scylladb/scylla/issues/5215
This patch adds a on_end_of_stream() method explicitly called by
consume_pausable_in_thread. Also, the respective concepts for
ParitionFilter, MutationFragmentFilter and a new on for the
on_end_of_stream method were unified as FlattenedConsumerFilter.
Refs #5215
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
(cherry picked from commit 506ff40bd447f00158c24859819d4bb06436c996)
There are a few issues at the CQL layer, because of which the result of
a CAS request execution may differ between Scylla and Cassandra. Mostly,
it happens when static columns are involved. The goal of this patch set
is to fix these issues, thus making Scylla's implementation of CAS yield
the same results as Cassandra's.
Merged patch series by Calle Wilund, with a few fixes by Piotr Jastrzębski:
Adds delta and pre-image data column writes for the atomic columns in a
cdc-enabled table.
Note that in this patch set it is still unconditional. Adding option support
comes in next set.
Uses code more or less derived from alternator to select pre-image, using
raw query interface. So should be fairly low overhead to query generation.
Pre-image and delta mutations are mixed in with the actual modification
mutations to generate the full cdc log (sans post-image).
Even if no rows match clustering key restrictions of a conditional
statement with static columns conditions, we still must include the
static column value into the CAS failure result set. For example,
the following conditional DELETE statement
create table t(k int, c int, s int static, v int, primary key(k, c));
insert into t(k, s) values(1, 1);
delete v from t where k=1 and c=1 if v=1 and s=1;
must return
[applied=False, v=null, s=1]
not just
[applied=False, v=null, s=null]
To fix that, set partition_slice::option::always_return_static_content
for querying rows used for checking conditions so that we have the
static row in update_parameters::prefetch_data even if no regular row
matches clustering column restrictions. Plus modify cas_request::
applies_to() so that it sets is_in_cas_result_set flag for the static
row in case there are static column conditions, but the result set
happens to be empty.
As pointed out by Tomek, there's another reason to set partition_slice::
option::always_return_static_content apart from building a correct
result set on CAS failure. There could be a batch with two statements,
one with clustering key restrictions which select no row, and another
statement with only static column conditions. If we didn't enable this
flag, we wouldn't get a static row even if it exists, and static column
conditions would evaluate as if the static row didn't exist, for
example, the following batch
create table t(k int, c int, s int static, primary key(k, c));
insert into t(k, s) values(1, 1);
begin batch
insert into t(k, c) values(1, 1) if not exists
update t set s = 2 where k = 1 if s = 1
apply batch;
would fail although it clearly must succeed.
A SELECT statement that has clustering key restrictions isn't supposed
to return static content if no regular rows matches the restrictions,
see #589. However, for the CAS statement we do need to return static
content on failure so this patch adds a flag that allows the caller to
override this behavior.
Apart from conditional statements, there may be other reading statements
in a batch, e.g. manipulating lists. We must not include rows fetched
for them into the CAS result set. For instance, the following CAS batch:
create table t(p int, c int, i int, l list<int>, primary key(p, c));
insert into t(p, c, i) values(1, 1, 1)
insert into t(p, c, i, l) values(1, 1, 1, [1, 2, 3])
begin batch
update t set i=3 where p=1 and c=1 if i=2
update t set l=l-[2] where p=1 and c=2
apply batch;
is supposed to return
[applied] | p | c | i
----------+---+---+---
False | 1 | 1 | 1
not
[applied] | p | c | i
----------+---+---+---
False | 1 | 1 | 1
False | 1 | 2 | 1
To filter out such collateral rows from the result set, let's mark rows
checked by conditional statements with a special flag.
If a CQL statement only updates static columns, i.e. has no clustering
key restrictions, we still fetch a regular row so that we can check it
against EXISTS condition. In this case we must be especially careful: we
can't simply pass the row to modification_statement::applies_to, because
it may turn out that the row has no static columns set, i.e. there's no
in fact static row in the partition. So we filter out such rows without
static columns right in cas_request::applies_to before passing them
further to modification_statement::applies_to.
Example:
create table t(p int, c int, s int static, primary key(p, c));
insert into t(p, c) values(1, 1);
insert into t(p, s) values(1, 1) if not exists;
The conditional statement must succeed in this case.
In case a CQL statement has only static columns conditions, we must
ignore clustering key restrictions.
Example:
create table t(p int, c int, s int static, v int, primary key(p, c));
insert into t(p, s) values(1, 1);
update t set v=1 where p=1 and c=1 if s=1;
This conditional statement must successfully insert row (p=1, c=1, v=1)
into the table even though there's no regular row with p=1 and c=1 in
the table before it's executed, because the statement condition only
applies to the static column s, which exists and matches.
If a modification statement doesn't have a clustering column restriction
while the table has static columns, then EXISTS condition just needs to
check if there's a static row in the partition, i.e. it doesn't need to
select any regular rows. Let's treat such EXIST condition like a static
column condition so that we can ignore its clustering key range while
checking CAS conditions.
This will allow us to add helper methods and store extra info in each
row. For example, we can add a method for checking if a row has static
columns. Also, to build CAS result set, we need to differentiate rows
fetched to check conditions from those fetched for reading operations.
Using struct as row container will allow us to store this information in
each prefetched row.
Currently, we set _sets_regular_columns/_sets_static_columns flags when
adding regular/static conditions to modification_statement. We use them
in applies_only_to_static_columns() function that returns true iff
_sets_static_columns is set and _sets_regular_columns is clear. We
assume that if this function returns true then the statement only deals
with static columns and so must not have clustering key restrictions.
Usually, that's true, but there's one exception: DELETE FROM ...
statement that deletes whole rows. Technically, this statement doesn't
have any column operations, i.e. _sets_regular_columns flag is clear.
So if such a statement happens to have a static condition, we will
assume that it only applies to static columns and mistakenly raise an
error.
Example:
create table t(k int, c int, s int static, v int, primary key(k, c));
delete from t where k=1 and c=1 if s=1;
To fix this, let's not set the above mentioned flags when adding
conditions and instead check if _column_conditions array is empty in
applies_only_to_static_columns().
modification_statement::process_where_clause() assumes that both
operations and conditions has been added to the statement when it's
called: it uses this information to raise an error in case the statement
restrictions are incompatible with operations or conditions. Currently,
operations are set before this function is called, but not conditions.
This results in "Invalid restrictions on clustering columns since
the {} statement modifies only static columns" error while trying to
execute the following statements:
create table t(k int, c int, s int static, v int, primary key(k, c));
delete s from t where k=1 and c=1 if v=1;
update t set s=1 where k=1 and c=1 if v=1;
Fix this by always initializing conditions before processing WHERE
clause.
Print a histogram of the number of async work items in the shard's
outgoing smp queues.
Example:
(gdb) scylla smp-queues
10747 17 -> 3 ++++++++++++++++++++++++++++++++++++++++
721 17 -> 19 ++
247 17 -> 20 +
233 17 -> 10 +
210 17 -> 14 +
205 17 -> 4 +
204 17 -> 5 +
198 17 -> 16 +
197 17 -> 6 +
189 17 -> 11 +
181 17 -> 1 +
179 17 -> 13 +
176 17 -> 2 +
173 17 -> 0 +
163 17 -> 8 +
1 17 -> 9 +
Useful for identifying the target shard, when `scylla task_histogram`
indicates a high number of async work items.
To produce the histogram the command goes over all virtual objects in
memory and identifies the source and target queues of each
`seastar::smp_message_queue::async_work_item` object. Practically the
source queue will always be that of the current shard. As this scales
with the number of virtual objects in memory, it can take some time to
run. An alternative implementation would be to instead read the actual
smp queues, but the code of that is scary so I went for the simpler and
more reliable solution.
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20191028132456.37796-1-bdenes@scylladb.com>
This patch set introduces light-weight transactions support to
ScyllaDB. It is a subset of the full series, which adds
basic LWT support and which has been reviewed thus far.
"
mutation_test/test_udt_mutations kept failing on my machine and I tracked it down to the 3rd patch in this series (use int64_t constants for long_type). While at it, this series also fixes a comment and the end iterator in BOOST_REQUIRE(std::all_of(...))
mutation_test: test_udt_mutations: fixup udt comment
mutation_test: test_udt_mutations: fix end iterator in call to std::all_of
mutation_test: test_udt_mutations: use int64_t constants for long_type
Test: mutation_test(dev, debug)
"
* 'test_udt_mutations-fixes' of https://github.com/bhalevy/scylla:
mutation_test: test_udt_mutations: use int64_t constants for long_type
mutation_test: test_udt_mutations: fix end iterator in call to std::all_of
mutation_test: test_udt_mutations: fixup udt comment
Based on a mutation, creates a pre-image select operation.
Note, this uses raw proxy query to shortcut parsing etc,
instead of trying to cache by generated query. Hypothesis is that
this is essentially faster.
The routine assumes all rows in a mutation touch same static/regular
columns. If this is not always true it will need additional
calculations.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Support single-statement conditional updates and as well as batches.
This patch almost fully rewrites column_condition.cc, implementing
is_satisfied_by().
Most of the remaining complications in column_condition implementation
come from the need to properly handle frozen and multi-cell
collection in predicates - up until now it was not possible
to compare entire collection values between each other. This is further
complicated since multi-cell lists and sets are returned as maps.
We can no longer assume that the columns fetched by prefetch operation
are non-frozen collections. IF EXISTS/IF NOT EXISTS condition
fetches all columns, besides, a column may be needed to check other
condition.
When fetching the old row for LWT or to apply updates on list/columns,
we now calculate precisely the list of columns to fetch.
The primary key columns are also included in CAS batch result set,
and are thus also prefetched (the user needs them to figure out which
statements failed to apply).
The patch is cross-checked for compatibility with cassandra-3.11.4-1545-g86812fa502
but does deviate from the origin in handling of conditions on static
row cells. This is addressed in future series.
Each column_condition and raw::column_condition construction case had a
static method wrapping its constructor, simply supplying some defaults.
This neither improves clarity nor maintainability.
cql_statement_opt_metadata is an interim node
in cql (prepared) statement hierarchy parenting
modification_statement and batch_statement. If there
is IF condition in such statements, they return a result set,
and thus have a result set metadata.
The metadata itself is filled in a subsequent patch.
Add checks for conditional modification statement limitations:
- WHERE clustering_key IN (list) IF condition is not supported
since a conditions is evaluated for a single row/cell, so
allowing multiple rows to match the WHERE clause would create
ambiguity,
- the same is true for conditional range deletions.
- ensure all clustering restrictions are eq for conditional delete
We must not allow statements like
create table t(p int, c int, v int, primary key (p, c));
delete from t where p=1 and c>0 if v=1;
because there may be more than one statement in a partition satisfying
WHERE clause, in which case it's unclear which of them should satisfy
IF condition: all or just one.
Raising an error on such a statement is consistent with Cassandra's
behavior.
Introduce service::cas_request abstract base class
which can be used to parameterize Paxos logic.
Implement storage_proxy::cas() - compare and swap - the storage proxy
entry point for lightweight transactions.
Currently the code that manipulates mutations during write need to
check what kind of mutations are those and (sometimes) choose different
code paths. This patch encapsulates the differences in virtual
functions of mutation_holder object, so that high level code will not
concern itself with the details. The functions that are added:
apply_locally(), apply_remotely() and store_hint().
This patch adds all functionality needed for Paxos protocol. The
implementation does not strictly adhere to Paxos paper since the original
paper allows setting a value only once, while for LWT we need to be able
to make another Paxos round after "learn" phase completes, which requires
things like repair to be introduced.
Paxos protocol has three stages: prepare, accept, learn. This patch adds
rpc verb for each of those stages. To be term compatible with Cassandra
the patch calls those stages: prepare, propose, commit.
Paxos protocol relies on replicas having a state that persists over
crashes/restarts. This patch defines such state and stores it in the
database itself in the paxos table to make it persistent.
The stored state is:
in_progress_ballot - promised ballot
proposal - accepted value
proposal_ballot - the ballot of the accepted value
most_recent_commit - most recently learned value
most_recent_commit_at - the ballot of the most recently learned value
This patch add two data structures that will be used by paxos. First
one is "proposal" which contains a ballot and a mutation representing
a value paxos protocol is trying to set. Second one is
"prepare_response" which is a value returned by paxos prepare stage.
It contains currently accepted value (if any) and most recently
learned value (again if any). The later is used to "repair" replicas
that missed previous "learn" message.
Otherwise they are decomposed and serialized as 4-byte int32.
For example, on my machine cell[1] looked like this:
{0002, atomic_cell{0000000310600000;ts=0;expiry=-1,ttl=0}}
and it failed cells_equal against:
{0002, atomic_cell{0000000300000000;ts=0;expiry=-1,ttl=0}}
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
server::set_routes() was setting the value of server::_callbacks.
This led to a race condition, as set_routes() is invoked on every
shard simultaneously. It is also unnecessary, since _callbacks can be
initialized in the constructor.
Fixes#5220.
Signed-off-by: Dejan Mircevski <dejan@scylladb.com>
"
Introduce the traced_file class which wraps a file, adding CQL trace messages before and after every operation that returns a future.
Use this file to trace reads from SSTable data and index files.
Fixes#4908.
"
* 'traced_file' of https://github.com/kbr-/scylla:
sstables: report sstable index file I/O in CQL tracing
sstables: report sstable data file I/O in CQL tracing
tracing: add traced_file class
"
This change allows creating tables with non-frozen UDT columns. Such columns can then have single fields modified or deleted.
I had to do some refactoring first. Please read the initial commit messages, they are pretty descriptive of what happened (read the commits in the order they are listed on my branch: https://github.com/kbr-/scylla/commits/udt, starting from kbr-@8eee36e, in order to understand them). I also wrote a bunch of documentation in the code.
Fixes#2201.
"
* 'udt' of https://github.com/kbr-/scylla: (64 commits)
tests: too many UDT fields check test
collection_mutation: add a FIXME.
tests: add a non-frozen UDT materialized view test
tests: add a UDT mutation test.
tests: add a non-frozen UDT "JSON INSERT" test.
tests: add a non-frozen UDT to for_each_schema_change.
tests: more non-frozen UDT tests.
tests: move some UDT tests from cql_query_test.cc to new file.
types: handle trailing nulls in tuples/UDTs better.
cql3: enable deleting single fields of non-frozen UDTs.
cql3: enable setting single fields of a non-frozen UDT.
cql3: enable non-frozen UDTs.
cql3: introduce user_types::marker.
cql3: generalize function_call::make_terminal to UDTs.
cql3: generalize insert_prepared_json_statement::execute_set_value to UDTs.
cql3: use a dedicated setter operation for inserting user types.
cql3: introduce user_types::value.
types: introduce to_bytes_opt_vec function.
cql3: make user_types::delayed_value::bind_internal return vector<bytes_opt>.
cql3: make cql3_type::raw_ut::to_string distinguish frozenness.
...
The health check is performed simply by issuing a GET request
to the alternator port - it returns the following status 200
response when the server is healthy:
$ curl -i localhost:8000
HTTP/1.1 200 OK
Content-Type: text/plain
Content-Length: 23
Server: Seastar httpd
Date: 21 Oct 2019 12:55:33 GMT
healthy: localhost:8000
This commit comes with a test.
Fixes#5050
Message-Id: <3050b3819661ee19640c78372e655470c1e1089c.1571921618.git.sarna@scylladb.com>
We could use iterators over cells instead of a vector of cells
in collection_mutation(_view)_description. Then some use cases could
provide iterators that construct the cells "on the fly".
Comparing user types after adding new fields was bugged.
In the following scenario:
create type ut (a int);
create table cf (a int primary key, b frozen<ut>);
insert into cf (a, b) values (0, (0));
alter type ut add b int;
select * from cf where b = {a:0,b:null};
the row with a = 0 should be returned, even though the value stored in the database is shorter
(by one null) than the value given by the user. Until now it wouldn't
have.
Add a cluster feature for non-frozen UDTs.
If the cluster supports non-frozen UDTs, do not return an error
message when trying to create a table with a non-frozen user type.
cql3::user_types::marker is a dedicated cql3::abstract_marker for user
type placeholders in prepared CQL queries. When bound, it returns a
user_types::value.
Previously it returned vector<cql3::raw_value>, even though we don't use
unset values when setting a UDT value (fields that are not provided
become nulls. Thats how C* does it).
This simplifies future implementation of user_types::{value, setter}.
is_value_compatible_with_internal and update_user_type were generalized
to the non-frozen case.
For now, all user_type_impls in the code are non-multi-cell (frozen).
This will be changed in future commits.
These functions are used to translate field indices, which are used to
identify fields inside UDTs, from/to a serialized representation to be
stored inside sstables and mutations.
They do it in a way that is compatible with C*.
The purpose of collection_type_impl::to_value was to serialize a
collection for sending over CQL. The corresponding function in origin
is called serializeForNativeProtocol, but the name is a bit lengthy,
so I settled for serialize_for_cql.
The method now became a free-standing function, using the visit
function to perform a dispatch on the collection type instead
of a virtual call. This also makes it easier to generalize it to UDTs
in future commits.
Remove the old serialize_for_native_protocol with a FIXME: implement
inside. It was already implemented (to_value), just called differently.
remove dead methods: enforce_limit and serialized_values. The
corresponding methods in C* are auxiliary methods used inside
serializeForNativeProtocol. In our case, the entire algorithm
is wholly written in serialize_for_cql.
`collection_type_impl::serialize_mutation_form`
became `collection_mutation(_view)_description::serialize`.
Previously callers had to cast their data_type down to collection_type
to use serialize_mutation_form. Now it's done inside `serialize`.
In the future `serialize` will be generalized to handle UDTs.
`collection_type_impl::deserialize_mutation_form`
became a free standing function `deserialize_collection_mutation`
with similiar benefits. Actually, noone needs to call this function
manually because of the next paragraph.
A common pattern consisting of linearizing data inside a `collection_mutation_view`
followed by calling `deserialize_mutation_form` has been abstracted out
as a `with_deserialized` method inside collection_mutation_view.
serialize_mutation_form_only_live was removed,
because it hadn't been used anywhere.
collection_type_impl::mutation became collection_mutation_description.
collection_type_impl::mutation_view became collection_mutation_view_description.
These classes now reside inside collection_mutation.hh.
Additional documentation has been written for these classes.
Related function implementations were moved to collection_mutation.cc.
This makes it easier to generalize these classes to non-frozen UDTs in future commits.
The new names (together with documentation) better describe their purpose.
The classes 'collection_mutation' and 'collection_mutation_view'
were moved to a separate header, collection_mutation.hh.
Implementations of functions that operate on these classes,
including some methods of collection_type_impl, were moved
to a separate compilation unit, collection_mutation.cc.
This makes it easier to modify these structures in future commits
in order to generalize them for non-frozen User Defined Types.
Some additional documentation has been written for collection_mutation.
Similar to "gossip: Limit number of pending gossip ACK messages", limit
the number of pending gossip ACK2 messages in gossiper::handle_ack_msg.
Fixes#5210
In a cross-dc large cluster, the receiver node of the gossip SYN message
might be slow to send the gossip ACK message. The ack messages can be
large if the payload of the application state is big, e.g.,
CACHE_HITRATES with a lot of tables. As a result, the unlimited ACK
message can consume unlimited amount of memory which causes OOM
eventually.
To fix, this patch queues the SYN message and handles it later if the
previous ACK message is still being sent. However, we only store the
latest SYN message. Since the latest SYN message from peer has the
latest information, so it is safe to drop the previous SYN message and
keep the latest one only. After this patch, there can be at most 1
pending SYN message and 1 pending ACK message per peer node.
Fixes#5210
Merged patch series from Piotr Sarna:
This series couples system_auth.roles with authorization routines
in alternator. The `salted_hash` field, which is every user's hashed
password, is used as a secret key for the signature generation
in alternator.
This series also adds related expiration verifications for alternator
signatures.
It also comes with more test cases and docs updates.
Tests: alternator(local, remote), manual
Piotr Sarna (11):
alternator: add extracting key from system_auth.roles
alternator: futurize verify_signature function
alternator: move the api handler to a separate function
alternator: use keys from system_auth.roles for authorization
alternator: add key cache to authorization
alternator-test: add a wrong password test
alternator: verify that the signature has not expired
alternator: add additional datestamp verification
alternator-test: add tests for expired signatures
docs: update alternator entry for authorization
alternator-test: add authorization to README
alternator-test/conftest.py | 2 +-
alternator-test/test_authorization.py | 44 ++++++++-
alternator-test/test_describe_endpoints.py | 2 +-
alternator/auth.hh | 15 ++-
alternator/server.hh | 10 +-
alternator/auth.cc | 62 +++++++++++-
alternator/server.cc | 106 ++++++++++++---------
alternator-test/README.md | 28 ++++++
docs/alternator/alternator.md | 7 +-
9 files changed, 221 insertions(+), 55 deletions(-)
ommit 93270dd changed gc_clock to be 64-bit, to fix the Y2038
problem. While 64-bit tombstone::deletion_time is serialized in a
compatible way, TTLs (gc_clock::duration) were not.
This patchset reverts TTL serialization to the 32-bit serialization
format, and also allows opting-in to the 64-bit format in case a
cluster was installed with the broken code. Only Scylla 3.1.0 is
vulnerable.
Fixes#4855
Tests: unit (dev)
From Shlomi:
4 node cluster Node A, B, C, D (Node A: seed)
cassandra-stress write n=10000000 -pop seq=1..10000000 -node <seed-node>
cassandra-stress read duration=10h -pop seq=1..10000000 -node <seed-node>
while read is progressing
Node D: nodetool decommission
Node A: nodetool status node - wait for UL
Node A: nodetool cleanup (while decommission progresses)
I get the error on c-s once decommission ends
java.io.IOException: Operation x0 on key(s) [383633374d31504b5030]: Data returned was not validated
The problem is when a node gets new ranges, e.g, the bootstrapping node, the
existing nodes after a node is removed or decommissioned, nodetool cleanup will
remove data within the new ranges which the node just gets from other nodes.
To fix, we should reject the nodetool cleanup when there is pending ranges on that node.
Note, rejecting nodetool cleanup is not a full protection because new ranges
can be assigned to the node while cleanup is still in progress. However, it is
a good start to reject until we have full protection solution.
Refs: #5045
Scylla 3.1.0 broke the serialization format for TTLs. Later versions
corrected it, but if a cluster was originally installed as 3.1.0,
it will use the broken serialization forever. This configuration option
allows upgrades from 3.1.0 to succeed, by enabling the broken format
even for later versions.
Scylla 3.1.0 inadvertently changed the serialization format of TTLs
(internally represented as gc_clock::duration) from 32-bit to 64-bit,
as part of preparation for Y2038 (which comes earlier for TTLed cells).
This breaks mutations transported in a mixed cluster.
To fix this, we revert back to the 32-bit format, unless we're in a 3.1.0-
heritage cluster, in which case we use the 64-bit format. Overflow of
a TTL is not a concern, since TTLs are capped to 20 years by the TTL layer.
An assertion is added to verify this.
This patch only defines a variable to indicate we're in
a 3.1.0 heritage cluster, but a way to set it is left to
a later patch.
* seastar 6bcb17c964...2963970f6b (4):
> Merge "IPv6 scope support and network interface impl" from Calle
> noncopyable_function: do not copy uninitialized data
> Merge "Move smp and smp queue out of reactor" from Asias
> Consolidate posix socket implementations
The README paragraph informs about turning on authorization with:
alternator-enforce-authorization: true
and has a short note on how to set up the secret key for tests.
The first test case ensures that expired signatures are not accepted,
while the second one checks that signatures with dates that reach out
too far into the future are also refused.
The authorization signature contains both a full obligatory date header
and a shortened datestamp - an additional verification step ensures that
the shortened stamp matches the full date.
AWS signatures have a 15min expiration policy. For compatibility,
the same policy is applied for alternator requests. The policy also
ensures that signatures expanding more than 15 minutes into the future
are treated as unsafe and thus not accepted.
The additional test case submits a request as a user that is expected
to exist (in the local setup), but the provided password is incorrect.
It also updates test_wrong_key_access so it uses an empty string
for trying to authenticate as an inexistent user - in order to cover
more corner cases.
In order to avoid fetching keys from system_auth.roles system table
on every request, a cache layer is introduced. And in order not to
reinvent the wheel, the existing implementation of loading_cache
with max size 1024 and a 1 minute timeout is used.
Instead of having a hardcoded secret key, the server now verifies
an actual key extracted from system_auth.roles system table.
This commit comes with a test update - instead of 'whatever':'whatever',
the credentials used for a local run are 'alternator':'secret_pass',
which matches the initial contents of system_auth.roles table,
which acts as a key store.
Fixes#5046
The lambda used for handling the api request has grown a little bit
too large, so it's moved to a separate method. Along with it,
the callbacks are now remembered inside the class itself.
The verify_signature utility will later be coupled with Scylla
authorization. In order to prepare for that, it is first transformed
into a function that returns future<>, and it also becomes a member
of class server. The reason it becoming a member function is that
it will make it easier to implement a server-local key cache.
As a first step towards coupling alternator authorization with Scylla
authorization, a helper function for extracting the key (salted_hash)
belonging to the user is added.
From Shlomi:
4 node cluster Node A, B, C, D (Node A: seed)
cassandra-stress write n=10000000 -pop seq=1..10000000 -node <seed-node>
cassandra-stress read duration=10h -pop seq=1..10000000 -node <seed-node>
while read is progressing
Node D: nodetool decommission
Node A: nodetool status node - wait for UL
Node A: nodetool cleanup (while decommission progresses)
I get the error on c-s once decommission ends
java.io.IOException: Operation x0 on key(s) [383633374d31504b5030]: Data returned was not validated
The problem is when a node gets new ranges, e.g, the bootstrapping node, the
existing nodes after a node is removed or decommissioned, nodetool cleanup will
remove data within the new ranges which the node just gets from other nodes.
To fix, we should reject the nodetool cleanup when there is pending ranges on that node.
Note, rejecting nodetool cleanup is not a full protection because new ranges
can be assigned to the node while cleanup is still in progress. However, it is
a good start to reject until we have full protection solution.
Refs: #5045
Argument evaluation order is UB, so it's not guaranteed that
c->make_garbage_collected_sstable_writer() is called before
compaction is moved to run().
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20191023052647.9066-1-raphaelsc@scylladb.com>
Make it possible to compare multi-cell lists and sets serialized
as maps with literal values and serialize them to network using
a standard format (vector of values).
This is a pre-requisite patch for column condition evaluation
in light-weight transactions.
Merged patch series from Botond Dénes:
This series extends the existing docs/debugging.md with a detailed guide
on how to debug Scylla coredumps. The intended target audience is
developers who are debugging their first core, hence the level of
details (hopefully enough). That said this should be just as useful for
seasoned debuggers just quickly looking up some snippet they can't
remember exactly. A Throubleshooting chapter is also added in this
series for commonly-met problems.
I decided to create this guide after myself having struggled for more
than a day on just opening(!) a coredump that was produced on Ubuntu.
As my main source, I used the How-to-debug-a-coredump page from the
internal wiki which contains many useful information on debugging
coredumps, however I found it to be missing some crucial information, as
well being very terse, thus being primarily useful for experienced
debuggers who can fill in the blanks. The reason I'm not extending said
wiki page is that I think this information should not be hidden in some
internal wiki page. Also, docs/debugging.md now seems to be a much
better base for such a document. This document was started as a
comprehensive debugging manual for beginners (but not just).
You will notice that the information on how to debug cores from
CentOS/Redhat are quite sparse. This is because I have no experience
with such cores, so for now the respective chapters are just stubs. I
intend to complete them in the future after having gained the necessary
experience and knowledge, however those being in possession of said
knowledge are more then welcome to send a patch. :)
Botond Dénes (4):
docs/debugging.md: demote 'Starting GDB' and 'Using GDB'
docs/debugging.md: fix formatting issues
docs/debugging.md: add 'Debugging coredumps' subchapter
docs/debugging.md: add 'Throubleshooting' subchapter
docs/debugging.md | 240 +++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 228 insertions(+), 12 deletions(-)
Add lua as a dependency in preparation for UDF. This is the first
patch since it has to go in before to allow for a frozen toolchain
update.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
[avi: update frozen toolchain image]
Message-Id: <20191018231442.11864-2-espindola@scylladb.com>
Multi-cell lists and maps may be stored in different formats: as sorted
vectors of pairs of values, when retreived from storage, or as sorted
vectors of values, when created from parser literals or supplied as
parameter values.
Implement a specialized compare for use when receiver and paramter
representation don't match.
Add helpers.
The problem is that backlog tracker is not being updated properly after
incremental compaction.
When replacing sstables earlier, we tell backlog tracker that we're done
with exhausted sstables[1], but we *don't* tell it about the new, sealed
sstables created that will replace the exhausted ones.
[1]: exhausted sstable is one that can be replaced earlier by compaction.
We need to notify backlog tracker about every sstable replacement which
was triggered by incremental compaction.
Otherwise, backlog for a table that enables incremental compaction will
be lower than it actually should. That's because new sstables being
tracked as partial decrease the backlog, whereas the exhausted ones
increase it.
The formula for a table's backlog is basically:
backlog(sstable set + compacting(1) - partial(2))
(1) compacting includes all compaction's input sstables, but the
exhausted ones are removed from it (correct behavior).
(2) partial includes all compaction's output sstables, but the ones
that replaced the exhausted sstables aren't removed from it (incorrect
behavior).
This problem is fixed by making backlog track *fully* aware of the early
replacement, not only the exhausted sstables, but also the new sstables
that replaced the exhausted ones. The new sstables need to be moved
inside the tracker from partial state to the regular one.
Fixes#5157.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20191016002838.23811-1-raphaelsc@scylladb.com>
Rather than passing a pointer to a cql_stats member corresponding to
the statement type, pass a reference to a cql_stats object and use
statement_type, which is already stored in modification_statement, for
determining which counter to increment. This will allow us to account
conditional statements, which will have a separate set of counters,
right in modification_statement::execute() - all we'll need to do is
add the new counters and bump them in case execute_with_condition is
called.
While we are at it, remove extra inclusions from statement_type.hh so as
not to introduce any extra dependencies for cql_stats.hh users.
Message-Id: <20191022092258.GC21588@esperanza>
Merged patch series from Avi Kivity:
The static row can be rare: many tables don't have them, and tables
that do will often have mutations without them (if the static row
is rarely updated, it may be present in the cache and in readers,
but absent in memtable mutations). However, it always consumes ~100
bytes of memory, even if it not present, due to row's overhead.
Change it to be optional by allocating it as an external object rather
than inlined into mutation_partition. This adds overhead when the
static row is present (17 bytes for the reference, back reference,
and lsa allocator overhead).
perf_simple_query appears to marginally (2%) faster. Footprint is
reduced by ~9% for a cache entry, 12% in memtables. More details are
provided in the patch commitlog.
Tests: unit (debug)
Avi Kivity (4):
managed_ref: add get() accessor
managed_ref: add external_memory_usage()
mutation_partition: introduce lazy_row
mutation_partition: make static_row optional to reduce memory
footprint
cell_locking.hh | 2 +-
converting_mutation_partition_applier.hh | 4 +-
mutation_partition.hh | 284 ++++++++++++++++++++++-
partition_builder.hh | 4 +-
utils/managed_ref.hh | 12 +
flat_mutation_reader.cc | 2 +-
memtable.cc | 2 +-
mutation_partition.cc | 45 +++-
mutation_partition_serializer.cc | 2 +-
partition_version.cc | 4 +-
tests/multishard_mutation_query_test.cc | 2 +-
tests/mutation_source_test.cc | 2 +-
tests/mutation_test.cc | 12 +-
tests/sstable_mutation_test.cc | 10 +-
14 files changed, 355 insertions(+), 32 deletions(-)
"
The node startup code (in particular the functions storage_service::prepare_to_join and storage_service::join_token_ring) is complicated and hard to understand.
This patch set aims to simplify it at least a bit by removing some dead code, moving code around so it's easier to understand and adding some comments that explain what the code does.
I did it to help me prepare for implementing generation and gossiping of CDC streams.
"
* 'bootstrap-refactors' of https://github.com/kbr-/scylla:
storage_service: more comments in join_token_ring
db: remove system_keyspace::update_local_tokens
db: improve documentation for update_tokens and get_saved_tokens in system_keyspace
storage_service: remove storage_service::_is_bootstrap_mode.
storage_service: simplify storage_service::bootstrap method
storage_service: fix typo in handle_state_moving
storage_service: remove unnecessary use of stringstream
storage_service: remove redundant call to update_tokens during join_token_ring
storage_service: remove storage_service::set_tokens method.
storage_service: remove is_survey_mode
storage_service::handle_state_normal: tokens_to_update* -> owned_tokens
storage_service::handle_state_normal: remove local_tokens_to_remove
db::system_keyspace::update_tokens: take tokens by const ref
db::system_keyspace::prepare_tokens: make static, take tokens by const ref
token_metadata::update_normal_tokens: take tokens by const ref
Assume n1 and n2 in a cluster with generation number g1, g2. The
cluster runs for more than 1 year (MAX_GENERATION_DIFFERENCE). When n1
reboots with generation g1' which is time based, n2 will see
g1' > g2 + MAX_GENERATION_DIFFERENCE and reject n1's gossip update.
To fix, check the generation drift with generation value this node would
get if this node were restarted.
This is a backport of CASSANDRA-10969.
Fixes#5164
The flag did nothing. It was used in one place to check if there's a
bug, but it can easily by proven by reading the code that the check
would never pass.
The storage_service::bootstrap method took a parameter: tokens to
bootstrap with. However, this method is only called in one place
(join_token_ring) with only one parameter: _bootstrap_tokens. It doesn't
make sense to call this method anywhere else with any other parameter.
This commit also adds a comment explaining what the method does and
moves it into the private section of storage_service.
When a non-seed node was bootstrapping, system_keyspace::update_tokens
was called twice: first right after the tokens were generated (or
received if we were replacing a different node) in the call to
`bootstrap`, and then later in join_token_ring. The second call was
redundant.
The join_token_ring call was also redundant if we were not bootstrapping
and had tokens saved previously (e.g. when restarting). In that case we
would have read them from LOCAL and then save the same tokens again.
This commit removes the redundant call and inserts calls to
update_tokens where they are necessary, when new tokens are generated.
The aim is to make the code easier to understand.
It also adds a comment which explains why the tokens don't need to be
generated in one of the cases.
After commit 36ccf72f3c, this method
was used only in one place.
Its name did not make it obvious what it does and when is it safe to call it.
This commit pulls out the code from set_tokens to the point where it was
called (join_token_ring). The code is only possible to understand in
context.
This code was also saving the tokens to the LOCAL table before
retrieving them from this table again. There is no point in doing that:
1. there are no races, since when join_token_ring is running, it is the
only function which can call system_keyspace::update_tokens (which saves them to the
LOCAL table). There can be no multiple instances of join_token_ring.
2. Even if there was a race, this wouldn't fix anything. The tokens we
retrieve from LOCAL by calling get_local_tokens().get0() could already
be different in the LOCAL table when the get0() returns.
Replace the two variables:
tokens_to_update_in_metadata
tokens_to_update_in_system_keyspace
which were exactly the same, with one variable owned_tokens.
The new name describes what the variable IS instead what's it used for.
Add a comment to clarify what "owned" means: those are the tokens the
node chose and any collision was resolved positively for this node.
Move the variable definition further down in the code, where it's
actually needed.
Merged patch series from Piotr Sarna:
Calculating the select statement for given view_info structure
used to work fine, but once local indexes were introduced, a subtle
bug appeared: the legacy token column does not exist in local indexes
and a valid clustering key column was omitted instead.
That results in potentially incorrect partition slices being used later
in read-before-write.
There's a long term plan for removing select_statement from
view info altogether, but nonetheless the bug needs to be fixed first.
Branch: master, 3.1
Tests: unit(dev) + manual confirmation that a correct legacy column is picked
Merge a patch series from Piotr Jastrzębski (haaawk):
This PR introduces CDC in it's minimal version.
It is possible now to create a table with CDC enabled or to enable/disable
CDC on existing table. There is a management of CDC log and description
related to enabling/disabling CDC for a table.
For now only primary key of the changed data is logged.
To be able to co-locate cdc streams with related base table partitions it
was needed to propagate the information about the number of shards per node.
This was node through gossip.
There is an assumption that all the nodes use the same value for
sharding_ignore_msb_bits. If it does not hold we would have to gossip
sharding_ignore_msb_bits around together with the number of shards.
Fixes#4986.
Tests: unit(dev, release, debug)
Currently, the function that generates the graph edges (and vertices)
with a breadth-first traversal of the object graph accidentally uses the
object that is the starting point of the graph as the "to" part of each
edge. This results in the graph having each of its edges point to the
starting point, as if all objects in it referenced said object directly.
Fix by using the object of the currently examined object.
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20191018113019.95093-1-bdenes@scylladb.com>
To the 'Debuggin Scylla with GDB` chapter. The '### Debugging
relocatable binaries built with the toolchain' subchapter is demoted to
be just a section in this new subchapter. It is also renamed to
'Relocatable binaries'.
This subchapter intends to be a complete guide on how to debug coredumps
from how to obtain the correct version of all the binaries all the way
to how to correctly open the core with GDB.
* seastar e888b1df...6bcb17c9 (4):
> iotune: don't crash in sequential read test if hitting EOF
> Remove FindBoost.cmake from install files
> Merge "Move reactor backend out of reactor" from Asias
> fair_queue: Add fair_queue.cc
This patch wrapps announce_migration logic into a lambda
that will be used both when cdc is used and when it's not.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
At the moment, this test only checks that table
creation and alteration sets cdc_options property
on a table correctly.
Future patches will extend this test to cover more
CDC aspects.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
We would like to share with other nodes
the value of ignore_msb_bits property used by the node.
This is needed because CDC will operate on
streams of changes. Each shard on each node
will have its own stream that will be identified
by a stream_id. Stream_id will be selected in
such a way that using stream_id as partition key
will locate partition identified by stream_id on
a node and shard that the stream belongs to.
To be able to generate such stream_id we need
to know ignore_msb_bits property value for each node.
IMPORTANT NOTE: At this point CDC does not support
topology changes. It will work only on a stable cluster.
Support for topology modifications will be added in
later steps.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
We would like to share with other nodes
the number of shards available at the node.
This is needed because CDC will operate on
streams of changes. Each shard on each node
will have its own stream that will be identified
by a stream_id. Stream_id will be selected in
such a way that using stream_id as partition key
will locate partition identified by stream_id on
a node and shard that the stream belongs to.
To be able to generate such stream_id we need
to know how many shards are on each node.
IMPORTANT NOTE: At this point CDC does not support
topology changes. It will work only on a stable cluster.
Support for topology modifications will be added in
later steps.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Refactor modification_statement to enable lightweight
transaction implementation.
This patch set re-arranges logic of
modification_statement::get_mutations() and uses
a column mask of identify the columns to prefetch.
It also pre-computes a few modification statement properties
at prepare, assuming the prepared statement is invalidated if
the underlying schema changes.
They are used more extensively with introduction of lightweight
transactions, and pre-computing makes it easier to reason about
complexity of the scenarios where they are involved.
Pre-compute column mask of columns to prefetch when preparing
a modification statement and use it to build partition_slice
object for read command. Fetch only the required columns.
Ligthweight transactions build up on this by using adding
columns used in conditions and in cas result set to the column
maks of columns to read. Batch statements unite all column
masks to build a single relation for all rows modified by
conditional statements of a batch.
Refactor get_mutations() so that the read command and
apply_updates() functions can be used in lightweight transactions.
Move read_command creation to an own method, as well as apply_updates().
Rewrite get_mutations() using the new API.
Avoid unnecessary shared pointers.
Introduce a column definition ordinal_id and use it in boosted
update_parameters::prefetch_data as a column index of a full row.
Lightweight transactions prefetch data and return a result set.
Make sure update_parameters::prefetch_data can serve as a
single representation of prefetched list cells as well as
condition cells and as a CAS result set.
I have a lot of plans for column_definition::ordinal_id, it
simplifies a lot of operations with columns and will also be
used for building a bitset of columns used in a query
or in multiple queries of a batch.
In modification_statement/batch_statement, we need to prefetch data to
1) apply list operations
2) evaluate CAS conditions
3) return CAS result set.
Boost update_parameters::prefetch_data to serve as a single result set
for all of the above. In case of a batch, store multiple rows for
multiple clustering keys involved in the batch.
Use an ordered set for columns and rows to make sure 3) CAS result set
is returned to the client in an ordered manner.
Deserialize the primary key and add it to result set rows since
it is returned to the client as part of CAS result set.
Index columns using ordinal_id - this allows having a single
set for all columns and makes columns easy to look up.
Remove an extra memcpy to build view objects when looking
up a cell by primary key, use partition_key/clustering_key
objects for lookup.
Get rid of an unnecessary optional around
update_parameters::prefetch_data.
update_parameters won't own prefetch_data in the future anyway,
since prefetch_data can be shared among multiple modification
statements of a batch, each statement having its own options
and hence its own update_parameters instance.
Move prefetch_data_builder class from modification_statement.cc
to update_parameters.cc.
We're going to share the same builder to build a result set
for condition evaluation and to apply updates of batch statements, so we
need to share it.
No other changes.
Make sure every column in the schema, be it a column of partition
key, clustering key, static or regular one, has a unique ordinal
identifier.
This makes it easy to compute the set of columns used in a query,
as well as index row cells.
Allow to get column definition in schema by ordinal id.
"
Incremental compaction code to release exhausted sstables was inefficient because
it was basically preventing any release from ever happening. So a new solution is
implemented to make incremental compaction approach actually efficient while
being cautious about not introducing data resurrection. This solution consists of
storing GC'able tombstones in a temporary sstable and keeping it till the end of
compaction. Overhead is avoided by not enabling it to strategies that don't work
with runs composed of multiple fragments.
Fixes#4531.
tests: unit, longevity 1TB for incremental compaction
"
* 'fix_incremental_compaction_efficiency/v6' of https://github.com/raphaelsc/scylla:
tests: Check that partition is not resurrected on compaction failure
tests: Add sstable compaction test for gc-only mutation compactor consumer
sstables: Fix Incremental Compaction Efficiency
Introduce `scylla generate_object_graph`, a command which generates a
visual object graph, where vertices are objects and edges are
references. The graph starts from the object specified by the user. The
graph allows visual inspection of the object graph and hopefully allows
the user to identify the object in question.
Add the `--resolve` flag to `scylla find`. When specified, `scylla find`
will attempt to resolve the first pointer in the found objects as a vtable
pointer. If successful the pointer as well as the resolved symbol will
be added to the listing.
In the listing of `scylla fiber` also print the starting task (as the
first item).
This mini-series contains assorted improvements that I found very useful
while debugging OOM crashes in the past weeks:
* A wrapper for `std::list`.
* A wrapper for `std::variant`.
* Making `scylla find` usable from python code.
* Improvements to `scylla sstables` and `scylla task_histogram`
commands.
* The `$downcast_vptr()` convenience function.
* The `$dereference_lw_shared_ptr()` convenience function.
Convenience functions in gdb are similar to commands, with some key
differences:
* They have a defined argument list.
* They can return values.
* They can be part of any gdb expression in which functions are allowed.
This makes them very useful for doing operations on values then
returning them so that the developer can use it the gdb shell.
The static row can be rare: many tables don't have them, and tables
that do will often have mutations without them (if the static row
is rarely updated, it may be present in the cache and in readers,
but absent in memtable mutations). However, it always consumes ~100
bytes of memory, even if it not present, due to row's overhead.
Change it to be optional by using lazy_row instead of row. Some call
sites treewide were adjusted to deal with the extra indirection.
perf_simple_query appears to improve by 2%, from 163krps to 165 krps,
though it's hard to be sure due to noisy measurements.
memory_footprint comparisons (before/after):
mutation footprint: mutation footprint:
- in cache: 1096 - in cache: 992
- in memtable: 854 - in memtable: 750
- in sstable: 351 - in sstable: 351
- frozen: 540 - frozen: 540
- canonical: 827 - canonical: 827
- query result: 342 - query result: 342
sizeof(cache_entry) = 112 sizeof(cache_entry) = 112
-- sizeof(decorated_key) = 36 -- sizeof(decorated_key) = 36
-- sizeof(cache_link_type) = 32 -- sizeof(cache_link_type) = 32
-- sizeof(mutation_partition) = 200 -- sizeof(mutation_partition) = 96
-- -- sizeof(_static_row) = 112 -- -- sizeof(_static_row) = 8
-- -- sizeof(_rows) = 24 -- -- sizeof(_rows) = 24
-- -- sizeof(_row_tombstones) = 40 -- -- sizeof(_row_tombstones) = 40
sizeof(rows_entry) = 232 sizeof(rows_entry) = 232
sizeof(lru_link_type) = 16 sizeof(lru_link_type) = 16
sizeof(deletable_row) = 168 sizeof(deletable_row) = 168
sizeof(row) = 112 sizeof(row) = 112
sizeof(atomic_cell_or_collection) = 8 sizeof(atomic_cell_or_collection) = 8
Tests: unit (dev)
lazy_row adds indirection to the row class, in order to reduce storage requirements
when the row is not present. The intent is to use it for the static row, which is
not present in many schemas, and is often not present in writes even in schemas that
have a static row.
Indirection is done using managed_ref, which is lsa-compatible.
lazy_row implements most of row's methods, and a few more:
- get(), get_existing(), and maybe_create(): bypass the abstraction and the
underlying row
- some methods that accept a row parameter also have an overload with a lazy_row
parameter
"Delete README-DPDK.md, move IDL.md to docs/ and fix
docs/review-checklist.md to point to scylla's coding style document,
instead of seastar's."
* 'documentation-cleanup/v3' of https://github.com/denesb/scylla:
docs/review-checklist.md: point to scylla's coding-style.md instead of seastar's
docs: mv coding-style.md docs/
rm README-DPDK.md
docs: mv IDL.md docs/
"Delete README-DPDK.md, move IDL.md to docs/ and fix
docs/review-checklist.md to point to scylla's coding style document,
instead of seastar's."
* 'documentation-cleanup/v3' of https://github.com/denesb/scylla:
docs/review-checklist.md: point to scylla's coding-style.md instead of seastar's
docs: mv coding-style.md docs/
rm README-DPDK.md
docs: mv IDL.md docs/
Allow returning fewer random clustering keys than requested since
the schema may limit the total number we can generate, for example,
if there is only one boolean clustering column.
Fixes#5161
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Calculating the select statement for given view_info structure
used to work fine, but once local indexes were introduced, a subtle
bug appeared: the legacy token column does not exist in local indexes
and a valid clustering key column was omitted instead.
That results in potentially incorrect partition slices being used later
in read-before-write.
There's a long term plan for removing select_statement from
view info altogether, but nonetheless the bug needs to be fixed first.
When investigating OOM:s a prominent pattern is a size class that is
exploded, using up most of the available memory alone. If one is lucky,
the objects causing the OOM are instances of some virtual class, making
their identification easy. Other times the objects are referenced by
instances of some virtual class, allowing their identification with some
work. However there are cases where neither these objects nor their
direct referrers are instances of virtual classes. This is the case
`scylla generate_object_graph` intends to help.
scylla generate_object_graph, like its name suggests generates the
object graph of the requested object. The object graph is a directed
graph, where vertices are objects and edges are references between them,
going from referrers to the referee. The vertices contain information,
like the address of the object, its size, whether it is a live or not
and if applies, the address and symbol name of its vtable. The edges
contain the list of offsets the referrer has references at. The
generated graph is an image, which allows the visual inspection of the
object graph, allowing the developer to notice patterns and hopefully
identify the problematic objects.
The graph is generated with the help of `graphwiz`. The command
generates `.dot` files which can be converted to images with the help of
the `dot` utility. The command can do this if the output file is one of
the supported image formats (e.g. `png`), otherwise only the `.dot` file
is generated, leaving the actual image generation to the user.
Add `--resolve` flag, which will make the command attempt to resolve the
first pointer of the found objects as a vtable pointer. If this is
successful the vtable pointer as well as the symbol name will be added
to the listing. This in particular makes backtracing continuation chains
a breeze, as the continuation object the searched one depends on can be
found at glance in the resulting listing (instead of having to manually
probe each item).
The arguments of `scylla find` are now parsed via `argparse`. While at
it, support for all the size classes supported by the underlying `find`
command were added, in addition to `w` and `g`. However the syntax of
specifying the size class to use has been changed, it now has to be
specified with the `-s|--size` command line argument, instead of passing
`-w` or `-g`.
Or in other words, the task that is the argument of the search. Example:
(gdb) scylla fiber 0x60001a305910
Starting task: (task*) 0x000060001a305910 0x0000000004aa5260 vtable for seastar::continuation<...> + 16
#0 (task*) 0x0000600016217c80 0x0000000004aa5288 vtable for seastar::continuation<...> + 16
#1 (task*) 0x000060000ac42940 0x0000000004aa2aa0 vtable for seastar::continuation<...> + 16
#2 (task*) 0x0000600023f59a50 0x0000000004ac1b30 vtable for seastar::continuation<...> + 16
This code is currently duplicated in `find_vptrs()` and
`scylla_task_histogram`. Refactor it out into a function.
The code is also improved in two ways:
* Make the search stricter, ensuring (hopefully) that indeed the
executable's text section is found, not that of the first object in
the `gdb file` listing.
* Throw an exception in the case when the search fails.
We don't want to add shared sstables to table's backlog tracker because:
1) table's backlog tracker has only an influence on regular compaction
2) shared sstables are never regular compacted, they're worked by
resharding which has its own backlog tracker.
Such sstables belong to more than one shard, meaning that currently
they're added to backlog tracker of all shards that own them.
But the thing is that such sstables ends up being resharded in shard
that may be completely random. So increasing backlog of all shards
such sstables belong to, won't lead to faster resharding. Also, table's
backlog tracker is supposed to deal only with regular compaction.
Accounting for shared sstables in table's tracker may lead to incorrect
speed up of regular compactions because the controller is not aware
that some relevant part of the backlog is due to pending resharding.
The fix is about ignoring sstables that will be resharded and let
table's backlog tracker account only for sstables that can be worked on
by regular compaction, and rely on resharding controlling itself
with its own tracker.
NOTE: this doesn't fix the resharding controlling issue completely,
as described in #4952. We'll still need to throttle regular compaction
on behalf of resharding. So subsequent work may be about:
- move resharding to its own priority class, perhaps streaming.
- make a resharding's backlog tracker accounts for sstables in all of
its pending jobs, not only the ongoing ones (currently limited to 1 by shard).
- limit compaction shares when resharding is in progress.
THIS only fixes the issue in which controller for regular compaction
shouldn't account sstables completely exclusive to resharding.
Fixes#5077.
Refs #4952.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20190924022109.17400-1-raphaelsc@scylladb.com>
Incremental compaction efficiency depends on the reference of sstables
compacted being all released because the file descriptors of sstable
components are only closed once the sstable object is destructed.
Incremental compaction is not working for major compaction because a reference
to released sstables are being kept in the compaction manager, which prevents
their disk usage from being released. So the space amplification would be
the same as with a non-incremental approach, i.e. needs twice the amount of
used disk space for the table(s). With this issue fixed, the database now
becomes very major compaction friendly, the space requirement becoming very
low, a constant which is roughly number of fragments being currently compacted
multiplied by fragment size (1GB by default), for each table involved.
Fixes#5140.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20191003211927.24153-1-raphaelsc@scylladb.com>
Make sure gc'able-tombstone-only sstable is properly generated with data that
comes from regular compaction's input sstable.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Compaction prevents data resurrection from happening by checking that there's
no way a data shadowed by a GC'able tombstone will survive alone, after
a failure for example.
Consider the following scenario:
We have two runs A and B, each divided to 5 fragments, A1..A5, B1..B5.
They have the following token ranges:
A: A1=[0, 3] A2=[4, 7] A3=[8, 11] A4=[12, 15] A5=[16,18]
B is the same as A's ranges, offset by 1:
B: B1=[1,4] B2=[5,8] B3=[9,12] B4=[13,16] B5=[17,19]
Let's say we are finished flushing output until position 10 in the compaction.
We are currently working on A3 and B3, so obviously those cannot be deleted.
Because B2 overlaps with A3, we cannot delete B2 either.
Otherwise, B2 could have a GC'able tombstone that shadows data in A3, and after
B2 is gone, dead data in A3 could be resurrected *on failure*.
Now, A2 overlaps with B2 which we couldn't delete yet, so we can't delete A2.
Now A2 overlaps with B1 so we can't delete B1. And B1 overlaps with A1 so
we can't delete A1. So we can't delete any fragment.
The problem with this approach is obvious, fragments can potentially not be
released due to data dependency, so incremental compaction efficiency is
severely reduced.
To fix it, let's not purge GC'able tombstones right away in the mutation
compactor step. Instead, let's have compaction writing them to a separate
sstable run that would be deleted in the end of compaction.
By making sure that tombstone information from all compacting sstables is not
lost, we no longer need to have incremental compaction imposing lots of
restriction on which fragments could be released. Now, any sstable which data
is safe in a new sstable can be released right away. In addition, incremental
compaction will only take place if compaction procedure is working with one
multi-fragment sstable run at least.
Fixes#4531.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
* seastar 1f68be436f...e888b1df9c (8):
> sharded: Make map work with mapper that returns a future
> cmake: Remove FindBoost.cmake
> Reduce noncopyable_function instruction cache footprint
> doc: add Loops section to the tutorial
> Merge "Move file related code out of reactor" from Asias
> Merge "Move the io_queue code out of reactor" from Asias
> cmake: expose seastar_perf_testing lib
> future: class doc: explain why discarding a future is bad
- main.cc now includes new file io_queue.hh
- perf tests now include seastar perf utilities via user, not
system, includes since those are not exported
Merged patch set from Piotr Sarna:
Refs #5046
This commit adds handling "Authorization:" header in incoming requests.
The signature sent in the authorization is recomputed server-side
and compared with what the client sent. In case of a mismatch,
UnrecognizedClientException is returned.
The signature computation is based on boto3 Python implementation
and uses gnutls to compute HMAC hashes.
This series is rebased on a previous HTTPS series in order to ease
merging these two. As such, it depends on the HTTPS series being
merged first.
Tests: alternator(local, remote)
The series also comes with a simple authorization test and a docs update.
Piotr Sarna (6):
alternator: migrate split() function to string_view
alternator: add computing the auth signature
config: add alternator_enforce_authorization entry
alternator: add verifying the auth signature
alternator-test: add a basic authorization test case
docs: update alternator authorization entry
alternator-test/test_authorization.py | 34 ++++++++
configure.py | 1 +
alternator/{server.hh => auth.hh} | 22 ++---
alternator/server.hh | 3 +-
db/config.hh | 1 +
alternator/auth.cc | 88 ++++++++++++++++++++
alternator/server.cc | 112 +++++++++++++++++++++++---
db/config.cc | 1 +
main.cc | 2 +-
docs/alternator/alternator.md | 7 +-
10 files changed, 241 insertions(+), 30 deletions(-)
create mode 100644 alternator-test/test_authorization.py
copy alternator/{server.hh => auth.hh} (58%)
create mode 100644 alternator/auth.cc
Before this change, when populating non-system keyspaces, each data
directory was scanned and for each entry (keyspace directory),
a keyspace was populated. This was done in a serial fashion - populating
of one keyspace was not started until the previous one was done.
Loading keyspaces in such fashion can introduce unnecessary waiting
in case of a large number of keyspaces in one data directory. Population
process is I/O intensive and barely uses CPU.
This change enables parallel loading of keyspaces per data directory.
Populating the next keyspace does not wait for the previous one.
A benchmark was performed measuring startup time, with the following
setup:
- 1 data directory,
- 200 keyspaces,
- 2 tables in each keyspace, with the following schema:
CREATE TABLE tbl (a int, b int, c int, PRIMARY KEY(a, b))
WITH CLUSTERING ORDER BY (b DESC),
- 1024 rows in each table, with values (i, 2*i, 3*i) for i in 0..1023,
- ran on 6-core virtual machine running on i7-8750H CPU,
- compiled in dev mode,
- parameters: --smp 6 --max-io-requests 4 --developer-mode=yes
--datadir $DIR --commitlog-directory $DIR
--hints-directory $DIR --view-hints-directory $DIR
The benchmark tested:
- boot time, by comparing timestamp of the first message in log,
and timestamp of the following message:
"init - Scylla version ... initialization completed."
- keyspace population time, by comparing timestamps of messages:
"init - loading non-system sstables"
and
"init - starting view update generator"
The benchmark was run 5 times for sequential and parallel version,
with the following results:
- sequential: boot 31.620s, keyspace population 6.051s
- parallel: boot 29.966s, keyspace population 4.360s
Keyspace population time decreased by ~27.95%, and overall boot time
by about ~5.23%.
Tests: unit(release)
Fixes#2007
The signature sent in the "Authorization:" header is now verified
by computing the signature server-side with a matching secret key
and confirming that the signatures match.
Currently the secret key is hardcoded to be "whatever" in order
to work with current tests, but it should be replaced
by a proper key store.
Refs #5046
The config entry will be used to turn authorization for alternator
requests on and off. The default is currently off, since the key store
is not implemented yet.
A function for computing the auth signature from user requests
is added, along with helper functions. The implementation
is based on gnutls's HMAC.
Refs #5046
The implementation of string split was based on sstring type for
simplicity, but it turns out that more generic std::string_view
will be beneficial later to avoid unneeded string copying.
Unfortunately boost::split does not cooperate well with string views,
so a simple manual implementation is provided instead.
Schema changes can have big effects on performance, typically it should
be a rare event.
It is usefull to monitor how frequently the schema changed.
This patch adds a counter that increases each time a schema changed.
After this patch the metrics would look like:
scylla_database_schema_changed{shard="0",type="derive"} 2
Fixes#4785
Signed-off-by: Amnon Heiman <amnon@scylladb.com>
We can use the reader::peek() to check if the reader contains any data.
If not, do not open the rpc stream connection. It helps to reduce the
port usage.
Refs: #4943
Both in a single-statement transaction and in a batch
we expect that serial consistency is provided. Move the
check to query_options class and make it available for
reuse.
Keep get_serial_consistency() around for use in
transport/server.cc.
Message-Id: <20191006154532.54856-2-kostja@scylladb.com>
When another node is reported to be down, view updates queued
for it are cancelled, but some of them may already be initiated.
Right now, cancelling such a write resulted in an exception,
but on conceptual level it's not really an exception, since
this behaviour is expected.
Previous version of this patch was based on introducing a special
exception type that was later handled specially, but it's not clear
if it's a good direction. Instead, this patch simply makes this
path non-exceptional, as was originally done by Nadav in the first
version of the series that introduced handling unstarted write
cancellations. Additionally, a message containing the information
that a write is cancelled is logged with debug level.
README.md has 3 fixes applied:
- s/alternator_tls_port/alternator_https_port
- conf directory is mentioned more explicitly
- it now correctly states that the self-signed certificate
warning *is* explicitly ignored in tests
Message-Id: <e5767f7dbea260852fc2fa9b613e1bebf490cc78.1570444085.git.sarna@scylladb.com>
"
Fixes#5134, Eviction concurrent with preempted partition entry update after
memtable flush may allow stale data to be populated into cache.
Fixes#5135, Cache reads may miss some writes if schema alter followed by a
read happened concurrently with preempted partition entry update.
Fixes#5127, Cache populating read concurrent with schema alter may use the
wrong schema version to interpret sstable data.
Fixes#5128, Reads of multi-row partitions concurrent with memtable flush may
fail or cause a node crash after schema alter.
"
* tag 'fix-cache-issues-with-schema-alter-and-eviction-v2' of github.com:tgrabiec/scylla:
tests: row_cache: Introduce test_alter_then_preempted_update_then_memtable_read
tests: row_cache_stress_test: Verify all entries are evictable at the end
tests: row_cache_stress_test: Exercise single-partition reads
tests: row_cache_stress_test: Add periodic schema alters
tests: memtable_snapshot_source: Allow changing the schema
tests: simple_schema: Prepare for schema altering
row_cache: Record upgraded schema in memtable entries during update
memtable: Extract memtable_entry::upgrade_schema()
row_cache, mvcc: Prevent locked snapshots from being evicted
row_cache: Make evict() not use invalidate_unwrapped()
mvcc: Introduce partition_snapshot::touch()
row_cache, mvcc: Do not upgrade schema of entries which are being updated
row_cache: Use the correct schema version to populate the partition entry
delegating_reader: Optimize fill_buffer()
row_cache, memtable: Use upgrade_schema()
flat_mutation_reader: Introduce upgrade_schema()
Merged patch series from Piotr Sarna:
This series adds HTTPS support for Alternator.
The series comes with --https option added to alternator-test, which makes
the test harness run all the tests with HTTPS instead of HTTP. All the tests
pass, albeit with security warnings that a self-signed x509 certificate was
used and it should not be trusted.
Fixes#5042
Refs scylladb/seastar#685
Patches:
docs: update alternator entry on HTTPS
alternator-test: suppress the "Unverified HTTPS request" warning
alternator-test: add HTTPS info to README.md
alternator-test: add HTTPS to test_describe_endpoints
alternator-test: add --https parameter
alternator: add HTTPS support
config: add alternator HTTPS port
* seastar c21a7557f9...1f68be436f (6):
> scheduling: Add per scheduling group data support
> build: Include dpdk as a single object in libseastar.a
> sharded: fix foreign_ptr's move assignment
> build: Fix DPDK libraries linking in pkg-config file
> http server: https using tls support
> Make output_stream blurb Doxygen
The BEGINS_WITH condition in conditional updates (via Expected) requires
that the given operand be either a string or a binary. Any other operand
should result in a validation exception - not a failed condition as we
generate now.
This patch fixes the test for this case so it will succeed against
Amazon DynamoDB (before this patch it fails - this failure was masked by
a typo before commit 332ffa77ea). The patch
then fixes our code to handle this case correctly.
Note that BEGINS_WITH handling of wrong types is now asymmetrical: A bad
type in the operand is now handled differently from a bad type in the
attribute's value. We add another check to the test to verify that this
is the case.
Fixes#5141
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20191006080553.4135-1-nyh@scylladb.com>
When debugging one constantly has to inspect object for which only
a "virtual pointer" is available, that is a pointer that points to a
common parent class or interface.
Finding the concrete type and downcasting the pointer is easy enough but
why do it manually when it is possible to automate it trivially?
$downcast_vptr() returns any virtual pointer given to it, casted to the
actual concrete object.
Exlample:
(gdb) p $1
$2 = (flat_mutation_reader::impl *) 0x60b03363b900
(gdb) p $downcast_vptr(0x60b03363b900)
$3 = (combined_mutation_reader *) 0x60b03363b900
# The return value can also be dereferenced on the spot.
(gdb) p *$downcast_vptr($1)
$4 = {<flat_mutation_reader::impl> = {_vptr.impl = 0x46a3ea8 <vtable
for combined_mutation_reader+16>, _buffer = {_impl = {<std::al...
Dereferencing an `seastar::lw_shared_ptr` is another tedious manual
task. The stored pointer (`_p`) has to be casted to the right subclass
of `lw_shared_ptr_counter_base`, which involves inspecting the code,
then make writing a cast expression that gdb is willing to parse. This
is something machines are so much better at doing.
`$dereference_lw_shared_ptr` returns a pointer to the actual pointed-to
object, given an instance of `seastar::lw_shared_ptr`.
Example:
(gdb) p $1._read_context
$2 = {_p = 0x60b00b068600}
(gdb) p $dereference_lw_shared_ptr($1._read_context)
$3 = {<seastar::enable_lw_shared_from_this<cache::read_context>>
= {<seastar::lw_shared_ptr_counter_base> = {_count = 1}, ...
Make all the parameters of the sampling tweakable via command line
arguments. I strived to keep full backward compatibility, but due to the
limitations of `argparse` there is one "breaking" change. The optional
positional size argument is now a non-positional argument as `argparse`
doesn't support optional positional arguments.
Added documentation for both the command itself as well as for all the
arguments.
make_single_key_reader() currently doesn't actually create
single-partition readers because it doesn't set
mutation_reader::forwarding::no when it creates individual
readers. The readers will default to mutation_reader::forwarding::yes
and actually create scanning readers in preparation for
fast-forwarding across partitions.
Fix by passing mutation_reader::forwarding::no.
Currently, methods of simple_schema assume that table's schema doesn't
change. Accessors like get_value() assume that rows were generated
using simple_schema::_s. Because if that, the column_definition& for
the "v" column is cached in the instance. That column_definiion&
cannot be used to access objects created with a different schema
version. To allow using simple_schema after schema changes,
column_definition& caching is now tagged with the table schema version
of origin. Methods which access schema-dependent objects, like
get_value(), are now accepting schema& corresponding to the objects.
Also, it's now possible to tell simple_schema to use a different
schema version in its generator methods.
Cache update may defer in the middle of moving of partition entry
from a flushed memtable to the cache. If the schema was changed since
the entry was written, it upgrades the schema of the partition_entry
first but doesn't update the schema_ptr in memtable_entry. The entry
is removed from the memtable afterward. If a memtable reader
encounters such an entry, it will try to upgrade it assuming it's
still at the old schema.
That is undefined behavior in general, which may include:
- read failures due to bad_alloc, if fixed-size cells are interpreted
as variable-sized cells, and we misinterpret a value for a huge
size
- wrong read results
- node crash
This doesn't result in a permanent corruption, restarting the node
should help.
It's the more likely to happen the more rows there are in a
partition. It's unlikely to happen with single-row partitions.
Introduced in 70c7277.
Fixes#5128.
If the whole partition entry is evicted while being updated from the
memtable, a subsequent read may populate the partition using the old
version of data if it attempts to do it before cache update advances
past that partition. Partial eviction is not affected because
populating reads will notice that there is a newer snapshot
corresponding to the updater.
This can happen only in OOM situations where the whole cache gets evicted.
Affects only tables with multi-row partitions, which are the only ones
that can experience the update of partition entry being preempted.
Introduced in 70c7277.
Fixes#5134.
invalidate_unwrapped() calls cache_entry::evict(), which cannot be
called concurrently with cache update. invalidate() serializes it
properly by calling do_update(), but evict() doesn't. The purpose of
evict() is to stress eviction in tests, which can happen concurrently
with cache update. Switch it to use memory reclaimer, so that it's
both correct and more realistic.
evict() is used only in tests.
When a read enters a partition entry in the cache, it first upgrades
it to the current schema of the cache. The same happens when an entry
is updated after a memtable flush. Upgrading the entry is currently
performed by squashing all versions and replacing them with a single
upgraded version. That has a side effect of detaching all snapshots
from the partition entry. Partition entry update on memtable flush is
writing into a snapshot. If that snapshot is detached by a schema
upgrade, the entry will be missing writes from the memtable which fall
into continuous ranges in that entry which have not yet been updated.
This can happen only if the update of the entry is preempted and the
schema was altered during that, and a read hit that partition before
the update went past it.
Affects only tables with multi-row partitions, which are the only ones
that can experience the update of partition entry being preempted.
The problem is fixed by locking updated entries and not upgrading
schema of locked entries. cache_entry::read() is prepared for this,
and will upgrade on-the-fly to the cache's schema.
Fixes#5135
The sstable reader which populates the partition entry in the cache is
using the schema of the partition entry snapshot, which will be the
schema of the cache at the time the partition was entered. If there
was a schema change after the cache reader entered the partition but
before it created the sstable reader, the cache populating reader will
interpret sstable fragments using the wrong schema version. That is
more likely if partitions have many rows, and the front of the
partition is populated. With single-row partitions that's unlikely to
happen.
That is undefined behavior in general, which may include:
- read failures due to bad_alloc, if fixed-size cells are
interpreted as variable-sized cells, and we misinterpret
a value for a huge size
- wrong read results
- node crash
This doesn't result in a permanent corruption, restarting the node
should help.
Fixes#5127.
Use move_buffer_content_to() which is faster than fill_buffer_from()
because it doesn't involve popping and pushing the fragments across
buffers. We save on size estimation costs.
Running with --https and a self-signed certificate results in a flood
of expected warnings, that the connection is not to be trusted.
These warnings are silenced, as users runing a local test with --https
usually use self-signed certificates.
The test_describe_endpoints test spawns another client connection
to the cluster, so it needs to be HTTPS-aware in order to work properly
with --https parameter.
Running with --https parameter will result in sending the requests
via HTTPS instead of HTTP. By default, port 8043 is used for a local
cluster. Before running pytest --https, make sure that Scylla
was properly configured to initialize a HTTPS alternator server
by providing the alternator_tls_port parameter.
The HTTPS-based connection runs with verification disabled,
otherwise it would not work with self-signed certificates,
which are useful for tests.
By providing a server based on a TLS socket, it's now possible
to serve HTTPS requests in alternator. The HTTPS server is enabled
by setting its port in scylla.yaml: alternator_tls_port=XXXX.
Alternator TLS relies on the existing TLS configuration,
which is provided by certificate, keyfile, truststore, priority_string
options.
Fixes#5042
The test test_update_expression_function_nesting() fails because DynamoDB
don't allow an expression like list_append(list_append(:val1, :val2), :val3)
but Alternator doesn't check for this (and supports this expression).
The "xfail" message was outdated, suggesting that the test fails because
the "SET" expression isn't supported - but it is. So replace the message
by a more accurate one.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190915104708.30471-1-nyh@scylladb.com>
Merged patch set from Dejan Mircevski implementing some of the
missing operators for Expected: NE, IN, NULL and NOT_NULL.
Patches:
alternator: Factor out Expected operand checks
alternator: Implement NOT_NULL operator in Expected
alternator: Implement NULL operator in Expected
alternator: Fix expected_1_null testcase
alternator: Implement IN operator in Expected
alternator: Implement NE operator in Expected
alternator: Factor out common code in Expected
Frozen empty lists/map/sets are not equal to null value,
whil multi-cell empty lists/map/sets are equal to null values.
Return a NULL value for an empty multi-cell set or list
if we know the receiver is not frozen - this makes it
easy to compare the parameter with the receiver.
Add a test case for inserting an empty list or set
- the result is indistinguishable from NULL value.
Message-Id: <20191003092157.92294-2-kostja@scylladb.com>
"
Fix races that may lead to use-after-free events and file system level exceptions
during shutdown and drain.
The root cause of use-after-free events in question is that space_watchdog blocks on
end_point_hints_manager::file_update_mutex() and we need to make sure this mutex is alive as long as
it's accessed even if the corresponding end_point_hints_manager instance
is destroyed in the context of manager::drain_for().
File system exceptions may occur when space_watchdog attempts to scan a
directory while it's being deleted from the drain_for() context.
In case of such an exception new hints generation is going to be blocked
- including for materialized views, till the next space_watchdog round (in 1s).
Issues that are fixed are #4685 and #4836.
Tested as follows:
1) Patched the code in order to trigger the race with (a lot) higher
probability and running slightly modified hinted handoff replace
dtest with a debug binary for 100 times. Side effect of this
testing was discovering of #4836.
2) Using the same patch as above tested that there are no crashes and
nodes survive stop/start sequences (they were not without this series)
in the context of all hinted handoff dtests. Ran the whole set of
tests with dev binary for 10 times.
"
* 'hinted_handoff_race_between_drain_for_and_space_watchdog_no_global_lock-v2' of https://github.com/vladzcloudius/scylla:
hinted handoff: fix a race on a directory removal between space_watchdog and drain_for()
hinted handoff: make taking file_update_mutex safe
db::hints::manager::drain_for(): fix alignment
db::hints::manager: serialize calls to drain_for()
db::hints: cosmetics: identation and missing method qualifier
The operation after gate.enter() in tracker::start() can fail and throw,
we should call gate.leave() in such case to avoid unbalanced enter and
leave calls. tracker::done() has similar issue too.
Fix it by removing the gate enter and leave logic in tracker start and
done. A helper tracker::run() is introduced to take care of the gate and
repair status.
In addition, the error log is improved. It now logs exceptions on all
shards in the summary. e.g.,
[shard 0] repair - repair id 1 failed: std::runtime_error
({shard 0: std::runtime_error (error0), shard 1: std::runtime_error (error1)})
Fixes#5074
Currently, the population stat is not increased for entries that are
evicted immediately on insert, however the code that does the eviction
still decreases the population stat, leading to an imbalance and in some
cases the underflow of the population stat. To fix, unconditionally
increase the population stat upon inserting an entry, regardless of
whether it is immediately evicted or not.
Fixes: #5123
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20191001153215.82997-1-bdenes@scylladb.com>
Put all AttributeValuelist size verification under
verify_operand_count(), rather than have some cases invoke
verify_operand_count() while others verify it in check_*() functions.
Signed-off-by: Dejan Mircevski <dejan@scylladb.com>
Add check_IN() and a switch case that invokes it. Reactivate IN
tests. Add a testcase for non-scalar attribute values.
Signed-off-by: Dejan Mircevski <dejan@scylladb.com>
Recognize "NE" as a new operator type, add check_NE() function, invoke
it in verify_expected_one(), and reactivate NE tests.
Signed-off-by: Dejan Mircevski <dejan@scylladb.com>
Operand-count verification will be repeated a lot as more operators
are implemented, so factor it out into verify_operand_count().
Also move `got` null checks to check_* functions, which reduces
duplication at call sites.
Signed-off-by: Dejan Mircevski <dejan@scylladb.com>
While a managed_ref emulates a reference more closely than it does
a pointer, it is still nullable, so add a get() (similar to
unique_ptr::get()) that can be nullptr if the reference is null.
The immediate use will be mutation_partition::_static_row, which
is often empty and takes up about 10% of a cache entry.
The example Python code had wrong indentation, and wouldn't actually
work if naively copy-pasted. Noticed by Noam Hasson.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190929091440.28042-1-nyh@scylladb.com>
"
This is a collection of assorted patches that will be needed for LWT.
Most of them are trivial, but one touches a lot of files, so have a
good chance to cause rebase headache (I already had to rebase it on
top of Alternator). Lets push them earlier instead of carrying them in
the lwt branch.
"
* 'gleb/lwt-prepare-v2' of github.com:scylladb/seastar-dev:
lwt: make _last_timestamp_micros static
lwt: Add client_state::get_timestamp_for_paxos() function
lwt: Pass client_state reference all the way to storage_proxy::query
exceptions: Add a constructor for unavailable_exception that allows providing a custom message
serializer: Add std::variant support
lwt: Add missing functions to utils/UUID_gen.hh
"
This is the second version of the patch series. The previous one was just the second patch, this one adds more tests an another patch to make it easier to test that the new code has the same behavior as the old one.
"
* 'espindola/overflow-is-intentional' of https://github.com/espindola/scylla:
types: Simplify and explain from_varint_to_integer
Add more cast tests
Affects single-partition reads only.
Refs #5113
When executing a query on the replica we do several things in order to
narrow down the sstable set we read from.
For tables which use LeveledCompactionStrategy, we store sstables in
an interval set and we select only sstables whose partition ranges
overlap with the queried range. Other compaction strategies don't
organize the sstables and will select all sstables at this stage. The
reasoning behind this is that for non-LCS compaction strategies the
sstables' ranges will typically overlap and using interval sets in
this case would not be effective and would result in quadratic (in
sstable count) memory consumption.
The assumption for overlap does not hold if the sstables come from
repair or streaming, which generates non-overlapping sstables.
At a later stage, for single-partition queries, we use the sstables'
bloom filter (kept in memory) to drop sstables which surely don't
contain given partition. Then we proceed to sstable indexes to narrow
down the data file range.
Tables which don't use LCS will do unnecessary I/O to read index pages
for single-partition reads if the partition is outside of the
sstable's range and the bloom filter is ineffective (Refs #5112).
This patch fixes the problem by consulting sstable's partition range
in addition to the bloom filter, so that the non-overlapping sstables
will be filtered out with certainty and not depend on bloom filter's
efficiency.
It's also faster to drop sstables based on the keys than the bloom
filter.
Tests:
- unit (dev)
- manual using cqlsh
Reviewed-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20190927122505.21932-1-tgrabiec@scylladb.com>
The method sstable::estimated_keys_for_range() was severely
under-estimating the number of partitions in an sstable for a given
token range.
The first reason is that it underestimated the number of sstable index
pages covered by the range, by one. In extreme, if the requested range
falls into a single index page, we will assume 0 pages, and report 1
partition. The reason is that we were using
get_sample_indexes_for_range(), which returns entries with the keys
falling into the range, not entries for pages which may contain the
keys.
A single page can have a lot of partitions though. By default, there
is a 1:20000 ratio between summary entry size and the data file size
covered by it. If partitions are small, that can be many hundreds of
partitions.
Another reason is that we underestimate the number of partitions in an
index page. We multiply the number of pages by:
(downsampling::BASE_SAMPLING_LEVEL * _components->summary.header.min_index_interval)
/ _components->summary.header.sampling_level
Using defaults, that means multiplying by 128. In the cassandra-stress
workload a single partition takes about 300 bytes in the data file and
summary entry is 22 bytes. That means a single page covers 22 * 20'000
= 440'000 bytes of the data file, which contains about 1'466
partitions. So we underestimate by an order of magnitude.
Underestimating the number of partitions will result in too small
bloom filters being generated for the sstables which are the output of
repair or streaming. This will make the bloom filters ineffective
which results in reads selecting more sstables than necessary.
The fix is to base the estimation on the number of index pages which
may contain keys for the range, and multiply that by the average key
count per index page.
Fixes#5112.
Refs #4994.
The output of test_key_count_estimation:
Before:
count = 10000
est = 10112
est([-inf; +inf]) = 512
est([0; 0]) = 128
est([0; 63]) = 128
est([0; 255]) = 128
est([0; 511]) = 128
est([0; 1023]) = 128
est([0; 4095]) = 256
est([0; 9999]) = 512
est([5000; 5000]) = 1
est([5000; 5063]) = 1
est([5000; 5255]) = 1
est([5000; 5511]) = 1
est([5000; 6023]) = 128
est([5000; 9095]) = 256
est([5000; 9999]) = 256
est(non-overlapping to the left) = 1
est(non-overlapping to the right) = 1
After:
count = 10000
est = 10112
est([-inf; +inf]) = 10112
est([0; 0]) = 2528
est([0; 63]) = 2528
est([0; 255]) = 2528
est([0; 511]) = 2528
est([0; 1023]) = 2528
est([0; 4095]) = 5056
est([0; 9999]) = 10112
est([5000; 5000]) = 2528
est([5000; 5063]) = 2528
est([5000; 5255]) = 2528
est([5000; 5511]) = 2528
est([5000; 6023]) = 5056
est([5000; 9095]) = 7584
est([5000; 9999]) = 7584
est(non-overlapping to the left) = 0
est(non-overlapping to the right) = 0
Tests:
- unit (dev)
Reviewed-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20190927141339.31315-1-tgrabiec@scylladb.com>
`dbuild` was recently (24c732057) updated to run in interactive mode
when given no arguments; we can now update the README to mention that.
Signed-off-by: Dejan Mircevski <dejan@scylladb.com>
When the toppartitions operation gathers results, it copies partition
keys with their schema_ptr:s. When these schema_ptr:s are copies
or destroyed, they can cause leaks or premature frees of the schema
in its original shard since reference count operations in are not atomic.
Fix that by converting the schema_ptr to a global_schema_ptr during
transportation.
Fixes#5104 (direct bug)
Fixes#5018 (schema prematurely freed, toppartitions previously executed on that node)
Fixes#4973 (corrupted memory pool of the same size class as schema, toppartitions previously executed on that node)
Tests: new test added that fails with the existing code in debug mode,
manual toppartitions test
Copying schema_ptrs across shards results in memory corruption since
lw_shared_ptr does not use atomic operations for reference counts.
Prevent that by converting schema_ptr:s to global_schema_ptr:s before
shipping them across shards in the map operation, and converting them
back to local schema_ptr:s in the reduce operation.
This allows keys from different stages in the schema's like to compare equal.
This is safe since the partition key cannot change, unlike the rest of the schema.
More importantly, it will allow us to compare keys made local after a pass through
global_schema_ptr, which does not guarantee that the schema_ptr conversion will be
the same even when starting with the same global_schema_ptr.
Throwing move constructors are a a pain; so we should try to make
them noexcept. Currently, global_schema_ptr's move constructor
throws an exception if used illegaly (moving from a different shard);
this patch changes it to an assert, on the grounds that this error
is impossible to recover from.
The direct motivation for the patch is the desire to store objects
containing a global_schema_ptr in a chunked_vector, to move lists
of partition keys across shards for the topppartitions functionality.
chunked_vector currently requires noexcept move constructors for its
value_type.
When a user type changes we were not recreating other uses types that
use it. This patch series fixes that and makes it clear which code is
responsible for it.
In the system.types table a user type refers to another by name. When
a user type is modified, only its entry in the table is changed.
At runtime a user type has direct pointer to the types it uses. To
handle the discrepancy we need to recreate any dependent types when a
entry in system.types changes.
Fixes#5049
If each client_state has its own copy of the variable two clients may
generate timestamps that clash and needlessly create contention. Making
the variable shared between all client_state on the same shard will make
sure this will not happen to two clients on the same shard. It may still
happen for two client on two different shards or two different nodes.
Paxos needs a unique timestamp that is greater than some other
timestamp, so that the next round had more chances to succeed.
Add a function that returns such a timestamp.
client_state holds a state to generate monotonically increasing unique
timestamp. Queries with a SERIAL consistency level need it to generate
a paxos round.
In the system.types table a user type refers to another by name. When
a user type is modified, only its entry in the table is changed.
At runtime a user type has direct pointer to the types it uses. To
handle the discrepancy we need to recreate any dependent types when a
entry in system.types changes.
Fixes#5049
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
The way schema changes propagate is by editing the system tables and
comparing the before and after state.
When a user type A uses another user type B and we modify B, the
representation of A in the system table doesn't change, so this code
was not producing any changes on the diff that the receiving side
uses.
Deleting it makes it clear that it is the receiver's responsibility to
handle dependent user types.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
With this patch db::cql_type_parser::raw_builder creates a local copy
of the list of existing types and uses that internally. By doing that
build() should have no observable behavior other than returning the
new types.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
We were never passing a null pointer and never saving a copy of the
lw_shared_ptr. Passing a reference is more flexible as not all callers
are required to hold the user_types_metadata in a lw_shared_ptr.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
* seastar b56a8c5045...c21a7557f9 (3):
> net: socket::{set,get}_reuseaddr() should not be virtual
> iotune: print verbose message in case of shutdown errors
> iotune: close test file on shutdown
Fixes#4946.
1. Add assert in remove_response_handler to make crashes like in #5032 easier to understand.
2. Lookup the view_update_write_response_handler id before calling timeout_cb and tolerate it not found.
Just log a warning if this happened.
Fixes#5032
"
Currently affects only counter tables.
Introduced in 27014a2.
mutation_partition(s, mp) is incorrect because it uses s to interpret
mp, while it should use mp_schema.
We may hit this if the current node has a newer schema than the
incoming mutation. This can happen during table schema altering when we receive the
mutation from a node which hasn't processed the schema change yet.
This is undefined behavior in general. If the alter was adding or
removing columns, this may result in corruption of the write where
values of one column are inserted into a different column.
Fixes#5095.
"
* 'fix-schema-alter-counter-tables' of https://github.com/tgrabiec/scylla:
mvcc: Fix incorrect schema verison being used to copy the mutation when applying
mutation_partition: Track and validate schema version in debug builds
tests: Use the correct schema to access mutation_partition
Currently affects only counter tables.
Introduced in 27014a2.
mutation_partition(s, mp) is incorrect, because it uses s to interpret
mp, while it should use mp_schema.
We may hit this if the current node has a newer schema than the
incoming mutation. This can happen during alter when we receive the
mutation from a node which hasn't processed the schema change yet.
This is undefined behavior in general. If the alter was adding or
removing columns, this may result in corruption of the write where
values of one column are inserted into a different column.
Fixes#5095.
This patch makes mutation_partition validate the invariant that it's
supposed to be accessed only with the schema version which it conforms
to.
Refs #5095
* seastar e51a1a8ed9...b56a8c5045 (3):
> net: add support for UNIX-domain sockets
> future: Warn on promise::set_exception with no corresponding future or task
> Merge "Handle exceptions in repeat_until_value and misc cleanups" from Rafael
Handle a race where a write handler is removed from _response_handlers
but not yet from _view_update_handlers_list.
Fixes#5032
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Refactor remove_response_handler_entry out of remove_response_handler,
to be called on a valid iterator found by _response_handlers.find(id).
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Help identify cases like seen in #5032 where the handler id
wasn't found from the on_down -> timeout_cb path.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
compaction_manager::perform_sstable_upgrade() fails when it feeds
compaction mechanism with shared sstables. Shared sstables should
be ignored when performing upgrade and so wait for reshard to pick
them up in parallel. Whenever a shared sstable is brought up either
on restart or via refresh, reshard procedure kicks in.
Reshard picks the highest supported format so the upgrade for
shared sstable will naturally take place.
Fixes#5056.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20190925042414.4330-1-raphaelsc@scylladb.com>
- Update the outdated comments in do_stop_gossiping. It was
storage_service not storage_proxy that used the lock. More
importantly, storage_service does not use it any more.
- Drop the unused timer_callback_lock and timer_callback_unlock API
- Use with_semaphore to make sure the semaphore usage is balanced.
- Add log in gossiper::do_stop_gossiping when it tries to take the
semaphore to help debug hang during the shutdown.
Refs: #4891
Refs: #4971
A documentation file that is intended to be a place for anything
debugging related: getting started tutorial, tips and tricks and
advanced guides.
For now it contains a short introductions, some selected links to
more in-depth documentation and some trips and tricks that I could think
off the top of my head.
One of those tricks describes how to load cores obtained from
relocatable packages inside the `dbuild` container. I originally
intended to add that to `tools/toolchain/README.md` but was convinced
that `docs/debugging.md` would be a better place for this.
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20190924133110.15069-1-bdenes@scylladb.com>
Recently we have seen a case where the population stat of the cache was
corrupt, either due to misaccounting or some more serious corruption.
When debugging something like that it would have been useful to know how
many items have been inserted to the cache. I also believe that such a
counter could be useful generally as well.
Refs: #4918
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20190924083429.43038-1-bdenes@scylladb.com>
"
We observed an abort on bad_alloc which was not caused by real OOM,
but could be explained by cache region being locked from a different
shard, which is not allowed, concurrently with memory reclamation.
It's impossible now to prove this, or, if that was indeed the case, to
determine which code path was attempting such lock. This patch adds an
assert which would catch such incorrect locking at the attempt.
Refs #4978
Tests:
- unit (dev, release, debug)
"
* 'assert-no-xshard-lsa-locking' of https://github.com/tgrabiec/scylla:
lsa: Assert no cross-shard region locking
tests: Make managed_vector_test a seastar test
* seastar 2a526bb120...e51a1a8ed9 (2):
> rpc: introduce rpc::tuple as a way to move away from variadic future
> shared_future: don't warn on broken futures
Make it easier for the IDE to resolve references to the seastar
namespace. In any case include files should be stand-alone and not
depend on previously included files.
The build directory is meaningless, since it is typically some
directory in a continuous integration server. That means someone
debugging the relocatable package needs to issue the gdb command
'set substitute-path' with the correct arguments, or they lose
source debugging. Doing so in the relocatable package build saves
this step.
The default build is not modified, since a typical local build
benefits from having the paths hardcoded, as the debugger will
find the sources automatically.
We observed an abort on bad_alloc which was not caused by real OOM,
but could be explained by cache region being locked from a different
shard, which is not allowed, concurrently with memory reclamation.
It's impossible now to prove this, or, if that was indeed the case, to
determine which code path was attempting such lock. This patch adds an
assert which would catch such incorrect locking at the attempt.
Refs #4978
LCS demotes a SSTable from a given level when it thinks that level is inactive.
Inactive level means N rounds (compaction attempt) without any activity in it,
in other words, no SSTable has been promoted to it.
The problem happens because the metadata that tracks inactiveness of each level
can be incorrectly updated when there's an ongoing compaction. LCS has parallel
compaction disabled. So if a table finds itself running a long operation like
cleanup that blocks minor compaction, LCS could incorrectly think that many
levels need demotion, and by the time cleanup finishes, some demotions would
incorrectly take place.
This problem is fixed by only updating the counter that tracks inactiveness
when compaction completes, so it's not incorrectly updated when there's an
ongoing compaction for the table.
Fixes#4919.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20190917235708.8131-1-raphaelsc@scylladb.com>
A recent fix to #3767 limited the amount of ranges that
can return from query_ranges_to_vnodes_generator. This with
the combination of a large amount of token ranges can lead to
an infinite recursion. The algorithm multiplies by factor of
2 (actualy a shift left by one) the amount of requested
tokens in each recursion iteration. As long as the requested
number of ranges is greater than 0, the recursion is implicit,
and each call is scheduled separately since the call is inside
a continuation of a map reduce.
But if the amount of iterations is large enough (~32) the
counter for requested ranges zeros out and from that moment on
two things will happen:
1. The counter will remain 0 forever (0*2 == 0)
2. The map reduce future will be immediately available and this
will result in the continuation being invoked immediately.
The latter causes the recursive call to be a "regular" recursive call
thus, through the stack and not the task queue of the scheduler, and
the former causes this recursion to be infinite.
The combination creates a stack that keeps growing and eventually
overflows resulting in undefined behavior (due to memory overrun).
This patch prevent the problem from happening, it limits the growth of
the concurrency counter beyond twice the last amount of tokens returned
by the query_ranges_to_vnodes_generator.And also makes sure it is not
get stuck at zero.
Testing: * Unit test in dev mode.
* Modified add 50 dtest that reproduce the problem
Fixes#4944
Signed-off-by: Eliran Sinvani <eliransin@scylladb.com>
Message-Id: <20190922072838.14957-1-eliransin@scylladb.com>
Before this patch, if the _gate is closed, with_gate throws and
forward_to is not executed. When the promise<> p is destroyed it marks
its _task as a broken promise.
What happens next depends on the branch.
On master, we warn when the shared_future is destroyed, so this patch
changes the warning from a broken_promise to a gate closed.
On 3.1, we warn when the promises in shared_future::_peers are
destroyed since they no longer have a future attached: The future that
was attached was the "auto f" just before the with_gate call, and it
is destroyed when with_gate throws. The net result is that this patch
fixes the warning in 3.1.
I will send a patch to seastar to make the warning on master more
consistent with the warning in 3.1.
Fixes#4394
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20190917211915.117252-1-espindola@scylladb.com>
Scylla currently crashes if we run manual operations like nodetool
compact with the controller disabled. While we neither like nor
recommend running with the controller disabled, due to some corner cases
in the controller algorithm we are not yet at the point in which we can
deprecate this and are sometimes forced to disable it.
The reason for the crash is that manual operations will invoke
_backlog_of_shares, which returns what is the backlog needed to
create a certain number of shares. That scan the existing control
points, but when we run without the controller there are no control
points and we crash.
Backlog doesn't matter if the controller is disabled, and the return
value of this function will be immaterial in this case. So to avoid the
crash, we return something right away if the controller is disabled.
Fixes#5016
Signed-off-by: Glauber Costa <glauber@scylladb.com>
gdb searches for libthread_db.so using its canonical name of libthread_db.so.1 rather
than the file name of libthread_db-1.0.so, so use that name to store the file in the
archive.
Fixes#4996.
* seastar b3fb4aaab3...84d8e9fe9b (8):
> Use aio fsync if available
> Merge "fix some tcp connection bugs and add reuseaddr option to a client socket" from Gleb
> lz4: use LZ4_decompress_safe
> reactor: document seastar::remove_file()
> core/file.hh: remove redundant std::move()
> core/{file,sstring}: do not add `const` to return value
> http/api_docs: always call parent constructor
> Add input_stream blurb
Currently, if updating bookkeeping operations for view building fails,
we log the error message and continue. However, during shutdown,
some errors are more likely to happen due to existing issues
like #4384. To differentiate actual errors from semi-expected
errors during shutdown, the latter are now logged with a warning
level instead of error.
Fixes#4954
Shutdown routines are usually implemented via the deferred_action
mechanism, which runs a function in its destructor. We thus expect
the function to be noexcept, but unfortunately it's not always
the case. Throwing in the destructor results in terminating the program
anyway, but before we do that, the exception can be logged so it's
easier to investigate and pinpoint the issue.
Example output before the patch:
INFO 2019-09-10 12:49:05,858 [shard 0] view - Stopping view builder
terminate called without an active exception
Aborting on shard 0.
Backtrace:
0x000000000184a9ad
(...)
Example output after the patch:
INFO 2019-09-10 12:49:05,858 [shard 0] view - Stopping view builder
ERROR 2019-09-10 12:49:05,858 [shard 0] init - Unexpected error on shutdown: std::runtime_error (Hello there!)
terminate called without an active exception
Aborting on shard 0.
Backtrace:
0x000000000184a9ad
(...)
This simplifies the implementation of from_varint_to_integer and
avoids using the fact that a static_cast from cpp_int to uint64_t
seems to just keep the low 64 bits.
The boost release notes
(https://www.boost.org/users/history/version_1_67_0.html) implies that
the conversion function should return the maximum value a uint64_t can
hold if the original value is too large.
The idea of using a & with ~0 is a suggestion from the boost release
notes.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Update current results dictionary using the Metric.discover method.
New results are added and missing results are marked as absent.
(Both full metrics or specific keys)
Previously, with prometheous, each metric.update called query_list
resulting in O(n^2) when all metric were updated, like in the scylla_top
dtest - causing test timeout when testing debug build.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Commit log replay was bypassing memtable space back-pressure, and if
replay was faster than memtable flush, it could lead to OOM.
The fix is to call database::apply_in_memory() instead of
table::apply(). The former blocks when memtable space is full.
Fixes#4982.
Tests:
- unit (release)
- manual, replay with memtable flush failin and without failing
Message-Id: <1568381952-26256-1-git-send-email-tgrabiec@scylladb.com>
If the user supplies the 'replication_factor' to the 'NetworkTopologyStrategy' class,
it will expand into a replication factor for each existing DC for their convenience.
Resolves#4210.
Signed-off-by: Kamil Braun <kbraun@scylladb.com>
This reverts commit 7f64a6ec4b.
Fixes#5011
The reverted commit exposes #3760 for all schemas, not only those
which have UDTs.
The problem is that table schema deserialization now requires keyspace
to be present. If the replica hasn't received schema changes which
introduce the keyspace yet, the write will fail.
Mention on the top-level README.md that Scylla by default is compatible
with Cassandra, but also has experimental support for DynamoDB's API.
Provide links to alternator/alternator.md and alternator/getting-started.md
with more information about this feature.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190911080913.10141-1-nyh@scylladb.com>
"
In this patch set, written by Piotr Sarna and myself, we add Alternator - a new
Scylla feature adding compatibility with the API of Amazon DynamoDB(TM).
DynamoDB's API uses JSON-encoded requests and responses which are sent over
an HTTP or HTTPS transport. It is described in detail on Amazon's site:
https://docs.aws.amazon.com/amazondynamodb/latest/APIReference/
Our goal is that any application written to use Amazon DynamoDB could
be run, unmodified, against Scylla with Alternator enabled. However, at this
stage the Alternator implementation is incomplete, and some of DynamoDB's
API features are not yet supported. The extent of Alternator's compatibility
with DynamoDB is described in the document docs/alternator/alternator.md
included in this patch set. The same document also describes Alternator's
design (and also points to a longer design document).
By default, Scylla continues to listen only to Cassandra API requests and not
DynamoDB API requests. To enable DynamoDB-API compatibility, you must set
the alternator-port configuration option (via command line or YAML) to the port on
which you wish to listen for DynamoDB API requests. For more information, see
docs/alternator/alternator.md. The document docs/alternator/getting-started.md
also contains some examples of how to get started with Alternator.
"
* 'alternator' of https://github.com/nyh/scylla: (272 commits)
Added comments about DAX, monitoring and more
alternator: fix usage of client_state
alternator-test: complete test_expected.py for rest of comparison operators
alternator-test: reproduce bug in Expected with EQ of set value
alternator: implement the Expected request parameter
alternator: add returning PAY_PER_REQUEST billing mode
alternator: update docs/alternator.md on GSI/LSI situation
Alternator: Add getting started document for alternator
move alternator.md to its own directory
alternator-test: add xfail test for GSI with 2 regular columns
alternator/executor.cc: Latencies should use steady_clock
alternator-test: fix LSI tests
alternator-test: fix test_describe_endpoints.py for AWS run
alternator-test: test_describe_endpoints.py without configuring AWS
alternator: run local tests without configuring AWS
alternator-test: add LSI tests
alternator-test: bump create table time limit to 200s
alternator: add basic LSI support
alternator: rename reserved column name "attrs"
alternator: migrate make_map_element_restriction to string view
...
This patch adds tests for all the missing comparion operators in the
Expected parameter (the old-style parameter for conditional operations).
All these new tests are now xfailing on Alternator (and succeeding on
DynamoDB), because these operators are not yet implemented in Alternator
(we only implemented EQ and BEGINS_WITH, so far - the rest are easy but
need to be implemented).
The test_expected.py is now hopefully comprehensive, covering the entire
feature set of the "Expected" parameter and all its various cases and
subcases.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190910092208.23461-1-nyh@scylladb.com>
Our implementation of the "EQ" operator in Expected (conditional
operation) just compares the JSON represntation of the values.
This is almost always correct, but unfortunately incorrect for
sets - where we can have two equal sets despite having a
different order.
This patch just adds an (xfailing) test for this bug.
The bug itself can be fixed in the future in one of several ways
including changing the implementation of EQ, or changing the
serialization of sets so they'll always be sorted in the same
way.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190909125147.16484-1-nyh@scylladb.com>
In this patch we implement the Expected parameter for the UpdateItem,
PutItem and DeleteItem operations. This parameter allows a conditional
update - i.e., do an update only if the existing value of the item
matches some condition.
This is the older form of conditional updates, but is still used by many
applications, including Amazon's Tic-Tac-Toe demo.
As usual, we do not yet provide isolation guarantees for read-modify-write
operations - the item is simply read before the modification, and there is
no protection against concurrent operation. This will of course need to be
addressed in the future.
The Expected parameter has a relatively large number of variations, and most
of them are supported by this code, except that currenly only two comparison
operators are supported (EQ and BEGINS_WITH) out of the 13 listed in the
documentation. The rest will be implemented later.
This patch also includes comprehensive tests for the Expected feature.
These tests are almost exhaustive, except for one missing part (labled FIXME) -
among the 13 comparison operations, the tests only check the EQ and BEGINS_WITH
operators. We'll later need to add checks to the rest of them as well.
As usual, all the tests pass on Amazon DynamoDB, and after this patch all
of them succeed on Alternator too.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190905125558.29133-1-nyh@scylladb.com>
In order for Spark jobs to work correctly, a hardcoded PAY_PER_REQUEST
billing mode entry is returned when describing a table with
a DescribeTable request.
Also, one test case in test_describe_table.py is no longer marked XFAIL.
Message-Id: <a4e6d02788d8be48b389045e6ff8c1628240197c.1567688894.git.sarna@scylladb.com>
This patch adds a getting started document for alternator,
it explains how to start up a cluster that has an alternator
API port open and how to test that it works using either an
application or some simple and minimal python scripts.
The goal of the document is to get a user to have an up and
running docker based cluster with alternator support in the
shortest time possible.
As part of trying to make alternator more accessible
to users, we expect more documents to be created so
it seems like a good idea to give all of the alternator
docs their own directory.
When updating the second regular base column that is also a view
key, the code in Scylla will assume it only needs to update an entry
instead of replacing an old one. This leads to inconsitencies
exposed in the test case.
Message-Id: <5dfeb9f61f986daa6e480e9da4c7aabb5a09a4ec.1567599461.git.sarna@scylladb.com>
LSI tests are amended, so they no longer needlessly XPASS:
* two xpassing tests are no longer marked XFAIL
* there's an additional test for partial projection
that succeeds on DynamoDB and does not work fine yet in alternator
Message-Id: <0418186cb6c8a91de84837ffef9ac0947ea4e3d3.1567585915.git.sarna@scylladb.com>
The previous patch fixed test_describe_endpoints.py for a local run
without an AWS configuration. But when running with "--aws", we do
need to use that AWS configuration, and this patch fixes this case.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Even when running against a local Alternator, Boto3 wants to know the
region name, and AWS credentials, even though they aren't actually needed.
For a local run, we can supply garbage values for these settings, to
allow a user who never configured AWS to run tests locally.
Running against "--aws" will, of course, still require the user to
configure AWS.
The previous patch already fixed this for most tests, this patch fixes the
same issue in test_describe_endpoints.py, which had a separate copy of the
problematic code.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Even when running against a local Alternator, Boto3 wants to know the
region name, and AWS credentials, even though they aren't actually needed.
For a local run, we can supply garbage values for these settings, to
allow a user who never configured AWS to run tests locally.
Running against "--aws" will, of course, still require the user to
configure AWS.
Also modified the README to be clearer, and more focused on the local
runs.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190708121420.7485-1-nyh@scylladb.com>
Unfortunately the previous 100s limit proved to be not enough
for creating tables with both local and global indexes attached
to them. Empirically 200s was chosen as a safe default,
as the longest test oscillated around 100s with the deviation of 10s.
With this patch, LocalSecondaryIndexes can be added to a table
during its creation. The implementation is heavily shared
with GlobalSecondaryIndexes and as such suffers from the same TODOs:
projections, describing more details in DescribeTable, etc.
We currently reserve the column name "attrs" for a map of attributes,
so the user is not allowed to use this name as a name of a key.
We plan to lift this reservation in a future patch, but until we do,
let's at least choose a more obscure name to forbid - in this patch ":attrs".
It is even less likely that a user will want to use this specific name
as a column name.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190903133508.2033-1-nyh@scylladb.com>
Currently, we reserve the name ATTRS_COLUMN_NAME ("attrs") - the user
cannot use it as a key column name (key of the base table or GSI or LSI)
because we use this name for the attribute map we add to the schema.
Currently, if the user does attempt to create such a key column, the
result is undefined (sometimes corrupt sstables, sometimes outright crashes).
This patches fixes it to become a clean error, saying that this column name is
currently reserved.
The test test_create_table_special_column_name now cleanly fails, instead
of crashing Scylla, so it is converted from "skip" to "xfail".
Eventually we need to solve this issue completely (e.g., in rare cases
rename columns to allow us to reserve a name like ATTRS_COLUMN_NAME,
or alternatively, instead of using a fixed name ATTRS_COLUMN_NAME pick a
different one different from the key column names). But until we do,
better fail with a clear error instead of a crash.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190901102832.7452-1-nyh@scylladb.com>
The file initially consists of a very simple case that succeeds
with `--aws` and expectedly fails without it, because the expression
is not implemented yet.
This adds a "alternator-address" and "alternator-port" configuration
options to the Docker image, so people can enable Alternator with
"docker run" with:
docker run --name some-scylla -d <image> --alternator-port=8080
Message-Id: <20190902110920.19269-1-penberg@scylladb.com>
When an unsupported expression parameter is encountered -
KeyConditionExpression, ConditionExpression or FilterExpression
are such - alternator will return an error instead of ignoring
the parameter.
This patch make two chagnes to the alternator stats:
1. It add estimated_histogram for the get, put, update and delete
operation
2. It changes the metrics naming, so the operation will be a label, it
will be easier to handle, perform operation and display in this way.
Signed-off-by: Amnon Heiman <amnon@scylladb.com>
The test_gsi_3, involving creating a GSI with two key columns which weren't
previously a base key, now passes, so drop the "xfail" marker.
We still have problems with such materialized views, but not in the simple
scenario tested by test_gsi_3.
Later we should create a new test for the scenario which still fails, if
any.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Creating an underlying materialized view with 2 regular base columns
is risky in Scylla, as second's column liveness will not be correctly
taken into account when ensuring view row liveness.
Still, in case specific conditions are met:
* the regular base column value is always present in the base row
* no TTLs are involved
then the materialized view will behave as expected.
Creating a GSI with 2 base regular columns issues a warning,
as it should be performed with care.
Message-Id: <5ce8642c1576529d43ea05e5c4bab64d122df829.1567159633.git.sarna@scylladb.com>
It is important that BillingMode should default to PROVISIONED, as it
does on DynamoDB. This allows old clients, which don't specify
BillingMode at all, to specify ProvisionedThroughput as allowed with
PROVISIONED.
Also added a test case for this case (where BillingMode is absent).
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190829193027.7982-1-nyh@scylladb.com>
When querying on a missing index, DynamoDB returns different errors in
case the entire table is missing (ResourceNotFoundException) or the table
exists and just the index is missing (ValidationException). We didn't
make this distinction, and always returned ValidationException, but this
confuses clients that expect ResourceNotFoundException - e.g., Amazon's
Tic-Tac-Toe demo.
This patch adds a test for the first case (the completely missing table) -
we already had a test for the second case - and returns the correct
error codes. As usual the test passes against DynamoDB as well as Alternator,
ensure they behave the same.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190829174113.5558-1-nyh@scylladb.com>
We needlessly split the trace-level log message for the request to two
messages - one containing just the operation's name, and one with the
parameters. Moreover we printed them in the opposite order (parameters
first, then the operation). So this patch combines them into one log
message.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190829165341.3600-1-nyh@scylladb.com>
Alternator puts in the Scylla table a column called "attrs" for all the
non-key attributes. If the user happens to choose the same name, "attrs",
for one of the key columns, the result of writing two different columns
with the same name is a mess and corrupt sstables.
This test reproduces this bug (and works against DynamoDB of course).
Because the test doesn't cleanly fail, but rather leaves Scylla in a bad
state from which it can't fully recover, the test is marked as "skip"
until we fix this bug.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190828135644.23248-1-nyh@scylladb.com>
Updating key columns is not allowed in UpdateItem requests,
but the series introducing GSI support for regular columns
also introduced redundant duplicates checks of this kind.
This condition is already checked in resolve_update_path helper function
and existing test_update_expression_cannot_modify_key test makes sure that
the condition is checked.
Message-Id: <00f83ab631f93b263003fb09cd7b055bee1565cd.1567086111.git.sarna@scylladb.com>
The test test_update_expression_cannot_modify_key() verifies that an
update expression cannot modify one of the key columns. The existing
test only tried the SET and REMOVE actions - this patch makes the
test more complete by also testing the ADD and DELETE actions.
This patch also makes the expected exception more picky - we now
expect that the exception message contains the word "key" (as it,
indeed, does on both DynamoDB and Alternator). If we get any other
exception, there may be a problem.
The test passed before this patch, and passes now as well - it's just
stricter now.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190829135650.30928-1-nyh@scylladb.com>
The code previously used clustering_key::from_singular() to compute
a clustering key value. It works fine, but has two issues:
1. involves one redundant deserialization stage compared to
from_single_value
2. does not work with compound clustering keys, which can appear
when using indexes
With more GSI features implemented, tests with XPASS status are promoted
to being enabled.
One test case (test_gsi_describe) is partially done as DescribeTable
now contains index names, but we could try providing more attributes
(e.g. IndexSizeBytes and ItemCount from the test case), so the test
is left in the XFAIL state.
The DescribeTable request now contains the list of index names
as well. None of the attributes of the list are marked as 'required'
in the documentation, so currently the implementation provides
index names only.
In order to be able to create a Global Secondary Index over a regular
column, this column is upgraded from being a map entry to being a full
member of the schema. As such, it's possible to use this column
definition in the underlying materialized view's key.
In order to prepare alternator for adding regular columns to schema,
i.e. in order to create a materialized view over them,
the code is changed so that updating no longer assumes that only keys
are included in the table schema.
Since in the future we may want to have more regular columns
in alternator tables' schemas, the code is changed accordingly,
so all regular columns will be fetched instead of just the attribute
map.
If no regular column attributes are passed to PutItem, the attr
collector serializes an empty collection mutation nonetheless
and sends it. It's redundant, so instead, if the attr colector
is empty, the collection does not get serialized and sent to replicas.
Keeping an instance of client_state is a convenient way of being able
to use tracing for alternator. It's also currently used in paging,
so adding a client state to executor removes the need of keeping
a dummy value.
String views used in JSON serialization should use not only the pointer
returned by rapidjson, but also the string length, as it may contain
\0 characters.
Additionally, one unnecessary copy is elided.
Add a link to a longer document (currently, around 40 pages) about
DynamoDB's features and how we implemented or may implement them in
Alternator.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190825121201.31747-2-nyh@scylladb.com>
If a user tries to create a table with a unsupported feature -
a local secondary index, a used-defined encryption key or supporting
streams (CDC), let's refuse the table creation, so the application
doesn't continue thinking this feature is available to it.
The "Tags" feature is also not supported, but it is more harmless
(it is used mostly for accounting purposes) so we do not fail the
table creation because of it.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190818125528.9091-1-nyh@scylladb.com>
In CQL, before a user can create a table, they must create a keyspace to
contain this table and, among other things, specify this keyspace's RF.
But in the DynamoDB API, there is no "create keyspace" operation - the
user just creates a table, and there is no way, and no opportunity,
to specify the requested RF. Presumably, Amazon always uses the same
RF for all tables, most likely 3, although this is not officially
documented anywhere.
The existing code creates the keyspace during Scylla boot, with RF=1.
This RF=1 always works, and is a good choice for a one-node test run,
but was a really bad choice for a real cluster with multiple nodes, so
this patch fixes this choice:
With this patch, the keyspace creation is delayed - it doesn't happen
when the first node of the cluster boots, but only when the user creates
the first table. Presumably, at that time, the cluster is already up,
so at that point we can make the obvious choice automatically: a one-node
cluster will get RF=1, a >=3 node cluster will get RF=3. The choice of
RF is logged - and the choice of RF=1 is considered a warning.
Note that with this patch, keyspace creation is still automatic as it
was before. The user may manually create the keyspace via CQL, to
override this automatic choice. In the future we may also add additional
keyspace configuration options via configuration flags or new REST
requests, and the keyspace management code will also likely change
as we start to support clusters with multiple regions and global
tables. But for now, I think the automatic method is easiest for
users who want to test-drive Alternator without reading lengthy
instructions on how to set up the keyspace.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190820180610.5341-1-nyh@scylladb.com>
We allow BillingMode to be set to either PAY_PER_REQUEST (the default)
or PROVISIONED, although neither mode is fully implemented: In the former
case the payment isn't accounted, and in the latter case the throughput
limits are not enforced.
But other settings for BillingMode are now refused, and we add a new test
to verify that.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190818122919.8431-1-nyh@scylladb.com>
The alternator tests want to exercise many of the DynamoDB API features,
so they need a recent enough version of the client libraries, boto3
and botocore. In particular, only in botocore 1.12.54, released a year
ago, was support for BillingMode added - and we rely on this to create
pay-per-request tables for our tests.
Instead of letting the user run with an old version of this library and
get dozens of mysterious errors, in this patch we add a test to conftest.py
which cleanly aborts the test if the libraries aren't new enough, and
recommends a "pip" command to upgrade these libraries.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190819121831.26101-1-nyh@scylladb.com>
The DescribeTable operation was currently implemented to return the
minimal information that libraries and applications usually need from
it, namely verifying that some table exists. However, this operation
is actually supposed to return a lot more information fields (e.g.,
the size of the table, its creation date, and more) which we currently
don't return.
This patch adds a new test file, test_describe_table.py, testing all
these additional attributes that DescribeTable is supposed to return.
Several of the tests are marked xfail (expected to fail) because we
did not implement these attributes yet.
The test is exhaustive except for attributes that have to do with four
major features which will be tested together with these features: GSI,
LSI, streams (CDC), and backup/restore.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190816132546.2764-1-nyh@scylladb.com>
Currently Alternator starts all Scylla requests (including both reads
and writes) without any timeout set. Because of bugs and/or network
problems, Requests can theoretically hang and waste Scylla request for
hours, long after the client has given up on them and closed their
connection.
The DynamoDB protocol doesn't let a user specify which timeout to use,
so we should just use something "reasonable", in this patch 10 seconds.
Remember that all DynamoDB read and write requests are small (even scans
just scan a small piece), so 10 seconds should be above and beyond
anything we actually expect to see in practice.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190812105132.18651-1-nyh@scylladb.com>
So far we had the "--alternator-port" option allowing to configure the port
on which the Alternator server listens on, but the server always listened
to any address. It is important to also be able to configure the listen
address - it is useful in tests running several instances of Scylla on
the same machine, and useful in multi-homed machines with several interfaces.
So this patch adds the "--alternator-address" option, defaulting to 0.0.0.0
(to listen on all interfaces). It works like the many other "--*-address"
options that Scylla already has.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190808204641.28648-1-nyh@scylladb.com>
It turns out that recent rjson patches introduced some buggy
tabs instead of spaces due to bad IDE configuration. The indentation
is restored to spaces.
Until now, filtering in alternator was possible only for non-key
column equality relations. This commit adds support for equality
relations for key columns.
Alternator allows passing hash and sort key restrictions
as filters - it is, however, better to incorporate these restrictions
directly into partition and clustering ranges, if possible.
It's also necessary, as optimizations inside restrictions_filter
assume that it will not be fed unneeded rows - e.g. if filtering
is not needed on partition key restrictions, they will not be checked.
Currently the only utility function for getting key bytes
from JSON was to parse a document with the following format:
"key_column_name" : { "key_column_type" : VALUE }.
However, it's also useful to parse only the inner document, i.e.:
{ "key_column_type" : VALUE }.
Three metrics related to filtering are added to alternator:
- total rows read during filtering operations
- rows read and matched by filtering
- rows read and dropped by filtering
Some underlying operations (e.g. paging) make use of cql_stats
structure from CQL3. As such, cql_stats structure is added
to alternator stats in order to gather and use these statistics.
Read-before-write stat counters were already introduced, but the metrics
needs to be added to a metric group as well in order to be available
for users.
This patch adds partial support for GSI (Global Secondary Index) in
Alternator, implemented using a materialized view in Scylla.
This initial version only supports the specific cases of the index indexing
a column which was already part of the base table's key - e.g., indexing
what used to be a sort key (clustering key) in the base table. Indexing
of non-key attributes (which today live in a map) is not yet supported in
this version.
Creation of a table with GSIs is supported, and so is deleting the table.
UpdateTable which adds a GSI to an existing table is not yet supported.
Query and Scan operations on the index are supported.
DescribeTable does not yet list the GSIs as it should.
Seven previously-failing tests now pass, so their "xfail" tag is removed.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190808090256.12374-1-nyh@scylladb.com>
The rapidjson library needs to be used with caution in order to
provide maximum performance and avoid undefined behavior.
Comments added to rjson.hh describe provided methods and potential
pitfalls to avoid.
Message-Id: <ba94eda81c8dd2f772e1d336b36cae62d39ed7e1.1565270214.git.sarna@scylladb.com>
With libjsoncpp we were forced to work around the problem of
non-noexcept constructors by using an intermediate unique pointer.
Objects provided by rapidjson have correct noexcept specifiers,
so the workaround can be dropped.
Profiling alternator implied that JSON parsing takes up a fair amount
of CPU, and as such should be optimized. libjsoncpp is a standard
library for handling JSON objects, but it also proves slower than
rapidjson, which is hereby used instead.
The results indicated that libjsoncpp used roughly 30% of CPU
for a single-shard alternator instance under stress, while rapidjson
dropped that usage to 18% without optimizations.
Future optimizations should include eliding object copying, string copying
and perhaps experimenting with different JSON allocators.
Migrating from libjsoncpp to rapidjson proved to be beneficial
for parsing performance. As a first step, a set of helper functions
is provided to ease the migration process.
error.hh file implicitly assumed that seastar:: namespace is available
when it's included, which is not always the case. To remedy that,
seastar::httpd namespace is used explicitly.
Our CreateTable handler assumed that the function
migration_manager::announce_new_column_family()
returns a failed future if the table already exists. But in some of
our code branches, this is not the case - the function itself throws
instead of returning a failed future. The solution is to use
seastar::futurize_apply() to handle both possibilities (direct exception
or future holding an exception).
This fixes a failure of the test_table.py::test_create_table_already_exists
test case.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
This adds a new document, docs/alternator.md, about Alternator.
The scope of this document should be expanded in the future. We begin
here by introducing Alternator and its current compatibility level with
Amazon DynamoDB, but it should later grow to explain the design of Alternator
and how it maps the DynamoDB data model onto Scylla's.
Whether this document should remain a short high-level overview, or a long
and detailed design document, remains an open question.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190805085340.17543-1-nyh@scylladb.com>
The function attrs_type() return a supposedly singleton, but because
it is a seastar::shared_ptr we can't use the same one for multiple
threads, and need to use a separate one per thread.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190804163933.13772-1-nyh@scylladb.com>
The CQL type singletons like utf8_type et al. are separate for separate
shards and cannot be used across shards. So whatever hash tables we use
to find them, also needs to be per-shard. If we fail to do this, we
get errors running the debug build with multiple shards.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190804165904.14204-1-nyh@scylladb.com>
Expand the GSI test suite. The most important new test is
test_gsi_key_not_in_index(), where the index's key includes just one of
the base table's key columns, but not a second one. In this case, the
Scylla implementation will nevertheless need to add the second key column
to the view (as a clustering key), even though it isn't considered a key
column by the DynamoDB API.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190718085606.7763-1-nyh@scylladb.com>
Our ListTables implementation uses get_column_families(), which lists both
base tables and materialized views. We will use materialized views to
implement DynamoDB's secondary indexes, and those should not be listed in
the results of ListTables.
The patch also includes a test for this.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190717133103.26321-2-nyh@scylladb.com>
The list_tables() utility function was used only in test_table.py
but I want to use it elsewhere too (in GSI test) so let's move it
to util.py.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190717133103.26321-1-nyh@scylladb.com>
As in case of set_diff, an exception message in set_sum should include
the user-provided request (ADD) rather than our internal helper function
set_sum.
Although we do not support GSI yet, until now we silently ignored
CreateTable's GSI parameter, and the user wouldn't know the table
wasn't created as intended.
In this patch, GSI is still unsupported, but now CreateTable will
fail with an error message that GSI is not supported.
We need to change some of the tests which test the error path, and
expect an error - but should not consider a table creation error
as the expected error.
After this patch, test_gsi.py still fails all the tests on
Alternator, but much more quickly :-)
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190711161420.18547-1-nyh@scylladb.com>
The test case for adding two sets with common values is added.
This case is a stub, because boto3 transforms the result into a Python
set, which removes duplicates on its own. A proper TODO is left
in order to migrate this case to a lower-level API and check
the returned JSON directly for lack of duplicates.
The Query operation's conditions can be used to search for a particular
hash key or both hash and sort keys - but not any other combinations.
We previously forgot to verify most errors, so in this patch we add
missing verifications - and tests to confirm we fail the query when
DynamoDB does.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190711132720.17248-1-nyh@scylladb.com>
Add more tests for GSI - tests that DescribeTable describes the GSI,
and test the case of more than one GSI for a base table.
Unfortunately, creating an empty table with two GSIs routinely takes
on DynamoDB more than a full minute (!), so because we now have a
test with two GSIs, I had to increase the timeout in create_test_table().
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190711112911.14703-1-nyh@scylladb.com>
The holds_path() utility function is actually used to check if a value
needs read before write, so its name is changed to more fitting
check_needs_read_before_write.
Alternator currently keeps an item's attributes inside a map, and we
had a serious bug in the way we build mutations for this map:
We didn't know there was a requirement to build this mutation sorted by
the attribute's name. When we neglect to do this sorting, this confuses
Scylla's merging algorithms, which assume collection cells are thus
sorted, and the result can be duplicate cells in a collection, and the
visible effect is a mutation that seems to be ignored - because both
old and new values exist in the collection.
So this patch includes a new helper class, "attribute_collector", which
helps collect attribute updates (put and del) and extract them in correctly
sorted order. This helper class also eliminates some duplication of
arcane code to create collection cells or deletions of collection cells.
This patch includes a simple test that previously failed, and one xfail
test that failed just because of this bug (this was the test that exposed
this bug). Both tests now succeed.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190709160858.6316-1-nyh@scylladb.com>
This patch adds what is hopefully an exhaustive test suite for the
global secondary indexing (GSI) feature, and all its various
complications and corner cases of how GSIs can be created, deleted,
named, written, read, and more (the tests are heavily documented to
explain what they are testing).
All these tests pass on DynamoDB, and fail on Alternator, so they are
marked "xfail". As we develop the GSI feature in Alternator piece by
piece, we should make these tests start to pass.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190708160145.13865-1-nyh@scylladb.com>
This adds another test for BatchWriteItem: That if one of the operations is
invalid - e.g., has a wrong key type - the entire batch is rejected, and not
none of its operations are done - even the valid ones.
The test succeeds, because we already handle this case correctly.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190707134610.30613-1-nyh@scylladb.com>
Test an operation like SET #one = #two, where the RHS has a reference
to a name, rather than the name itself. Also verify that DynamoDB
gives an error if ExpressionAttributeNames includes names not needed
by neither left or right hand side of such assignments.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190708133311.11843-1-nyh@scylladb.com>
In order to serve update requests that depend on read-before-write,
a proper helper function which fetches the existing item with a given
key from the database is added.
This read-before-write mechanism is not considered safe, because it
provides no linearizability guarantees and offers no synchronization
protection. As such, it should be consider a placeholder that works
fine on a single machine and/or no concurrent access to the same key.
The calculate_value utility function is going to need more context
in order to resolve paths present in the right-hand side of update_item
operators: update_info and schema.
Historically, resolving a path checked for key columns, which are not
allowed to be on the left-hand side of the assignment. However, path
resolving will now also be used for right-hand side, where it should
be allowed to use the key value.
In order to implement read-before-write in the future, calculate_value
now accepts an additional parameter: previous_item. If read-before-write
was performed, previous_item will contain an item for the given key
which already exists in the database at the time of the update.
This patch moves the create_test_table() utility function, which creates
a test table with a unique name, from the fixtures (conftest.py) to
util.py. This will allow reusing this function in tests which need to
create tables but not through the existing fixtures. In particular
we will need to do this for GSI (global secondary index) tests
in the next patch.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190708104438.5830-1-nyh@scylladb.com>
The tests we had for BatchWriteItem's refusal to accept duplicate keys
only used test_table_s, with just a hash key. This patch adds tests
for test_table, i.e., a table with both hash and sort keys - to check
that we check duplicates in that case correctly as well.
Moreover, the expanded tests also verify that although identical
keys are not allowed, keys with just one component (hash or sort key)
the same but the other not the same - are fine.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190705191737.22235-1-nyh@scylladb.com>
Even when running against a local Alternator, Boto3 wants to know the
region name, and AWS credentials, even though they aren't actually needed.
For a local run, we can supply garbage values for these settings, to
allow a user who never configured AWS to run tests locally.
Running against "--aws" will, of course, still require the user to
configure AWS.
Also modified the README to be clearer, and more focused on the local
runs.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190708121420.7485-1-nyh@scylladb.com>
For "--aws" tests, use the default region chosen by the user in the
AWS configuration (~/.aws/config or environment variable), instead of
hard-coding "us-east-1".
Patch by Pekka Enberg.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190708105852.6313-1-nyh@scylladb.com>
Calculating value represented as 'v1 + v2' or 'v1 - v2' was previously
implemented with a double type, which offers limited precision.
From now on, these computations are based on big_decimal, which
allows returning values without losing precision.
This patch depends on 'add big_decimal arithmetic operators' series.
Message-Id: <f741017fe3d3287fa70618068bdc753bfc903e74.1562318971.git.sarna@scylladb.com>
Move some common utility functions to a common file "util.py"
instead of repeating them in many test files.
The utility functions include random_string(), random_bytes(),
full_scan(), full_query(), and multiset() (the more general
version, which also supports freezing nested dicts).
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190705081013.1796-1-nyh@scylladb.com>
The idiomatic way to use an std::variant depending the type holds is to use
std::visit. This modern API makes it unnecessary to write many boiler-plate
functions to test and cast the type of the variant, and makes it impossible
to forget one of the options. So in this patch we throw out the old ways,
and welcome the new.
Thanks to Piotr Sarna for the idea.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190704205625.20300-1-nyh@scylladb.com>
This patch adds to Alternator an implementation of the BatchGetItem
operation, which allows to start a number of GetItem requests in parallel
in a single request.
The implementation is almost complete - the only missing feature is the
ability to ask only for non-top-level attributes in ProjectionExpression.
Everything else should work, and this patch also includes tests which,
as usual, pass on DynamoDB and now also on Alternator.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Amazingly, it appears we never tested booting Alternator a second time :-)
Our initialization code creates a new keyspace, and was supposed to ignore
the error if this keyspace already existed - but we thought the error will
come as an exceptional future, which it didn't - it came as a thrown
exception. So we need to change handle_exception() to a try/catch.
With this patch, I can kill Alternator and it will correctly start again.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Operations which take a key as parameter, namely GetItem, UpdateItem,
DeleteItem and BatchWriteItem's DeleteRequest, already fail if the given
key is missing one of the nessary key attributes, or has the wrong types
for them. But they should also fail if the given key has spurious
attributes beyond those actually needed in a key.
So this patch adds this check, and tests to confirm that we do these checks
correctly.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
The PutItem operation, and also the PutRequest of BatchWriteItem, are
supposed to completely replace the item - not to merge the new value with
the previous value. We implemented this wrongly - we just wrote the new
item forgetting a tombstone to remove the old item.
So this patch fixes these operations, and adds tests which confirm the
fix (as usual, these tests pass on DynamoDB, failed on Alternator before
this patch, and pass after the patch).
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Add support for the DeleteItem operation, which deletes an item.
The basic deletion operation is supported. Still not supported are:
1. Parameters to conditionally delete (ConditionalExpression or Expected)
2. Parameters to return pre-delete content
3. ReturnItemCollectionMetrics (statistics relevant for tables with LSI)
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
In BatchWriteItem, we currently only support the PutRequest operation.
If a user tries to use DeleteRequest (which we don't support yet), he
will get a bizarre error. Let's test the request type more carefully,
and print a better error message. This will also be the place where
eventually we'll actually implement the DeleteRequest.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
This patch adds more comprehensive tests for the BatchWriteItem operation,
in a new file batch_test.py. The one test we already had for it was also
moved from test_item.py here.
Some of the test still xfail for two reasons:
1. Support for the DeleteRequest operation of BatchWriteItem is missing.
2. Tests that forbid duplicate keys in the same request are missing.
As usual, all tests succeed on DynamoDB, and hopefully (I tried...)
cover all the BatchWriteItem features.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
DynamoDB has two similar parameters - AttributesToGet and
ProjectionExpression - which are supported by the GetItem, Scan and
Query operations. Until now we supported only the older AttributesToGet,
and this patch adds support to the newer ProjectionExpression.
Besides having a different syntax, the main difference between
AttributesToGet and ProjectionExpression is that the latter also
allows fetching only a specific nested attribute, e.g., a.b[3].c.
We do not support this feature yet, although it would not be
hard to add it: With our current data representation, it means
fetching the top-level attribute 'a', whose value is a JSON, and then
post-filtering it to take out only the '.b[3].c'. We'll do that
later.
This patch also adds more test cases to test_projection_expression.py.
All tests except three which check the nested attributes now pass,
and those three xfail (they succeed on DynamoDB, and fail as expected
on Alternator), reminding us what still needs to be done.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Our GetItem, Query and Scan implementations support the AttributesToGet
parameter to fetch only a subset of the attributes, but we don't yet
support the more elaborate ProjectionExpression parameter, which is
similar but has a different syntax and also allows to specify nested
document paths.
This patch adds existive testing of all the ProjectionExpression features.
All these tests pass against DynamoDB, but fail against the current
Alternator so they are marked "xfail". These tests will be helpful for
developing the ProjectionExpression feature.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
The AttributesToGet parameter - saying which attributes to fetch for each
item - is already supported in the GetItem, Query and Scan operations.
However, we only had a test for it for it for Scan. This patch adds
similar tests also for the GetItem and Query operations.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Yet another test for overwriting a top-level attribute which contains
a nested document - here, overwriting it by just a string.
This test passes. In the current implementation we don't yet support
updates to specific attribute paths (e.g. a.b[3].c) but we do support
well writing and over-writing top-level attributes.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
This patch implements the last (finally!) syntactic feature of the
UpdateExpression - the ability to do SET a=val1+val2 (where, as
before, each of the values can be a reference to a value, an
attribute path, or a function call).
The implementation is not perfect: It adds the values as double-precision
numbers, which can lose precision. So the patch adds a new test which
checks that the precision isn't lost - a test that currently fails
(xfail) on Alternator, but passes on DynamoDB. The pre-existing test
for adding small integer now passes on Alternator.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
In the previous patch we added function-call support in the UpdateExpression
parser. In this patch we add support for one such function - list_append().
This function takes two values, confirms they are lists, and concatenates
them. After this patch only one function remains unimplemented:
if_not_exists().
We also split the test we already had for list_append() into two tests:
One uses only value references (":val") and passes after this patch.
The second test also uses references to other attributes and will only
work after we start supporting read-modify-write.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Until this patch, in update expressions like "SET a = :val", we only
allowed the right-hand-side of the assignment to be a reference to a
value stored in the request - like ":val" in the above example.
But DynamoDB also allows the value to be an attribute path (e.g.,
"a.b[3].c", and can also be a function of a bunch of other values.
This patch adds supports for parsing all these value types.
This patch only adds the correct parsing of these additional types of
values, but they are still not supported: reading existing attributes
(i.e., read-modify-write operations) is still not supported, and
none of the two functions which UpdateExpression needs to support
are supported yet. Nevertheless, the parsing is now correct, and the
the "unknown_function" test starts to pass.
Note that DynamoDB allows the right-hand side of an assignment to be
not only a single value, but also value+value and value-value. This
possibility is not yet supported by the parser and will be added
later.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
The test cases verify that equality-based filtering on non-key
attributes works fine. It also contains test stubs for key filtering
and non-equality attribute filtering.
Filled test table used to have identical non-key attributes for all
rows. These values are now diversified in order to allow writing
filtering test cases.
Filtering is currently only implemented for the equality operator
on non-key attributes.
Next steps (TODO) involve:
1. Implementing filtering for key restrictions
2. Implementing non-key attribute filtering for operators other than EQ.
It, in turn, may involve introducing 'map value restrictions' notion
to Scylla, since now it only allows equality restrictions on map
values (alternator attributes are currently kept in a CQL map).
3. Implementing FilterExpression in addition to deprecated QueryFilter
Before this patch, we read either an attribute name like "name" or
a reference to one "#name", as one type of token - NAME.
However, while attribute paths indeed can use either one, in some other
contexts - such as a function name - only "name" is allowed, so we
need to distinguish between two types of tokens: NAME and NAMEREF.
While separating those, I noticed that we incorrectly allowed a "#"
followed by *zero* alphanumeric characters to be considered a NAMEREF,
which it shouldn't. In other words, NAMEREF should have ALNUM+, not ALNUM*.
Same for VALREF, which can't be just a ":" with nothing after it.
So this patch fixes these mistakes, and adds tests for them.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
DynamoDB complains, and fails an update, if the update contains in
ExpressionAttributeNames or ExpressionAttributeValues names which aren't
used by the expression.
Let's do the same, although sadly this means more work to track which
of the references we've seen and which we haven't.
This patch makes two previously xfail (expected fail) tests become
successful tests on Alternator (they always succeeded against DynamoDB).
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
The existing tests in test_update_expression.py thoroughly tested the
UpdateExpression features which we currently support. But tests for
features which Alternator *doesn't* yet support were partial.
In this patch, we add a large number of new tests to
test_update_expression.py aiming to cover ALL the features of
UpdateExpression, regardless of whether we already support it in
Alternator or not. Every single feature and esoteric edge-case I could
discover is covered in these tests - and as far as I know these tests
now cover the *entire* UpdateExpression feature. All the tests succeed
on DynamoDB, and confirm our understanding of what DynamoDB actually does
on all these cases.
After this patch, test_update_expression.py is a whopper, with 752 lines of
code and 37 separate test functions. 23 out of these 37 tests are still
"xfail" - they succeed on DynamoDB but fail on Alternator, because of
several features we are still missing. Those missing features include
direct updates of nested attributes, read-modify-write updates (e.g.,
"SET a=b" or "SET a=a+1"), functions (e.g., "SET a = list_append(a, :val)"),
the ADD and DELETE operations on sets, and various other small missing
pieces.
The benefit of this whopper test is two-fold: First, it will allow us
to test our implementation as we continue to fill it (i.e., "test-
driven development"). Second, all these tested edge cases basically
"reverse engineer" how DynamoDB's expression parser is supposed to work,
and we will need this knowledge to implement the still-missing features of
UpdateExpression.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
This patch adds an extensive array of tests for UpdateItem's UpdateExpression
support, which was introduced in the previous patch.
The tests include verification of various edge cases of the parser, support
for ":value" and "#name" references, functioning SET and REMOVE operations,
combinations of multiple such operations, and much more.
As usual, all these tests were ran and succeed on DynamoDB, as well as on
Alternator - to confirm Alternator behaves the same as DynamoDB.
There are two tests marked "xfail" (expected to fail), because Alternator
still doesn't support the attribute copy syntax (e.g., "SET a = b",
doing a read-before-write).
There are some additional areas which we don't support - such as the DELETE
and ADD operations or SET with functions - but those areas aren't yet test
in these tests.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
For the UpdateItem operation, so far we supported updates via the
AttributeUpdates parameter, specifying which attributes to set or remove
and how. But this parameter is considered deprecated, and DynamoDB supports
a more elaborate way to modify attributes, via an "UpdateExpression".
In the previous patch we added a function to parse such an UpdateExpression,
and in this patch we use the result of this parsing to actually perform
the required updates.
UpdateExpression is only partially supported after this patch. The basic
"SET" and "REMOVE" operations are supported, but various other cases aren't
fully supported and will be fixed in followup patches. The following
patch will add extensive tests to confirm exactly what works correctly
with the new UpdateExpression support.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
The DynamoDB protocol is based on JSON, and most DynamoDB requests describe
the operation and its parameters via JSON objects such as maps and lists.
However, in some types of requests an "expression" is passed as a single
string, and we need to parse this string. These cases include:
1. Attribute paths, such as "a[3].b.c", are used in projection
expressions as well as inside other expressions described below.
2. Condition expressions, such as "(NOT (a=b OR c=d)) AND e=f",
used in conditional updates, filters, and other places.
3. Update expressions, such as "SET #a.b = :x, c = :y DELETE d"
This patch introduces the framework to parse these expressions, and
an implementation of parsing update expressions. These update expressions
will be used in the UpdateItem operation in the next patch.
All these expression syntaxes are very simple: Most of them could be
parsed as regular expressions, or at most a simple hand-written lexical
analyzer and recursive-descent parser. Nevertheless, we decided to specify
these parsers in the same ANTLR3 language already used in the Scylla
project for parsing CQL, hopefully making these parsers easier to reason
about, and easier to change if needed - and reducing the amount of boiler-
plate code.
The parsing of update expressions is most complete except that in SET
actions, only the "path = value" form is supported and not yet forms
forms such as "path1 = path2" (which does read-before-write) or
"path1 = path1 + value" or "path = function(...)".
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
We need to write more tests for various case of handling
nested documents and nested attributes. Let's collect them
all in the same test file.
This patch mostly moves existing code, but also adds one
small test, test_nested_document_attribute_write, which
just writes a nested document and reads it back (it's
mostly covered by the existing test_put_and_get_attribute_types,
but is specifically about a nested document).
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
We usually run Alternator tests against the local Alternator - testing
against AWS DynamoDB is rarer, and usually just done when writing the
test. So let's make "pytest" without parameters default to testing locally.
To test against AWS, use "pytest --aws" explicitly.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Attributes for reads (GetItem, Query, Scan, ...) and writes (PutItem,
UpdateItem, ...) are now serialized and deserialized in binary form
instead of raw JSON, provided that their type is S, B, BOOL or N.
Optimized serialization for the rest of the types will be introduced
as follow-ups.
Message-Id: <6aa9979d5db22ac42be0a835f8ed2931dae208c1.1559646761.git.sarna@scylladb.com>
Attributes used to be written into the database in raw JSON format,
which is far from optimal. This patch introduces more robust
serializationi routines for simple alternator types: S, B, BOOL, N.
Serialization uses the first byte to encode attribute type
and follows with serializing data in binary form.
More complex types (sets, lists, etc.) are currently still
serialized in raw JSON and will be optimized in follow-up patches.
Message-Id: <10955606455bbe9165affb8ac8fba4d9e7c3705f.1559646761.git.sarna@scylladb.com>
For some unknown reason we put the list of alternator source files
in configure.py inside the "api" list. Let's move it into a separate
list.
We could have just put it in the scylla_core list, but that would cause
frequent and annoying patch conflicts when people add alternator source
files and Scylla core source files concurrently.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
So far for UpdateItem we only supported the old-style AttributeUpdates
parameter, not the newer UpdateExpression. This patch begins the path
to supporting UpdateExpression. First, trying to use *both* parameters
should result in an error, and this patch does this (and tests this).
Second, passing neither parameters is allowed, and should result in
an *empty* item being created.
Finally, since today we do not yet support UpdateExpression, this patch
will cause UpdateItem to fail if UpdateExpression is used, instead of
silently being ignored as we did so far.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
This patch adds two simple tests for nested documents, which pass:
test_nested_document_attribute_overwrite() tests what happens when
we UpdateItem a top-level attribute to a dictionary. We already tested
this works on an empty item in a previous test, but now we check what
happens when the attribute already existed, and already was a dictionary,
and now we update it to a new dictionary. In the test attribute a was
{b:3, c:4} and now we update it to {c:5}. The test verifies that the new
dictionary completely replaces the old one - the two are not merged.
The new value of the attribute is just {c:5}, *not* {b:3, c:5}.
The second test verifies that the AttributeUpdates parameter of
UpdateItem cannot be used to update a just a nested attributes.
Any dots in the attribute name are considered an actual dot - not
part of a path of attribute names.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Comparing two lists of items without regard for order is not trivial.
For this reason some tests in test_query.py only compare arrays of sort
keys, and those tests are fine.
But other tests used a trick of converting a list of items into a
of set_of_frozen_elements() and compare this sets. This trick is almost
correct, but it can miss cases where items repeat.
So in this patch, we replace the set_of_frozen_elements() approach by
a similar one using a multiset (set with repetitions) instead of a set.
A multiset in Python is "collections.Counter". This is the same approach
we started to also used in test_scan.py in a recent patch.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Remove the incomplete and unused function to convert DynamoDB type names
to ScyllaDB type objects:
DynamoDB has a different set of types relevant for keys and for attributes.
We already have a separate function, parse_key_type(), for parsing key
types, and for attributes - we don't currently parse the type names at
all (we just save them as JSON strings), so the function we removed here
wasn't used, and was in fact #if'ed out. It was never completed, and it now
started to decay (the type for numbers is wrong), so we're better off
completely removing it.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
This patch implements a fully working number type for keys, and now
Alternator fully and correctly supports every key type - strings, byte
arrays, and numbers.
The patch also adds a test which verifies that Scylla correctly sorts
number sort keys, and also correctly retrieves them to the full precision
guaranteed by DynamoDB (38 decimal digits).
The implementation uses Scylla's "decimal" type, which supports arbitrary
precision decimal floating point, and in particular supports the precision
specified by DynamoDB. However, "decimal" is actually over-qualified for
this use, so might not be optimal for the more specific requirements of
DynamoDB. So a FIXME is left to optimize this case in the future.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Comparing two lists of items without regard for order is not trivial.
test_scan.py currently has two ways of doing this, both unsatisfactory:
1. We convert each list to a set via set_of_frozen_elements(), and compare
the sets. But this comparison can miss cases where items repeat.
2. We use sorted() on the list. This doesn't work on Python 3 because
it removed the ability to compare (with "<") dictionaries.
So in this patch, we replace both by a new approach, similar to the first
one except we use a multiset (set with repetitions) instead of a set.
A multiset in Python is "collections.Counter".
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Creating and deleting tables is the slowest part of our tests,
so we should lower the number of tables our tests create.
We had a "test_2_tables" fixture as a way to create two
tables, but since our tests already create other tables
for testing different key types, it's faster to reuse those
tables - instead of creating two more unused tables.
On my system, a "pytest --local", running all 38 tests
locally, drops from 25 seconds to 20 seconds.
As a bonus, we also have one fewer fixture ;-)
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
to 1024 bytes, and the entire item to 400 KB which therefore also
limits the size of one attribute. This test checks that we can
reach up to these limits, with binary keys and attributes.
The test does *not* check what happens once we exceed these
limits. In such a case, DynamoDB throws an error (I checked that
manually) but Alternator currently simply succeeds. If in the
future we decide to add artificial limits to Alternator as well,
we should add such tests as well.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
"len" is an unfortunate choice for a variable name, in case one
day the implementation may want to call the built-in "len" function.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
We already have a test for *string* sort-key ordering of items returned
by the Scan operation, and this test adds a similar test for the Query
operation. We verify that items are retrieved in the desired sorted
order (sorted by the aptly-named sort key) and not in creation order
or any other wrong order.
But beyond just checking that Query works as expected (it should,
given it uses the same machinary as Scan), the nice thing about this
test is that it doesn't create a new table - it uses a shared table
and creates one random partition inside it. This makes this test
faster and easier to write (no need for a new fixture), and most
importantly - easily allows us to write similar tests for other
key types.
So this patch also tests the correct ordering of *binary* sort keys.
It helped exposed bugs in previous versions of the binary key implementation.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Simple tests for item operations (PutItem, GetItem) with binary key instead
of string for the hash and sort keys. We need to be able to store such
keys, and then retrieve them correctly.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Until now we only supported string for key columns (hash or sort key).
This patch adds support for the bytes type (a.k.a binary or blob) as well.
The last missing type to be supported in keys is the number type.
Note that in JSON, bytes values are represented with base64 encoding,
so we need to decode them before storing the decoded value, and re-encode
when the user retrieves the value. The decoding is important not just
for saving storage space (the encoding is 4/3 the size of the decoded)
but also for correct *sorting* of the binary keys.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
The DynamoDB API uses base64 encoding to encode binary blobs as JSON
strings. So we need functions to do these conversions.
This code was "inspired" by https://github.com/ReneNyffenegger/cpp-base64
but doesn't actually copy code from it.
I didn't write any specific unit tests for this code, but it will be
exercised and tested in a following patch which tests Alternator's use
of these functions.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
BEGINS_WITH behaves in a special way when a key postfix
consists of <255> bytes. The initial test does not use that
and instead checks UTF-8 characters, but once bytes type
is implemented for keys, it should also test specifically for
corner cases, like strings that consist of <255> byte only.
Message-Id: <fe10d7addc1c9d095f7a06f908701bb2990ce6fe.1558603189.git.sarna@scylladb.com>
BEGINS_WITH statement increments a string in order to compute
the upper bound for a clustering range of a query.
Unfortunately, previous implementation was not correct,
as it appended a <0> byte if the last character was <255>,
instead of incrementing a last-but-one character.
If the string contains <255> bytes only, the upper bound
of the returned upper bound is infinite.
Message-Id: <3a569f08f61fca66cc4f5d9e09a7188f6daad578.1558524028.git.sarna@scylladb.com>
We had several places in the code that need to parse the
ConsistentRead flag in the request. Let's add a function
that does this, and while at it, checks for more error
cases and also returns LOCAL_QUORUM and LOCAL_ONE instead
of QUORUM and ONE.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
As Shlomi suggested in the past, it is more likely that when we
eventually support global tables, we will use LOCAL_QUORUM,
not QUORUM. So let's switch to that now.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
So far, all of the tests in test_item.py (for PutItem, GetItem, UpdateItem),
were arbitrarily done on a test table with both hash key and sort key
(both with string type). While this covers most of the code paths, we still
need to verify that the case where there is *not* a sort key, also works
fine. E.g., maybe we have a bug where a missing clustering key is handled
incorrectly or an error is incorrectly reported in that case?
But in this patch we add tests for the hash-key-only case, and see that
it already works correctly. No bug :-)
We add a new fixture test_table_s for creating a test table with just
a single string key. Later we'll probably add more of these test tables
for additional key types.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Another type of key type error can be to forget part of the key
(the hash or sort key). Let's test that too (it already works correctly,
no need to patch the code).
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
When a table has a hash key or sort key of a certain type (this can
be string, bytes, or number), one cannot try to choose an item using
values of different types.
We previously did not handle this case gracefully, and PutItem handled
it particularly bad - writing malformed data to the sstable and basically
hanging Scylla. In this patch we fix the pk_from_json() and ck_from_json()
functions to verify the expected type, and fail gracefully if the user
sent the wrong type.
This patch also adds tests for these failures, for the GetItem, PutItem,
and UpdateItem operations.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
According to the documentation, trying to GetItem a non-existant item
should result in an empty response - NOT a response with an empty "Item"
map as we do before this patch.
This patch fixes this case, and adds a test case for it. As usual,
we verify that the test case also works on Amazon DynamoDB, to verify
DynamoDB really behaves the way we thik it does.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
If an empty item (i.e., no attributes except the key) is created, or an item
becomes empty (by deleting its existing attributes), the empty item must be
maintained - it cannot just disappear. To do this in Scylla, we must add a
row marker - otherwise an empty attribute map is not enough to keep the
row alive.
This patch includes 4 test cases for all the various ways an empty item can be
created empty or non-empty item be emptied, and verifies that the empty item
can be correctly retrieved (as usual, to verify that our expectation of
"correctness" is indeed correct, we run the same tests against DynamoDB).
All these 4 tests failed before this patch, and now succeed.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
These lines of codes were superfluous and their result unused: the
make_item_mutation() function finds the pk and ck on its own.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
his patch adds a statistics framework to Alternator: Executor has (for
each shard) a _stats object which contains counters for various events,
and also is in charge of making these counters visible via Scylla's regular
metrics API (http://localhost:9180/metrics).
This patch includes a counter for each of DynamoDB's operation types,
and we increase the ones we support when handled. We also added counters
for total operations and unsupported operations (operation types we don't
yet handle). In the future we can easily add many more counters: Define
the counter in stats.hh, export it in stats.cc, and increment it in
where relevant in executor.cc (or server.cc).
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Ask to retrieve only an attribute name which *none* of the items have.
The result should be a silly list of empty items, and indeed it is.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Use full_scan() in another test instead of open-coding the scan.
There are two more tests that could have used full_scan(), but
since they seem to be specifically adding more assertions or
using a different API ("paginators"), I decided to leave them
as-is. But new tests should use full_scan().
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
This is a short, but extensive, test to the AttributesToGet parameter
to Scan, allowing to select for output only some of the attributes.
The AttributesToGet feature has several non-obvious features. Firstly,
it doesn't require that any key attributes be selected. So since each
item may have different non-key attributes, some scanned items may
be missing some of the selected columns, and some of the items may
even be missing *all* the selected columns - in which case DynamoDB
returns an empty item (and doesn't entirely skip this item). This
test covers all these cases, and it adds yet another item to the
'filled_test_table' fixture, one which has different attributes,
so we can see these issues.
As usual, this test passes in both DynamoDB and Alternator, to
assure we correspond to the *right* behavior, not just what we
think is right.
This test actually exposed a bug in the way our code returned
empty items (items which had none of the selected columns),
a bug which was fixed by the previous patch.
Instead of having yet another copy of table-scanning code, this
patch adds a utility function full_scan(), to scan an entire
table (with optional extra parameters for the scan) and return
the result as an array. We should simply existing tests in
test_scan.py by using this new function.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
* tag 'dbuild-image-help-usage-v1' of github.com:bhalevy/scylla:
dbuild: add usage
dbuild: add help option
dbuild: list available images when no image arg is given
dbuild: add --image option
When a Scan selects only certain attributes, and none of the key
attributes are selected, for some of the scanned items *nothing*
will remain to be output, but still Dynamo outputs an empty item
in this case. Our code had a bug where after each item we "moved"
the object leaving behind a null object, not an empty map, so a
completely empty item wasn't output as an empty map as expected,
and resulted in boto3 failing to parse the response.
This simple one-line patch fixes the bug, by resetting the item
to an empty map after moving it out.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Instead of blindly returning "localhost:8000" in response to
DescribeEndpoints and for sure causing us problems in the future,
the right thing to do is to return the same domain name which the
user originally used to get to us, be it "localhost:8000" or
"some.domain.name:1234". But how can we know what this domain name
was? Easy - this is why HTTP 1.1 added a mandatory "Host:" header,
and the DynamoDB driver I tested (boto3) adds it as expected,
indeed with the expected value of "localhost:8000" on my local setup.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Although different partitions are returned by a Scan in (seemingly)
random order, items in a single partition need to be returned sorted
by their sort key. This adds a test to verify this.
This patch adds to the filled_test_table fixture, which until now
had just one item in each partition, another partition (with the key
"long") with 164 additional items. The test_scan_sort_order_string
test then scans this table, and verifies that the items are really
returned in sorted order.
The sort order is, of course, string order. So we have the first
item with sort key "1", then "10", then "100", then "101", "102",
etc. When we implement numeric keys we'll need to add a version
of this test which uses a numeric clustering key and verifies the
sort order is numeric.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Because of a typo, we incorrectly set the table's sort key as a second
partition key column instead of a clustering key column. This has bad
but subtle consequences - such as that the items are *not* sorted
according to the sort key. So in this patch we fix the typo.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
DescribeEndpoints is not a very important API (and by default, clients
don't use it) but I wanted to understand how DynamoDB responds to it,
and what better way than to write a test :-)
And then, if we already have a test, let's implement this request in
Scylla as well. This is a silly implementation, which always returns
"localhost:8000". In the future, this will need to be configurable -
we're not supposed here to return *this* server's IP address, but rather
a domain name which can be used to get to all servers.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
"
Currently, GDB scripts locate sstables by scanning the heap for
bag_sstable_set containers. That has disadvatanges:
- not all containers are considered
- it's extremely slow on large heaps
- fragile, new containers can be added, and we won't even know
This series fixes all above by adding a per-shard sstable tracker
which tracks sstable objects in a linked-list.
"
* 'sstable-tracker' of github.com:tgrabiec/scylla:
gdb: Use sstable tracker to get the list of sstables
gdb: Make intrusive_list recognize member_hook links
sstables: Track whether sstable was already open or not
sstables: Track all instances of sstable objects
sstables: Make sstable object not movable
sstables: Move constructor out of line
Most of the request types need to a TableName parameter, specifying the
name of the table they operate on. There's a lot of boilerplate code
required to get this table name and verify that it is valid (the parameter
exists, is a string, passes DynamoDB's naming rules, and the table
actually exists), which resulted in a lot of code duplication - and
in some cases missing checks.
So this patch introduces two utility functions, get_table_name()
and get_table(), to fetch a table name or the schema of an existing
table, from the request, with all necessary validation. If validation
fails, the appropriate api_error() is thrown so the user gets the
right error message.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Remove unused random-string code from conftest.py, and also add a
TODO comment how we should speed up filled_test_table fixture by
using a batch write - when that becomes available in Alternator.
(right now this fixture takes almost 4 seconds to prepare on a local
Alternator, and a whopping 3 minutes (!) to prepare on DynamoDB).
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
The test test_put_and_get_attribute_types needlessly named all the
different attributes and their variables, causing a lot of repetition
and chance for mistakes when adding additional attributes to the test.
In this rewrite, we only have a list of items, and automatically build
attributes with them as values (using sequential names for the attributes)
and check we read back the same item (Python's dict equality operator
checks the equality recursively, as expected).
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Although we planned to initially support only string types, it turns out
for the attributes (*not* the key), we actually support all types already,
including all scalar types (string, number, bool, binary and null) and
more complex types (list, nested document, and sets).
This adds a tests which PutItem's these types and verifies that we can
retrieve them.
Note that this test deals with top-level attributes only. There is no
attempt to modify only a nested attribute (and with the current code,
it wouldn't work).
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
In our tests, we cannot really assume that ListTables should returns *only*
the tables we created for the test, or even that a page size of 100 will
be enough to list our 3 pages. The issue is that on a shared DynamoDB, or
in hypothetical cases where multiple tests are run in parallel, or previous
tests had catestrophic errors and failed to clean up, we have no idea how
many unrelated tables there are in the system. There may be hundreds of
them. So every ListTables test will need to use paging.
So in this re-implementation, we begin with a list_tables() utility function
which calls ListTables multiple times to fetch all tables, and return the
resulting list (we assume this list isn't so huge it becomes unreasonable
to hold it in memory). We then use this utility function to fetch the table
list with various page sizes, and check that the test tables we created are
listed in the resulting list.
There's no longer a separate test for "all" tables (really was a page of 100
tables) and smaller pages (1,2,3,4) - we now have just one test that does the
page sizes 1,2,3,4, 50 and 100.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
This patch cleans up some comments and reorganizes some functions in
conftest.py, where the test_table fixture was defined. The goal is to
later add additional types of test tables with different schemas (e.g.,
just a partition key, different key types, etc.) without too much
code duplication.
This patch doesn't change anything functional in the tests, and they
still pass ("pytest --local" runs all tests against the local Alternator).
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
The ck_from_json() utility function is easier to use if it handles
the no-clustering-key case as the callers need them too, instead of
requiring them to handle the no-clustering-key case separately.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
So far we supported UpdateItem only with PUT operations - this patch
adds support for DELETE operations, to delete specific attributes from
an item.
Only the case of a missing value is support. DynamoDB also provides
the ability to pass the old value, and only perform the deletion if
the value and/or its type is still up-to-date - but we don't support
this yet and fail such request if it is attempted.
This patch also includes a test for this case in alternator-test/
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Add initial tests for UpdateItem. Only the features currently supported
by our code (only string attributes, only "PUT" action) are tested.
As usual, this test (like all others) was tested to pass on both DynamoDB
and Alternator.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Add an initial UpdateItem implementation. As PutItem and GetItem we
are still limited to string attributes. This initial implementation
of UpdateItem implements only the "PUT" action (not "DELETE" and
certainly not "ADD") and not any of the more advanced options.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
All operation-generated error messages should have the 400 HTTP error
code. It's a real nag to have to type it every time. So make it the
default.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Without special options, PutItem should return nothing (an empty
JSON result). Previously we had trouble doing this, because instead
of return an empty JSON result, we converted an empty string into
JSON :-) So the existing code had an ugly workaround which worked,
sort of, for the Python driver but not for the Java driver.
The correct fix, in this patch, is to invent a new type json_string
which is a string *already* in JSON and doesn't need further conversion,
so we can use it to return the empty result. PutItem now works from
YCSB's Java driver.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Although we would like to allow table names up to 222 bytes, this is not
currently possible because Scylla tacks additional 33 bytes to create
a directory name, and directory names are limited to 255 bytes.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
The supported key types are just S(tring), B(lob), or N(umber).
Other types are valid for attributes, but not for keys, and should
not be accepted. And wrong types used should result in the appropriate
user-visible error.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
To be correct, CreateTable's input parsing need to work in reverse from
what it did: First, the key columns are listed in KeySchema, and then
each of these (and potetially more, e.g., from indexes) need to appear
AttributeDefinitions.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Without any arguments, PutItem should return no data at all. But somehow,
for reasons I don't understand, the boto3 driver gets confused from an
empty JSON thinking it isn't JSON at all. If we return a structure with
an empty "attributes" fields, boto3 is happy.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Add an initial implementation of Delete table, enough for making the
pytest --local test_table.py::test_create_and_delete_table
Pass.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
When given an unknown operation (we didn't implement yet many of them...)
we should throw the appropriate api_error, not some random exception.
This allows the client to understand the operation is not supported
and stop retrying - instead of retrying thinking this was a weird
internal error.
For example the test
pytest --local test_table.py::test_create_and_delete_table
Now fails immediately, saying Unsupported operation DeleteTable.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
The structure's name in DescribeTable's output is supposed to be called
"Table", not "TableDescription". Putting in the wrong place caused the
driver's table creation waiters to fail.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
validate table name in CreateTable, and if it doesn't fit DynamoDB's
requirement, return the appropriate error as drivers expect.
With this patch, test_table.py::test_create_table_unsupported_names
now passes (albeit with a one minute pause - this a bug with keep-alive
support...).
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Check the expected error message to contain just ValidationException
instead of an overly specific text message from DynamoDB, so we aren't
so constraint in our own messages' wording.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Dynamo allows tables names up to 255 characters, but when this is tested on
Alternator, the results are disasterous: mkdir with such a long directory
name fails, Scylla considers this an unrecoverable "I/O error", and exits
the server.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Start to use "test fixtures" defined in conftest.py: The connection to
the DynamoDB API, and also temporary tables, can be reused between multiple
tests.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
This initial implementation is enough to pass a test of getting a
failure for a non-existant table -
test_table.py::test_describe_table_non_existent_table
and to recognize an existing table. But it's still missing a lot
of fields for an existing table (among others, the schema).
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Exceptions from the handlers need to be output in a certain way - as
a JSON with specific fields - as DynamoDB drivers expect them to be.
If a handler throws an alternator::api_error with these specific fields,
they are output, but any other exception is converted into the same
format as an "Internal Error".
After this patch, executor code can throw an alternator::api_error and
the client will receive this error in the right format.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
DynamoDB error messages are returned in JSON format and expect specific
information: Some HTTP error code (often but not always 400), a string
error "type" and a user-readable message. Code that wants to return
user-visible exceptions should use this type, and in the next patch we
will translate it to the appropriate JSON string.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
The "Timestamp" type returned for CreationDateTime can be one of several
things but if it is a number, it is supposed to be the time in *seconds*
since the epoch - not in milliseconds. Returning milliseconds as we
wrongly did causes boto3 (AWS's Python driver) to throw a parse exception
on this response.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Until now, we always opened the Alternator port along with Scylla's
regular ports (CQL etc.). This should really be made optional.
With this patch, by default Alternator does NOT start and does not
open a port. Run Scylla with --alternator-port=8000 to open an Alternator
API port on port 8000, as was the default until now. It's also possible
to set this in scylla.yaml.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
The interface works on port 8000 by default and provides
the most basic alternator operations - it's an incomplete
set without validation, meant to allow testing as early as possible.
Some sstable objects correspond to sstables which are being written
and are not sealed yet. Such sstables don't have all the fields
filled-in. Tools which calculate statistics (like GDB scripts) need to
distinguish such sstables.
There is no reason to keep parts of the the Scylla Metadata component in memory
after it is read, parsed, and its information fed into the SSTable.
We have seen systems in which the Scylla metadata component is one
of the heaviest memory users, more than the Summary and Filter.
In particular, we use the token metadata, which is the largest part of the
Scylla component, to calculate a single integer -> the shards that are
responsible for this SSTable. Once we do that, we never use it again
Tests: unit (release/debug), + manual scylla write load + reshard.
Fixes#4951
Signed-off-by: Glauber Costa <glauber@scylladb.com>
Introduce mutation_fragment_stream_validator class and use it as a
Filter to flat_mutation_reader::consume_in_thread from
sstable::write_components to validate partition region and optionally
clustering key monotonicity.
Fixes#4803
key monotonicity validation requires an overhead to store the last key and also to compare
therefore provide an option to enable/disable it (disabled by default).
Refs #4804
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Storing and comparing keys is expensive.
Add a flag to enable/disable this feature (disabled by default).
Without the flag, only the partition region monotonicity is
validated, allowing repeated clustering rows, regardless of
clustering keys.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
The respective constructor is explicit.
Define this assignment operator to be used by flat_mutation_reader
mutation_fragment_stream_validator filter so that it can use
mutation_fragment::position() verbatim and keep its internal
state as position_in_partition.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Recently we started to use more the concept of metric labels - several
metrics which share the same name, but differ in the value of some label
such a "group" (for different scheduling groups).
This patch documents this feature in docs/metrics.md, gives the example of
scheduling groups, and explains a couple more relevant Promethueus syntax
tricks.
Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20190909113803.15383-1-nyh@scylladb.com>
* seastar cb7026c16f...b3fb4aaab3 (10):
> Revert "scheduling groups: Adding per scheduling group data support"
> scheduling groups: Adding per scheduling group data support
> rpc: check that two servers are not created with the same streaming id
> future: really ignore exceptions in ignore_ready_future
> iostream: Constify eof() function
> apply.hh: add missing #include for size_t
> scheduling_group_demo: add explicit yields since future::get() no longer does
> Fix buffer size used when calling accept4()
> future-util: reduce allocations and continuations in parallel_for_each
> rpc: lz4_decompressor: Add a static constexpr variable decleration for Cpp14 compatibility
Previously, the gate could get
closed too early, which would result in shutting down the server
before it had an opportunity to respond to the client.
Refs #4818
"
The release notes for boost 1.67.0 includes:
Breaking Change: When converting a multiprecision integer to a narrower type, if the value is too large (or negative) to fit in the smaller type, then the result is either the maximum (or minimum) value of the target
Since we just moved out of boost 1.66, we have to update our code.
This fixes issue #4960
"
* 'espindola/fix-4960' of https://github.com/espindola/scylla:
types: fix varint to integer conversion
types: extract a from_varint_to_integer from make_castas_fctn_from_decimal_to_integer
types: fix decimal to integer conversion
types: extract helper for converting a decimal to a cppint
types: rename and detemplate make_castas_fctn_from_decimal_to_integer
"
With this patch series one has to be explicit to create a date_type_impl and now there is only the one documented difference between date_type_impl and timestamp_type_impl.
"
* 'espindola/simplify-date-type' of https://github.com/espindola/scylla:
types: Reduce duplication around date_type_impl
types: Don't use date_type_native_type when we want a timestamp
types: Remove timestamp_native_type
types: Don't specialize data_type_for for db_clock::time_point
types: Make it harder to create date_type
According to the comments, the only different between date_type_impl
and timestamp_type_impl is the comparison function.
This patch makes that explicit by merging all code paths except:
* The warning when converting between the two
* The compare function
The date_type_impl type can still be user visible via very old
sstables or via the thrift protocol. It is not clear if we still need
to support either, but with this patch it is easy to do so.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
In these cases it is pretty clear that the original code wanted to
create a timestamp_type data_value but was creating a date_type one
because of the old defaults.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Now that we know that anything expecting a date_type has been
converted to date_type_native_type, switch to using
db_clock::time_point when we want a timestamp_type.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
This also moves every user to date_type_native_type. A followup patch
will convert to timestamp_type when appropriate.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
date_type was replaced with timestamp_type, but it was very easy to
create a date_type instead of a timestamp_type by accident.
This patch changes the code so that a date_type is no longer
implicitly used when constructing a data_value. All existing code that
was depending on this is converted to explicitly using
date_type_native_type. A followup patch will convert to timestamp_type
when appropriate.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Commit 7e3805ed3d removed the load balancing code from cql
server, but it did not remove most of the craft that load balancing
introduced. The most of the complexity (and probably the main reason the
code never worked properly) is around service::client_state class which
is copied before been passed to the request processor (because in the past
the processing could have happened on another shard) and then merged back
into the "master copy" because a request processing may have changed it.
This commit remove all this copying. The client_request is passed as a
reference all the way to the lowest layer that needs it and it copy
construction is removed to make sure nobody copies it by mistake.
tests: dev, default c-s load of 3 node cluster
Message-Id: <20190906083050.GA21796@scylladb.com>
"
This avoids a double dispatch on _kind and also removes a few shared_ptr copies.
The extra work was a small regression from the recent types refactoring.
"
* 'espindola/optimize_type_find' of https://github.com/espindola/scylla:
types: optimize type find implementation
types: Avoid shared_ptr copies
Currently when an error happens during the receive and distribute phase
it is swallowed and we just return a -1 status to the remote. We only
log errors that happen during responding with the status. This means
that when streaming fails, we only know that something went wrong, but
the node on which the failure happened doesn't log anything.
Fix by also logging errors happening in the receive and distribute
phase. Also mention the phase in which the error happened in both error
log messages.
Refs: #4901
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20190903115735.49915-1-bdenes@scylladb.com>
The previous code was using the boost::multiprecision::cpp_int to
integer conversion, but that doesn't have the same semantics an cql
for signed numbers.
This fixes the dtest cql_cast_test.py:CQLCastTest.cast_varint_test.
Fixes#4960
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
The previous code was using the boost::multiprecision::cpp_rational to
integer conversion, but that doesn't have the same semantics an cql.
This patch avoids creating a cpp_rational in the first place and works
just with integers.
This fixes the dtest cql_cast_test.py:CQLCastTest.cast_decimal_test.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
on_down() iterates over _view_update_handlers_list, but it yields during iteration,
and while it yields, elements in that list can be removed, resulting in a
use-after-free.
Prevent this by registering iterators that can be potentially invalidated, and
any time we remove an element from the list, check whether we're removing an element
that is being pointed to by a live iterator. If that is the case, advance the iterator
so that it points at a valid element (or at the end of the list).
Fixes#4912.
Tests: unit (dev)
Currently the background schema sync (push/pull) uses frozen mutation to
send the schema mutations over the wire to the remote node. For this to
work correctly, both nodes have to have the exact same schema for the
system schema tables, as attempting to unpack the frozen mutation with
the wrong schema leads to undefined behaviour.
To avoid this and to ensure syncing schema between nodes with different
schema table schema versions is defined we migrate the background
schema sync to use canonical mutations for the transfer of the schema
mutations. Canonical mutations are immune to this problem, as they
support deserializing with any version of the schema, older or newer
one.
The foreground schema sync mechanisms -- the on-demand schema pulls on
reads and writes -- already use canonical mutations to transmit the
schema mutations.
It is important to note that due to this change, column-level
incompatibilities between the schema mutations and the schema used to
deserialize them will be hidden. This is undesired and should be fixed
in a follow-up (#4956). Table level incompatibilities are detected and
schema mutations containing such mutations will be rejected just like before.
This patch adds canonical mutation support to the two background schema
sync verbs:
* `DEFINITIONS_UPDATE` (schema push)
* `MIGRATION_REQUEST` (schema pull)
Both verbs still support the old frozen mutation schema transfer, albeit
that path is now much less efficient. After all nodes are upgraded, the
pull verb can effectively avoid sending frozen mutations altogether,
completely migrating to canonical mutations. Unfortunately this was not
possible for the push verb, so that one now has an overhead as it needs
to send both the frozen and canonical mutations.
Fixes: #4273
The previous code was not exception safe and would eventually cause a
file to be destroyed without being closed, causing an assert failure.
Unfortunately it doesn't seem to be possible to test this without
error injection, since using an invalid directory fails before this
code is executed.
Fixes#4948
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20190904002314.79591-1-espindola@scylladb.com>
The verbs are:
* DEFINITIONS_UPDATE (push)
* MIGRATION_REQUEST (pull)
Support was added in a backward-compatible way. The push verb, sends
both the old frozen mutation parameter, and the new optional canonical
mutation parameter. It is expected that new nodes will use the latter,
while old nodes will fall-back to the former. The pull verb has a new
optional `options` parameter, which for now contains a single flag:
`remote_supports_canonical_mutation_retval`. This flag, if set, means
that the remote node supports the new canonical mutation return value,
thus the old frozen mutations return value can be left empty.
In preparation to the schema push/pull migrating to use canonical
mutations, convert the method producing the schema mutations to return a
vector of canonical mutations. The only user, MIGRATION_REQUEST verb,
converts the canonical mutations back to frozen mutations. This is very
inefficient, but this path will only be used in mixed clusters. After
all nodes are upgraded the verb will be sending the canonical mutations
directly instead.
This turns find into a template so there is only one switch over the
kind of each type in the search.
To evaluate the change in code size sizes, I added [[noinline]] to
find and obtained the following results.
The release columns for release in the before case have an extra column
because the functions are sufficiently complex to trigger gcc to split
them in hot + cold.
before:
dev release (hot + cold split)
find 0x35f = 863 0x3d5 + 0x112 = 1255
references_duration 0x62 + 0x22 + 0x8 = 140 0x55 + 0x1f + 0x2a + 0x8 = 166
references_user_type 0x6b + 0x26 + 0x111 = 418 0x65 + 0x1f + 0x32 + 0x11b = 465
after:
dev release
find 0xd6 + 0x1b4 = 650 0xd2 + 0x1f5 = 711
references_duration 0x13 = 19 0x13 = 19
references_user_type 0x1a = 26 0x21 = 33
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
They are somewhat expensive (in code size at least) and not needed
everywhere.
Inside the getter the variables are 'const data_type&', so we can
return that. Everything still works when a copy is needed, but in code
that just wants to check a property we avoid the copy.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
During CQL request processing, a gate is used to ensure that
the connection is not shut down until all ongoing requests
are done. However, the gate might have been left too early
if the database was not ready to respond immediately - which
could result in trying to respond to an already closed connection
later. This issue is solved by postponing leaving the gate
until the continuation chain that handles the request is finished.
Refs #4808
* 'cleanup_sstables' of https://github.com/asias/scylla:
sstables: Move leveled_compaction_strategy implementation to source file
sstables: Include dht/i_partitioner.hh for dht::partition_range
Since nonroot mode requires to run everything on non-privileged user,
most of setup scripts does not able to use nonroot mode.
We only provide following functions on nonroot mode:
- EC2 check
- IO setup
- Node exporter installer
- Dev mode setup
Rest of functions will be skipped on scylla_setup.
To implement nonroot mode on setup scripts, scylla_util provides
utility functions to abstract difference of directory structure between normal
installation and nonroot mode.
Since systemd unit can override parameters using drop-in unit, we don't need
mustache template for them.
Also, drop --disttype and --target options on install.sh since it does not
required anymore, introduce --sysconfdir instead for non-redhat distributions.
Since ac9b115, we switched to install.sh on Debian so we don't rely on .deb
specific packaging scripts anymore.
Signed-off-by: Takuya ASADA <syuu@scylladb.com>
Now that compaction returns only after the compacted sstables are
deleted we no longer need to stop the base to force waiting
for deletes (that were previously done asynchronously)
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Merged patch series by Amnon Heiman:
This patch fixes a bug that a map is held on the stack and then is used
by a future.
Instead, the map is now moved to the relevant lambda function.
Fixes#4824
Stopping services which occurs in a destructor of deferred_action
should not throw, or it will end the program with
terminate(). View builder breaks a semaphore during its shutdown,
which results in propagating a broken_semaphore exception,
which in turn results in throwing an exception during stop().get().
In order to fix that issue, semaphore exceptions are explicitly
ignored, since they're expected to appear during shutdown.
Fixes#4875
To prevent termination with SIGILL, tighten the instruction set
support checks. First, check for CLMUL too. Second, add a check in
scylla_prepare to catch the problem early.
Fixes#4921.
Scylla requires the CLMUL and SSE 4.2 instruction sets and will fail without them.
There is a check in main(), but that happens after the code is running and it may
already be too late. Add a check in scylla_prepare which runs before the main
executable.
"
It is well known that seastar applications, like Scylla, do not play
well with external processes: CPU usage from external processes may
confuse the I/O and CPU schedulers and create stalls.
We have also recently seen that memory usage from other application's
anonymous and page cache memory can bring the system to OOM.
Linux has a very good infrastructure for resource control contributed by
amazingly bright engineers in the form of cgroup controllers. This
infrastructure is exposed by SystemD in the form of slices: a
hierarchical structure to which controllers can be attached.
In true systemd way, the hierarchy is implicit in the filenames of the
slice files. a "-" symbol defines the hierarchy, so the files that this
patch presents, scylla-server and scylla-helper, essentially create a
"scylla" cgroup at the top level with "server" and "helper" children.
Later we mark the Services needed to run scylla as belonging to one
or the other through the Slice= directive.
Scylla DBAs can benefit from this setup by using the systemd-run
utility to fire ad-hoc commands.
Let's say for example that someone wants to hypothetically run a backup
and transfer files to an external object store like S3, making sure that
the amount of page cache used won't create swap pressure leading to
database timeouts.
One can then run something like:
sudo systemd-run --uid=id -u scylla --gid=id -g scylla -t --slice=scylla-helper.slice /path/to/my/magical_backup_tool
(or even better, the backup tool can itself be a systemd timer)
"
* 'slices' of https://github.com/glommer/scylla:
systemd: put scylla processes in systemd slices.
move postinst steps to an external script
"
The warning for discarded futures will only become useful, once we can
silence all present warnings and flip the flag to make it become error.
Then it will start being useful in finding new, accidental discarding of
futures.
This series silences all remaining warnings in the Scylla codebase. For
those cases where it was obvious that the future is discarded on
purpose, the author taking all necessary precaution (handling exception)
the warning was simply silenced by casting the future to void and
adding a relevant comment. Where the discarding seems to have been done
in error, I have fixed the code to not discard it. To the rest of the
sites I added a FIXME to fix the discarding.
"
* 'resolve-discarded-future-warnings/v4.2' of https://github.com/denesb/scylla:
treewide: silence discarded future warnings for questionable discards
treewide: silence discarded future warnings for legit discards
tests: silence discarded future warnings
tests/cql_query_test.cc: convert some tests to thread
This patches silences the remaining discarded future warnings, those
where it cannot be determined with reasonable confidence that this was
indeed the actual intent of the author, or that the discarding of the
future could lead to problems. For all those places a FIXME is added,
with the intent that these will be soon followed-up with an actual fix.
I deliberately haven't fixed any of these, even if the fix seems
trivial. It is too easy to overlook a bad fix mixed in with so many
mechanical changes.
This patch silences those future discard warnings where it is clear that
discarding the future was actually the intent of the original author,
*and* they did the necessary precautions (handling errors). The patch
also adds some trivial error handling (logging the error) in some
places, which were lacking this, but otherwise look ok. No functional
changes.
Some tests are currently discarding futures unjustifiably, however
adding code to wait on these futures is quite inconvenient due to the
continuation style code of these tests. Convert them to run in a seastar
thread to make the fix easier.
Introduced in c96ee98.
We call update_schema_version() after features are enabled and we
recalculate the schema version. This method is not updating gossip
though. The node will still use it's database::version() to decide on
syncing, so it will not sync and stay inconsistent in gossip until the
next schema change.
We should call updatE_schema_version_and_announce() instead so that
the gossip state is also updated.
There is no actual schema inconsistency, but the joining node will
think there is and will wait indefinitely. Making a random schema
change would unbock it.
Fixes#4647.
Message-Id: <1566825684-18000-1-git-send-email-tgrabiec@scylladb.com>
* seastar afc5bbf511...20bfd61955 (18):
> reactor: closing file used to check if direct_io is supported
> future: set_coroutine(): s/state()/_state/
> tests/perf/perf_test.hh: suppress discarded future warning
> tests: rpc: fix memory leak in timeout wraparound tests
> Revert "future-util: reduce allocations and continuations in parallel_for_each"
> reactor: fix rename_priority_class() build failure in C++14 mode
> future: mark future_state_base::failed() as unlikely
> future-util: reduce allocations and continuations in parallel_for_each
> future-utils: generalize when_all_estimate_vector_capacity()
> output_stream: Add comment on sequentiality
> docs/tutorial.md: minor cleanups in first section
> core: fix a race in execution stages (Fixes#4856, fixes#4766)
> semaphore: use semaphore's clock type in with_semaphore()/get_units()
> future: fix doxygen documentation for promise<>
> sharded: fixed detecting stop method when building with clang
> reactor: fixed locking error in rename_priority_class
> Assert that append_challenged_posix_file_impl are closed.
> rpc: correctly handle huge timeouts
Merged patch series from Amnon Heiman amnon@scylladb.com
This Patch adds an implementation of the get built index API and remove a
FIXME.
The API returns a list of secondary indexes belongs to a column family
and have already been fully built.
Example:
CREATE KEYSPACE scylla_demo WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'};
CREATE TABLE scylla_demo.mytableID ( uid uuid, text text, time timeuuid, PRIMARY KEY (uid, time) );
CREATE index on scylla_demo.mytableID (time);
$ curl -X GET 'http://localhost:10000/column_family/built_indexes/scylla_demo%3Amytableid'
["mytableid_time_idx"]
The sum_ratio struct is a helper struct that is used when calculating
ratio over multiple shards.
Originally it was created thinking that it may need to use future, in
practice it was never used and the future was ignore.
This patch remove the future from the implementation and reduce an
unhandle future warning from the compilation.
Signed-off-by: Amnon Heiman <amnon@scylladb.com>
This Patch adds an implementation of the get build index API and remove a
FIXME.
The API returns the list of the built secondary indexes belongs to a column family.
Example:
CREATE KEYSPACE scylla_demo WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'};
CREATE TABLE scylla_demo.mytableID ( uid uuid, text text, time timeuuid, PRIMARY KEY (uid, time) );
CREATE index on scylla_demo.mytableID (time);
$ curl -X GET 'http://localhost:10000/column_family/built_indexes/scylla_demo%3Amytableid'
["mytableid_time_idx"]
Signed-off-by: Amnon Heiman <amnon@scylladb.com>
When a role is created through the `create role` statement, the
'is_superuser' and 'can_login' columns are set to false by default.
Likewise, `list roles`, `alter roles` and `* roles` operations
expect to find a boolean when reading the same columns.
This is not the case, though, when a user directly inserts to
`system_auth.roles` and doesn't set those columns. Even though
manually creating roles is not a desired day-to-day operation,
it is an insert just like any other and it should work.
`* roles` operations, on the other hand, are not prepared for
this deviations. If a user manually creates a role and doesn't
set boolean values to those columns, `* roles` will return all
sorts of errors. This happens because `* roles` is explicitly
expecting a boolean and casting for it.
This patch makes `* roles` more friendly by considering the
boolean variable `false` - inside `* roles` context - if the
actual value is `null`; it won't change the `null` value.
Fixes#4280
Signed-off-by: Juliana Oliveira <juliana@scylladb.com>
Message-Id: <20190816032617.61680-1-juliana@scylladb.com>
The scylla_blocktune.py has a FIXME for btrfs from 2016, which is no
longer relevant for Scylla deployments, as Red Hat dropped support for
the file system in 2017.
Message-Id: <20190823114013.31112-1-penberg@scylladb.com>
The priority class the shard reader was created with was hardcoded to be
`service::get_local_sstable_query_read_priority()`. At the time this
code was written, priority classes could not be passed to other shards,
so this method, receiving its priority class parameters from another
shard, could not use it. This is now fixed, so we can just use whatever
the caller wants us to use.
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20190823115111.68711-1-bdenes@scylladb.com>
Cartesian products (generated by IN restrictions) can grow very large,
even for short queries. This can overwhelm server resources.
Add limit checking for cartesian products, and configuration items for
users that are not satisfied with the default of 100 records fetched.
Fixes#4752.
Tests: unit (dev), manual test with SIGHUP.
Cartesian products (via IN restrictions) make it easy to generate huge
primary key sets with simple queries, overflowing server resources. Limit
them in the coordinator and report an exception instead of trying to
execute a query that would consume all of our memory.
A unit test is added.
We need a way to configure the cql interpreter and runtime. So far we relied
on accessing the configuration class via various backdoors, but that causes
its own problems around initialization order and testability. To avoid that,
this patch adds an empty cql_config class and propagates it from main.cc
(and from tests) to the cql interpreter via the query_options class, which is
already passed everywhere.
Later patches will fill it with contents.
This was broken since the type refactoring. It was checking the static
type, which is always abstract_type. Unfortunately we only had dtests
for this.
This can probably be optimized to avoid the double switch over kind,
but it is probably better to do the simple fix first.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20190821155354.47704-1-espindola@scylladb.com>
Currently we create a regex from the LIKE pattern for every row
considered during filtering, even though the pattern is always the
same. This is wasteful, especially since we require costly
optimization in the regex compiler. Fix it by reusing the regex
whenever the pattern is unchanged since the last call.
Tests: unit (dev)
Signed-off-by: Dejan Mircevski <dejan@scylladb.com>
The loop over view update handlers used a guard in order to ensure
that the object is not prematurely destroyed (thus invalidating
the iterator), but the guard itself was not in the right scope.
Fixed by replacinga 'for' loop with a 'while' loop, which moves
the iterator incrementation inside the scope in which it's still
guarded and valid.
Fixes#4866
Currently, seastar is built in seastar/build/{mode}. This means we have two build
directories: build/{mode} and seastar/build/{mode}.
This patch changes that to have only a single build directory (build/{mode}). It
does that by calling Seastar's cmake directly instead of through Seastar's
./configure.py. However, to support dpdk, if that is enabled it calls cmake
through Seastar's ./cooking.sh (similar to what Seastar's ./configure.py does).
All ./configure.py flags are translated to cmake variables, in the same way that
Seastar does.
Contains fix from Rafael to pass the flags for the correct mode.
This clarifies that "rows" are clustering rows and that there is no
information about individual collection elements.
The patch also documents some properties common to all these tables.
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20190820171204.48739-1-espindola@scylladb.com>
The endpoint directories scanned by space_watchdog may get deleted
by the manager::drain_for().
If a deleted directory is given to a lister::scan_dir() this will end up
in an exception and as a result a space_watchdog will skip this round
and hinted handoff is going to be disabled (for all agents including MVs)
for the whole space_watchdog round.
Let's make sure this doesn't happen by serializing the scanning and deletion
using end_point_hints_manager::file_update_mutex.
Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
end_point_hints_manager::file_update_mutex is taken by space_watchdog
but while space_watchdog is waiting for it the corresponding
end_point_hints_manager instance may get destroyed by manager::drain_for()
or by manager::stop().
This will end up in a use-after-free event.
Let's change the end_point_hints_manager's API in a way that would prevent
such an unsafe locking:
- Introduce the with_file_update_mutex().
- Make end_point_hints_manager::file_update_mutex() method private.
Fixes#4685Fixes#4836
Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
If drain_for() is running together with itself: one instance for the local
node and one for some other node, erasing of elements from the _ep_managers
map may lead to a use-after-free event.
Let's serialize drain_for() calls with a semaphore.
Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
Our current relocation works by invoking the dynamic linker with the
executable as an argument. This confuses gdb since the kernel records
the dynamic linker as the executable, not the real executable.
Switch to install-time relocation with patchelf: when installing the
executable and libraries, all paths are known, and we can update the
path to the dynamic loader and to the dynamic libraries.
Since patchelf itself is dynamically linked, we have to relocate it
dynamically (with the old method of invoking it via the dynamic linker).
This is okay since it's a one-time operation and since we don't expect
to debug core dumps of patchelf crashes.
We lose the ability to run scylla directly from the uninstalled
tarball, but since the nonroot installer is already moving in the
direction of requiring install.sh, that is not a great loss, and
certainly the ability to debug is more important.
dh_strip barfs on some binaries which were treated with patchelf,
so exclude them from dh_strip. This doesn't lose any functionality,
since these binaries didn't have debug information to begin with
(they are already-stripped Fedora executables).
Fixes#4673.
Non-full prefix keys are currently not handled correctly as all keys
are treated as if they were full prefixes, and therefore they represent
a point in the key space. Non-full prefixes however represent a
sub-range of the key space and therefore require null extending before
they can be treated as a point.
As a quick reminder, `key` is used to trim the clustering ranges such
that they only cover positions >= then key. Thus,
`trim_clustering_row_ranges_to()` does the equivalent of intersecting
each range with (key, inf). When `key` is a prefix, this would exclude
all positions that are prefixed by key as well, which is not desired.
Fixes: #4839
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20190819134950.33406-1-bdenes@scylladb.com>
"
Follow-up to #4610, where a review comment asked for test coverage on all types. Existing tests cover all the types admissible in LIKE, while this PR adds coverage for all inadmissible types.
Tests: unit (dev)
"
* 'like-nonstring' of https://github.com/dekimir/scylla:
cql_query_test: Add LIKE tests for all types
cql_query_test: Remove LIKE-nonstring-pattern case
cql_query_test: Move a testcase elsewhere in file
In b197924, we changed the shutdown process not to rely on the global
reactor-defined exit, but instead added a local variable to hold the
shutdown state. However, we did not propagate that state everywhere,
and now streaming processes are not able to abort.
Fix that by enhancing stop_signal with a sharded<abort_source> member
that can be propagated to services. Propagate it to storage_service
and thence to boot_strapper and range_streamer so that streaming
processes can be aborted.
Fixes#4674Fixes#4501
Tests: unit (dev), manual bootstrap test
"
Streamed view updates parasitized on writing io priority, which is
reserved for user writes - it's now properly bound to streaming
write priority.
Verified manually by checking appropriate io metrics: scylla_io_queue_total_bytes{class="streaming_write" ...} vs scylla_io_queue_total_bytes{class="query" ...}
Tests: unit(dev)
"
* 'assign_proper_io_priority_to_streaming_view_updates' of https://github.com/psarna/scylla:
db,view: wrap view update generation in stream scheduling group
database: assign proper io priority for streaming view updates
Our current relocation works by invoking the dynamic linker with the
executable as an argument. This confuses gdb since the kernel records
the dynamic linker as the executable, not the real executable.
Switch to install-time relocation with patchelf: when installing the
executable and libraries, all paths are known, and we can update the
path to the dynamic loader and to the dynamic libraries.
Since patchelf itself is dynamically linked, we have to relocate it
dynamically (with the old method of invoking it via the dynamic linker).
This is okay since it's a one-time operation and since we don't expect
to debug core dumps of patchelf crashes.
We lose the ability to run scylla directly from the uninstalled
tarball, but since the nonroot installer is already moving in the
direction of requiring install.sh, that is not a great loss, and
certainly the ability to debug is more important.
dh_strip barfs on some binaries which were treated with patchelf,
so exclude them from dh_strip. This doesn't lose any functionality,
since these binaries didn't have debug information to begin with
(they are already-stripped Fedora executables).
Fixes#4673.
It is well known that seastar applications, like Scylla, do not play
well with external processes: CPU usage from external processes may
confuse the I/O and CPU schedulers and create stalls.
We have also recently seen that memory usage from other application's
anonymous and page cache memory can bring the system to OOM.
Linux has a very good infrastructure for resource control contributed by
amazingly bright engineers in the form of cgroup controllers. This
infrastructure is exposed by SystemD in the form of slices: a
hierarchical structure to which controllers can be attached.
In true systemd way, the hierarchy is implicit in the filenames of the
slice files. a "-" symbol defines the hierarchy, so the files that this
patch presents, scylla-server and scylla-helper, essentially create a
"scylla" cgroup at the top level with "server" and "helper" children.
Later we mark the Services needed to run scylla as belonging to one
or the other through the Slice= directive.
Scylla DBAs can benefit from this setup by using the systemd-run
utility to fire ad-hoc commands.
Let's say for example that someone wants to hypothetically run a backup
and transfer files to an external object store like S3, making sure that
the amount of page cache used won't create swap pressure leading to
database timeouts.
One can then run something like:
```
sudo systemd-run --uid=`id -u scylla` --gid=`id -g scylla` -t --slice=scylla-helper.slice /path/to/my/magical_backup_tool
```
(or even better, the backup tool can itself be a systemd timer)
Changes from last version:
- No longer use the CPUQuota
- Minor typo fixes
- postinstall fixup for small machines
Benchmark results:
==================
Test: read from disk, with 100% disk util using a single i3.xlarge (4 vCPUs).
We have to fill the cache as we read, so this should stress CPU, memory and
disk I/O.
cassandra-stress command:
```
cassandra-stress read no-warmup duration=5m -rate threads=20 -node 10.2.209.188 -pop dist=uniform\(1..150000000\)
```
Baseline results:
```
Results:
Op rate : 13,830 op/s [READ: 13,830 op/s]
Partition rate : 13,830 pk/s [READ: 13,830 pk/s]
Row rate : 13,830 row/s [READ: 13,830 row/s]
Latency mean : 1.4 ms [READ: 1.4 ms]
Latency median : 1.4 ms [READ: 1.4 ms]
Latency 95th percentile : 2.4 ms [READ: 2.4 ms]
Latency 99th percentile : 2.8 ms [READ: 2.8 ms]
Latency 99.9th percentile : 3.4 ms [READ: 3.4 ms]
Latency max : 12.0 ms [READ: 12.0 ms]
Total partitions : 4,149,130 [READ: 4,149,130]
Total errors : 0 [READ: 0]
Total GC count : 0
Total GC memory : 0.000 KiB
Total GC time : 0.0 seconds
Avg GC time : NaN ms
StdDev GC time : 0.0 ms
Total operation time : 00:05:00
```
Question 1:
===========
Does putting scylla in a special slice affect its performance ?
Results with Scylla running in a slice:
```
Results:
Op rate : 13,811 op/s [READ: 13,811 op/s]
Partition rate : 13,811 pk/s [READ: 13,811 pk/s]
Row rate : 13,811 row/s [READ: 13,811 row/s]
Latency mean : 1.4 ms [READ: 1.4 ms]
Latency median : 1.4 ms [READ: 1.4 ms]
Latency 95th percentile : 2.2 ms [READ: 2.2 ms]
Latency 99th percentile : 2.6 ms [READ: 2.6 ms]
Latency 99.9th percentile : 3.3 ms [READ: 3.3 ms]
Latency max : 23.2 ms [READ: 23.2 ms]
Total partitions : 4,151,409 [READ: 4,151,409]
Total errors : 0 [READ: 0]
Total GC count : 0
Total GC memory : 0.000 KiB
Total GC time : 0.0 seconds
Avg GC time : NaN ms
StdDev GC time : 0.0 ms
Total operation time : 00:05:00
```
*Conclusion* : No significant change
Question 2:
===========
What happens when there is a CPU hog running in the same server as scylla?
CPU hog:
```
taskset -c 0 /bin/sh -c "while true; do true; done" &
taskset -c 1 /bin/sh -c "while true; do true; done" &
taskset -c 2 /bin/sh -c "while true; do true; done" &
taskset -c 3 /bin/sh -c "while true; do true; done" &
sleep 330
```
Scenario 1: CPU hog runs freely:
```
Results:
Op rate : 2,939 op/s [READ: 2,939 op/s]
Partition rate : 2,939 pk/s [READ: 2,939 pk/s]
Row rate : 2,939 row/s [READ: 2,939 row/s]
Latency mean : 6.8 ms [READ: 6.8 ms]
Latency median : 5.3 ms [READ: 5.3 ms]
Latency 95th percentile : 11.0 ms [READ: 11.0 ms]
Latency 99th percentile : 14.9 ms [READ: 14.9 ms]
Latency 99.9th percentile : 17.1 ms [READ: 17.1 ms]
Latency max : 26.3 ms [READ: 26.3 ms]
Total partitions : 884,460 [READ: 884,460]
Total errors : 0 [READ: 0]
Total GC count : 0
Total GC memory : 0.000 KiB
Total GC time : 0.0 seconds
Avg GC time : NaN ms
StdDev GC time : 0.0 ms
Total operation time : 00:05:00
```
Scenario 2: CPU hog runs inside scylla-helper slice
```
Results:
Op rate : 13,527 op/s [READ: 13,527 op/s]
Partition rate : 13,527 pk/s [READ: 13,527 pk/s]
Row rate : 13,527 row/s [READ: 13,527 row/s]
Latency mean : 1.5 ms [READ: 1.5 ms]
Latency median : 1.4 ms [READ: 1.4 ms]
Latency 95th percentile : 2.4 ms [READ: 2.4 ms]
Latency 99th percentile : 2.9 ms [READ: 2.9 ms]
Latency 99.9th percentile : 3.8 ms [READ: 3.8 ms]
Latency max : 18.7 ms [READ: 18.7 ms]
Total partitions : 4,069,934 [READ: 4,069,934]
Total errors : 0 [READ: 0]
Total GC count : 0
Total GC memory : 0.000 KiB
Total GC time : 0.0 seconds
Avg GC time : NaN ms
StdDev GC time : 0.0 ms
Total operation time : 00:05:00
```
*Conclusion*: With systemd slice we can keep the performance very close to
baseline
Question 3:
===========
What happens when there is a CPU hog running in the same server as scylla?
I/O hog: (Data in the cluster is 2x size of memory)
```
while true; do
find /var/lib/scylla/data -type f -exec grep glauber {} +
done
```
Scenario 1: I/O hog runs freely:
```
Results:
Op rate : 7,680 op/s [READ: 7,680 op/s]
Partition rate : 7,680 pk/s [READ: 7,680 pk/s]
Row rate : 7,680 row/s [READ: 7,680 row/s]
Latency mean : 2.6 ms [READ: 2.6 ms]
Latency median : 1.3 ms [READ: 1.3 ms]
Latency 95th percentile : 7.8 ms [READ: 7.8 ms]
Latency 99th percentile : 10.9 ms [READ: 10.9 ms]
Latency 99.9th percentile : 16.9 ms [READ: 16.9 ms]
Latency max : 40.8 ms [READ: 40.8 ms]
Total partitions : 2,306,723 [READ: 2,306,723]
Total errors : 0 [READ: 0]
Total GC count : 0
Total GC memory : 0.000 KiB
Total GC time : 0.0 seconds
Avg GC time : NaN ms
StdDev GC time : 0.0 ms
Total operation time : 00:05:00
```
Scenario 2: I/O hog runs in the scylla-helper systemd slice:
```
Results:
Op rate : 13,277 op/s [READ: 13,277 op/s]
Partition rate : 13,277 pk/s [READ: 13,277 pk/s]
Row rate : 13,277 row/s [READ: 13,277 row/s]
Latency mean : 1.5 ms [READ: 1.5 ms]
Latency median : 1.4 ms [READ: 1.4 ms]
Latency 95th percentile : 2.4 ms [READ: 2.4 ms]
Latency 99th percentile : 2.9 ms [READ: 2.9 ms]
Latency 99.9th percentile : 3.5 ms [READ: 3.5 ms]
Latency max : 183.4 ms [READ: 183.4 ms]
Total partitions : 3,984,080 [READ: 3,984,080]
Total errors : 0 [READ: 0]
Total GC count : 0
Total GC memory : 0.000 KiB
Total GC time : 0.0 seconds
Avg GC time : NaN ms
StdDev GC time : 0.0 ms
Total operation time : 00:05:00
```
*Conclusion*: With systemd slice we can keep the performance very close to
baseline
Signed-off-by: Glauber Costa <glauber@scylladb.com>
Propagate the abort_source from main() into boot_strapper and range_stream and
check for aborts at strategic points. This includes aborting running stream_plans
and aborting sleeps between retries.
Fixes#4674
In order to propagate stop signals, expose them as sharded<abort_source>. This
allows propagating the signal to all shards, and integrating it with
sleep_abortable().
Because sharded<abort_source>::stop() will block, we'll now require stop_signal
to run in a thread (which is already the case).
This testcase was previously commented out, pending a fix that cannot
be made. Currently it is impossible to validate the marker-value type
at filtering time. The value is entered into the options object under
its presumed type of string, regardless of what it was made from.
Signed-off-by: Dejan Mircevski <dejan@scylladb.com>
Somehow this test case sits in the middle of LIKE-operator tests:
test_alter_type_on_compact_storage_with_no_regular_columns_does_not_crash
Move it so LIKE test cases are contiguous.
Signed-off-by: Dejan Mircevski <dejan@scylladb.com>
There are systemd-related steps done in both rpm and deb builds.
Move that to a script so we avoid duplication.
The tests are so far a bit specific to the distributions, so it
needs to be adapted a bit.
Also note that this also fixes a bug with rpm as a side-effect:
rpm does not call daemon-reload after potentially changing the
systemd files (it is only implied during postun operations, that
happen during uninstall). daemon-reload was called explicitly for
debian packages, and now it is called for both.
Signed-off-by: Glauber Costa <glauber@scylladb.com>
This patch fixes a bug that a map is held on the stack and then is used
by a future.
Instead, the map is now wrapped with do_with.
Fixes#4824
Signed-off-by: Amnon Heiman <amnon@scylladb.com>
2019-08-12 14:04:00 +03:00
5654 changed files with 109309 additions and 40201 deletions
- `--{enable,disable}-dpdk`: [DPDK](http://dpdk.org/) is a set of libraries and drivers for fast packet processing. During development, it's not necessary to enable support even if it is supported by your platform.
- `--enable-dpdk`: [DPDK](http://dpdk.org/) is a set of libraries and drivers for fast packet processing. During development, it's not necessary to enable support even if it is supported by your platform.
Source files and build targets are tracked manually in `configure.py`, so the script needs to be updated when new files or targets are added or removed.
@@ -141,7 +141,7 @@ In v3:
"Tests: unit ({mode}), dtest ({smp})"
```
The usual is "Tests: unit (release)", although running debug tests is encouraged.
The usual is "Tests: unit (dev)", although running debug tests is encouraged.
5. When answering review comments, prefer inline quotes as they make it easier to track the conversation across multiple e-mails.
seastar::metrics::description("number of operations via Alternator API"),{op(CamelCaseName)}),
#define OPERATION_LATENCY(name, CamelCaseName) \
seastar::metrics::make_histogram("op_latency", \
seastar::metrics::description("Latency histogram of an operation via Alternator API"),{op(CamelCaseName)},[this]{returnapi_operations.name.get_histogram(1,20);}),
"summary":"Return the generation value for this node.",
"type":"int",
"type":"long",
"nickname":"get_current_generation_number",
"produces":[
"application/json"
@@ -582,7 +582,15 @@
},
{
"name":"kn",
"description":"Comma seperated keyspaces name to snapshot",
"description":"Comma seperated keyspaces name that their snapshot will be deleted",
"required":false,
"allowMultiple":false,
"type":"string",
"paramType":"query"
},
{
"name":"cf",
"description":"an optional table name that its snapshot will be deleted",
"required":false,
"allowMultiple":false,
"type":"string",
@@ -646,7 +654,7 @@
{
"method":"POST",
"summary":"Trigger a cleanup of keys on a single keyspace",
"type":"int",
"type":"long",
"nickname":"force_keyspace_cleanup",
"produces":[
"application/json"
@@ -678,7 +686,7 @@
{
"method":"GET",
"summary":"Scrub (deserialize + reserialize at the latest version, skipping bad rows if any) the given keyspace. If columnFamilies array is empty, all CFs are scrubbed. Scrubbed CFs will be snapshotted first, if disableSnapshot is false",
"type":"int",
"type":"long",
"nickname":"scrub",
"produces":[
"application/json"
@@ -726,7 +734,7 @@
{
"method":"GET",
"summary":"Rewrite all sstables to the latest version. Unlike scrub, it doesn't skip bad rows and do not snapshot sstables first.",
"type":"int",
"type":"long",
"nickname":"upgrade_sstables",
"produces":[
"application/json"
@@ -800,7 +808,7 @@
"summary":"Return an array with the ids of the currently active repairs",
"type":"array",
"items":{
"type":"int"
"type":"long"
},
"nickname":"get_active_repair_async",
"produces":[
@@ -816,7 +824,7 @@
{
"method":"POST",
"summary":"Invoke repair asynchronously. You can track repair progress by using the get supplying id",
"type":"int",
"type":"long",
"nickname":"repair_async",
"produces":[
"application/json"
@@ -947,7 +955,7 @@
"description":"The repair ID to check for status",
Some files were not shown because too many files have changed in this diff
Show More
Reference in New Issue
Block a user
Blocking a user prevents them from interacting with repositories, such as opening or commenting on pull requests or issues. Learn more about blocking a user.