Compare commits

...

171 Commits

Author SHA1 Message Date
Hagit Segev
b0f656302c release: prepare for 4.1.11 2021-01-05 10:13:34 +02:00
Benny Halevy
e05e7b2a98 compaction: compaction_writer: destroy shared_sstable after the sstable_writer
sstable_writer may depend on the sstable throughout its whole lifecycle.
If the sstable is freed before the sstable_writer we might hit use-after-free
as in the follwing case:
```
std::_Deque_iterator<sstables::compression::segmented_offsets::bucket, sstables::compression::segmented_offsets::bucket&, sstables::compression::segmented_offsets::bucket*>::operator+=(long) at /usr/include/c++/10/bits/stl_deque.h:240
 (inlined by) std::operator+(std::_Deque_iterator<sstables::compression::segmented_offsets::bucket, sstables::compression::segmented_offsets::bucket&, sstables::compression::segmented_offsets::bucket*> const&, long) at /usr/include/c++/10/bits/stl_deque.h:378
 (inlined by) std::_Deque_iterator<sstables::compression::segmented_offsets::bucket, sstables::compression::segmented_offsets::bucket&, sstables::compression::segmented_offsets::bucket*>::operator[](long) const at /usr/include/c++/10/bits/stl_deque.h:252
 (inlined by) std::deque<sstables::compression::segmented_offsets::bucket, std::allocator<sstables::compression::segmented_offsets::bucket> >::operator[](unsigned long) at /usr/include/c++/10/bits/stl_deque.h:1327
 (inlined by) sstables::compression::segmented_offsets::push_back(unsigned long, sstables::compression::segmented_offsets::state&) at ./sstables/compress.cc:214
sstables::compression::segmented_offsets::writer::push_back(unsigned long) at ./sstables/compress.hh:123
 (inlined by) compressed_file_data_sink_impl<crc32_utils, (compressed_checksum_mode)1>::put(seastar::temporary_buffer<char>) at ./sstables/compress.cc:519
seastar::output_stream<char>::put(seastar::temporary_buffer<char>) at table.cc:?
 (inlined by) seastar::output_stream<char>::put(seastar::temporary_buffer<char>) at ././seastar/include/seastar/core/iostream-impl.hh:432
seastar::output_stream<char>::flush() at table.cc:?
seastar::output_stream<char>::close() at table.cc:?
sstables::file_writer::close() at sstables.cc:?
sstables::mc::writer::~writer() at writer.cc:?
 (inlined by) sstables::mc::writer::~writer() at ./sstables/mx/writer.cc:790
sstables::mc::writer::~writer() at writer.cc:?
flat_mutation_reader::impl::consumer_adapter<stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> > >::~consumer_adapter() at compaction.cc:?
 (inlined by) std::_Optional_payload_base<sstables::compaction_writer>::_M_destroy() at /usr/include/c++/10/optional:260
 (inlined by) std::_Optional_payload_base<sstables::compaction_writer>::_M_reset() at /usr/include/c++/10/optional:280
 (inlined by) std::_Optional_payload<sstables::compaction_writer, false, false, false>::~_Optional_payload() at /usr/include/c++/10/optional:401
 (inlined by) std::_Optional_base<sstables::compaction_writer, false, false>::~_Optional_base() at /usr/include/c++/10/optional:474
 (inlined by) std::optional<sstables::compaction_writer>::~optional() at /usr/include/c++/10/optional:659
 (inlined by) sstables::compacting_sstable_writer::~compacting_sstable_writer() at ./sstables/compaction.cc:229
 (inlined by) compact_mutation<(emit_only_live_rows)0, (compact_for_sstables)1, sstables::compacting_sstable_writer, noop_compacted_fragments_consumer>::~compact_mutation() at ././mutation_compactor.hh:468
 (inlined by) compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer>::~compact_for_compaction() at ././mutation_compactor.hh:538
 (inlined by) std::default_delete<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> >::operator()(compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer>*) const at /usr/include/c++/10/bits/unique_ptr.h:85
 (inlined by) std::unique_ptr<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer>, std::default_delete<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> > >::~unique_ptr() at /usr/include/c++/10/bits/unique_ptr.h:361
 (inlined by) stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> >::~stable_flattened_mutations_consumer() at ././mutation_reader.hh:342
 (inlined by) flat_mutation_reader::impl::consumer_adapter<stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> > >::~consumer_adapter() at ././flat_mutation_reader.hh:201
auto flat_mutation_reader::impl::consume_in_thread<stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> >, flat_mutation_reader::no_filter>(stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> >, flat_mutation_reader::no_filter, std::chrono::time_point<seastar::lowres_clock, std::chrono::duration<long, std::ratio<1l, 1000l> > >) at ././flat_mutation_reader.hh:272
 (inlined by) auto flat_mutation_reader::consume_in_thread<stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> >, flat_mutation_reader::no_filter>(stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> >, flat_mutation_reader::no_filter, std::chrono::time_point<seastar::lowres_clock, std::chrono::duration<long, std::ratio<1l, 1000l> > >) at ././flat_mutation_reader.hh:383
 (inlined by) auto flat_mutation_reader::consume_in_thread<stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> > >(stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> >, std::chrono::time_point<seastar::lowres_clock, std::chrono::duration<long, std::ratio<1l, 1000l> > >) at ././flat_mutation_reader.hh:389
 (inlined by) seastar::future<void> sstables::compaction::setup<noop_compacted_fragments_consumer>(noop_compacted_fragments_consumer)::{lambda(flat_mutation_reader)#1}::operator()(flat_mutation_reader)::{lambda()#1}::operator()() at ./sstables/compaction.cc:612
```

What happens here is that:

    compressed_file_data_sink_impl(output_stream<char> out, sstables::compression* cm, sstables::local_compression lc)
            : _out(std::move(out))
            , _compression_metadata(cm)
            , _offsets(_compression_metadata->offsets.get_writer())
            , _compression(lc)
            , _full_checksum(ChecksumType::init_checksum())

_compression_metadata points to a buffer held by the sstable object.
and _compression_metadata->offsets.get_writer returns a writer that keeps
a reference to the segmented_offsets in the sstables::compression
that is used in the ~writer -> close path.

Fixes #7821

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20201227145726.33319-1-bhalevy@scylladb.com>
(cherry picked from commit 8a745a0ee0)
2021-01-04 15:12:33 +02:00
Avi Kivity
ae0f3ef543 Revert "Merge 'Move temporaries to value view' from Piotr S"
This reverts commit b34a1d9576. It causes
regressions in processing of bind variables.

Fixes #7761.
2020-12-24 12:42:42 +02:00
Gleb Natapov
2a6a072857 mutation_writer: pass exceptions through feed_writer
feed_writer() eats exception and transforms it into an end of stream
instead. Downstream validators hate when this happens.

Fixes #7482
Message-Id: <20201216090038.GB3244976@scylladb.com>

(cherry picked from commit 61520a33d6)
2020-12-16 17:20:32 +02:00
Aleksandr Bykov
da1a5b6542 dist: scylla_util: fix aws_instance.ebs_disks method
aws_instance.ebs_disks() method should return ebs disk
instead of ephemeral

Signed-off-by: Aleksandr Bykov <alex.bykov@scylladb.com>

Closes #7780

(cherry picked from commit e74dc311e7)
2020-12-16 11:59:12 +02:00
Avi Kivity
b85aa0e8a6 Update seastar submodule
* seastar 9d8d82a095...6fb1399ba1 (1):
  > sharded: Do not hang on never set freed promise

Fixes #6606.
2020-12-15 16:52:38 +02:00
Calle Wilund
8ccdd5c50f token_metadata: Prune empty racks on endpoint change
Fixes #6459

When moving or removing endpoints, we should ensure
that the set of available racks reflect the nodes
known, i.e. match what would be the result of a
reboot + create sets initially.
Message-Id: <20200519153300.15391-1-calle@scylladb.com>

(cherry picked from commit 7ce4a8b458)
2020-12-15 16:31:46 +02:00
Takuya ASADA
f7ffea4638 node_exporter_install: stop service before force installing
Stop node-exporter.service before re-install it, to avoid 'Text file busy' error.

Fixes #6782

(cherry picked from commit ef05ea8e91)
2020-12-15 16:28:36 +02:00
Avi Kivity
fb40e375bf dist: rpm: uninstall tuned when installing scylla-kernel-conf
tuned 2.11.0-9 and later writes to kerned.sched_wakeup_granularity_ns
and other sysctl tunables that we so laboriously tuned, dropping
performance by a factor of 5 (due to increased latency). Fix by
obsoleting tuned during install (in effect, we are a better tuned,
at least for us).

Not needed for .deb, since debian/ubunto do not install tuned by
default.

Fixes #7696

Closes #7776

(cherry picked from commit 615b8e8184)
2020-12-12 14:32:59 +02:00
Eliran Sinvani
9ea2a61d63 consistency level: fix wrong quorum calculation whe RF = 0
We used to calculate the number of endpoints for quorum and local_quorum
unconditionally as ((rf / 2) + 1). This formula doesn't take into
account the corner case where RF = 0, in this situation quorum should
also be 0.
This commit adds the missing corner case.

Tests: Unit Tests (dev)
Fixes #6905

Closes #7296

(cherry picked from commit 925cdc9ae1)
2020-11-29 16:45:26 +02:00
Avi Kivity
6898fcd40f Update seastar submodule for precalculated TLS DH parameters
* seastar d4df4fa6de...9d8d82a095 (1):
  > TLS: Use "known" (precalculated) DH parameters if available

Fixes #6191.
2020-11-29 14:36:40 +02:00
Asias He
4df08e331b repair: Make repair_writer a shared pointer
The future of the fiber that writes data into sstables inside
the repair_writer is stored in _writer_done like below:

class repair_writer {
   _writer_done[node_idx] =
      mutation_writer::distribute_reader_and_consume_on_shards().then([this] {
         ...
      }).handle_exception([this] {
         ...
      });
}

The fiber access repair_writer object in the error handling path. We
wait for the _writer_done to finish before we destroy repair_meta
object which contains the repair_writer object to avoid the fiber
accessing already freed repair_writer object.

To be safer, we can make repair_writer a shared pointer and take a
reference in the distribute_reader_and_consume_on_shards code path.

Fixes #7406

Closes #7430

(cherry picked from commit 289a08072a)
2020-11-29 13:30:06 +02:00
Pavel Emelyanov
7b1fb86a28 query_pager: Fix continuation handling for noop visitor
Before updating the _last_[cp]key (for subsequent .fetch_page())
the pager checks is 'if the pager is not exhausted OR the result
has data'.

The check seems broken: if the pager is not exhausted, but the
result is empty the call for keys will unconditionally try to
reference the last element from empty vector. The not exhausted
condition for empty result can happen if the short_read is set,
which, in turn, unconditionally happens upon meeting partition
end when visiting the partition with result builder.

The correct check should be 'if the pager is not exhausted AND
the result has data': the _last_[pc]key-s should be taken for
continuation (not exhausted), but can be taken if the result is
not empty (has data).

fixes: #7263
tests: unit(dev), but tests don't trigger this corner case

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20200921124329.21209-1-xemul@scylladb.com>
(cherry picked from commit 550fc734d9)
2020-11-29 12:01:43 +02:00
Takuya ASADA
f7be22ccb2 install.sh: set PATH for relocatable CLI tools in python thunk
We currently set PATH for relocatable CLI tools in scylla_util.run() and
scylla_util.out(), but it doesn't work for perftune.py, since it's not part of
Scylla, does not use scylla_util module.
We can set PATH in python thunk instead, it can set PATH for all python scripts.

Fixes #7350

(cherry picked from commit 5867af4edd)
2020-11-29 11:54:53 +02:00
Bentsi Magidovich
26b5a34f96 scylla_util.py: fix exception handling in curl
Retry mechanism didn't work when URLError happend. For example:

  urllib.error.URLError: <urlopen error [Errno 101] Network is unreachable>

Let's catch URLError instead of HTTP since URLError is a base exception
for all exceptions in the urllib module.

Fixes: #7569

Closes #7567

(cherry picked from commit 956b97b2a8)
2020-11-29 11:48:42 +02:00
Takuya ASADA
10a65ba2fb dist/redhat: packaging dependencies.conf as normal file, not ghost
When we introduced dependencies.conf, we mistakenly added it on rpm as %ghost,
but it should be normal file, should be installed normally on package installation.

Fixes #7703

Closes #7704

(cherry picked from commit ba4d54efa3)
2020-11-29 11:40:27 +02:00
Takuya ASADA
be60e3ca52 install.sh: apply sysctl.d files on non-packaging installation
We don't apply sysctl.d files on non-packaging installation, apply them
just like rpm/deb taking care of that.

Fixes #7702

Closes #7705

(cherry picked from commit 5f81f97773)
2020-11-29 11:35:51 +02:00
Avi Kivity
5485c902fe dist: sysctl: configure more inotify instances
Since f3bcd4d205 ("Merge 'Support SSL Certificate Hot
Reloading' from Calle"), we reload certificates as they are
modified on disk. This uses inotify, which is limited by a
sysctl fs.inotify.max_user_instances, with a default of 128.

This is enough for 64 shards only, if both rpc and cql are
encrypted; above that startup fails.

Increase to 1200, which is enough for 6 instances * 200 shards.

Fixes #7700.

Closes #7701

(cherry picked from commit 390e07d591)
2020-11-29 11:04:57 +02:00
Hagit Segev
01c822301f release: prepare for 4.1.10 2020-11-19 18:07:49 +02:00
Raphael S. Carvalho
415b271a39 compaction: Make sure a partition is filtered out only by producer
If interposer consumer is enabled, partition filtering will be done by the
consumer instead, but that's not possible because only the producer is able
to skip to the next partition if the current one is filtered out, so scylla
crashes when that happens with a bad function call in queue_reader.
This is a regression which started here: 55a8b6e3c9

To fix this problem, let's make sure that partition filtering will only
happen on the producer side.

Fixes #7590.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20201111221513.312283-1-raphaelsc@scylladb.com>
(cherry picked from commit 13fa2bec4c)
2020-11-19 14:08:47 +02:00
Piotr Dulikowski
b7274ab44a hints: don't read hint files when it's not allowed to send
When there are hint files to be sent and the target endpoint is DOWN,
end_point_hints_manager works in the following loop:

- It reads the first hint file in the queue,
- For each hint in the file it decides that it won't be sent because the
  target endpoint is DOWN,
- After realizing that there are some unsent hints, it decides to retry
  this operation after sleeping 1 second.

This causes the first segment to be wholly read over and over again,
with 1 second pauses, until the target endpoint becomes UP or leaves the
cluster. This causes unnecessary I/O load in the streaming scheduling
group.

This patch adds a check which prevents end_point_hints_manager from
reading the first hint file at all when it is not allowed to send hints.

First observed in #6964

Tests:
- unit(dev)
- hinted handoff dtests

Closes #7407

(cherry picked from commit 77a0f1a153)
2020-11-16 14:30:26 +02:00
Botond Dénes
b144b93cd8 mutation_reader: queue_reader: don't set EOS flag on abort
If the consumer happens to check the EOS flag before it hits the
exception injected by the abort (by calling fill_buffer()), they can
think the stream ended normally and expect it to be valid. However this
is not guaranteed when the reader is aborted. To avoid consumers falsely
thinking the stream ended normally, don't set the EOS flag on abort at
all.

Additionally make sure the producer is aborted too on abort. In theory
this is not needed as they are the one initiating the abort, but better
to be safe then sorry.

Fixes: #7411
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20201102100732.35132-1-bdenes@scylladb.com>
(cherry picked from commit f5323b29d9)
2020-11-15 11:08:07 +02:00
Botond Dénes
7325996510 types: validate(): linearize values lazily
Instead of eagerly linearizing all values as they are passed to
validate(), defer linearization to those validators that actually need
linearized values. Linearizing large values puts pressure on the memory
allocator with large contiguous allocation requests. This is something
we are trying to actively avoid, especially if it is not really neaded.
Turns out the types, whose validators really want linearized values are
a minority, as most validators just look at the size of the value, and
some like bytes don't need validation at all, while usually having large
values.

This is achieved by templating the validator struct on the view and
using the FragmentedRange concept to treat all passed in views
(`bytes_view` and `fragmented_temporary_buffer_view`) uniformly.
This patch makes no attempt at converting existing validators to work
with fragmented buffers, only trivial cases are converted. The major
offenders still left are ascii/utf8 and collections.

Fixes: #7318

Tests: unit(dev)
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20201007054524.909420-1-bdenes@scylladb.com>
(cherry picked from commit db56ae695c)

[avi: squashed ed6775c585 ("types: adjust
      validation_visitor construction for clang") as gcc 9 in scylla 4.1
      suffers from the same problem as clang 11]
2020-11-11 12:31:36 +02:00
Piotr Sarna
fb14fae79b Merge 'Backport PR #7469 to 4.2' from Eliran Sinvani
This is a backport of PR #7469 that did not apply cleanly to 4.2 with a trivial conflict, another commit that touched one of the files but in a completely different region.

Closes #7480

* github.com:scylladb/scylla:
  materialized views: add a base table reference if missing
  view info: support partial match between base and view for only reading from view.
  view info: guard against null dereference of the base info

(cherry picked from commit c74ba1bc36)
2020-11-09 15:22:11 +02:00
Avi Kivity
bb49a5ac06 Merge 'storage_proxy: add a separate smp_group for hints' from Eliran
Hints writes are handled by storage_proxy in the exact same way
regular writes are, which in turn means that the same smp service
group is used for both. The problem is that it can lead to a priority
inversion where writes of the lower priority  kind occupies a lot of
the semaphores units making the higher priority writes wait for an
empty slot.
This series adds a separate smp group for hints as well as a field
to pass the correct smp group to mutate_locally functions, and
then uses this field to properly classify the writes.

Fixes #7177

* eliransin-hint_priority_inversion:
  Storage proxy: use hints smp group in mutate locally
  Storage proxy: add a dedicated smp group for hints

(cherry picked from commit c075539fea)

[avi: replace std::bind_front() which is not available with this
      compiler with a lambda that does the same]
2020-11-08 20:46:45 +02:00
Pavel Solodovnikov
947d3a13a3 storage_proxy: un-hardcode force sync flag for mutate_locally(mutation) overload
Corresponding overload of `storage_proxy::mutate_locally`
was hardcoded to pass `db::commitlog::force_sync::no` to the
`database::apply`. Unhardcode it and substitute `force_sync::no`
to all existing call sites (as it were before).

`force_sync::yes` will be used later for paxos learn writes
when trying to apply mutations upgraded from an obsolete
schema version (similar to the current case when applying
locally a `frozen_mutation` stored in accepted proposal).

Tests: unit(dev)

Signed-off-by: Pavel Solodovnikov <pa.solodovnikov@scylladb.com>
Message-Id: <20200716124915.464789-1-pa.solodovnikov@scylladb.com>
(cherry picked from commit 5ff5df1afd)

Prerequisite for #7177.
2020-11-08 19:47:11 +02:00
Amnon Heiman
b096d64aa7 scyllatop/livedata.py: Safe iteration over metrics
This patch change the code that iterates over the metrics to use a copy
of the metrics names to make it safe to remove the metrics from the
metrics object.

Fixes #7488

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
(cherry picked from commit 52db99f25f)
2020-11-08 19:16:25 +02:00
Calle Wilund
ce8a0f3886 partition_version: Change range_tombstones() to return chunked_vector
Refs #7364

The number of tombstones can be large. As a stopgap measure to
just returning a source range (with keepalive), we can at least
alleviate the problem by using a chunked vector.

Closes #7433

(cherry picked from commit 4b65d67a1a)
2020-11-08 14:38:45 +02:00
Tomasz Grabiec
41344d8ee6 sstables: ka/la: Fix abort when next_partition() is called with certain reader state
Cleanup compaction is using consume_pausable_in_thread() to skip over
disowned partitions, which uses flat_mutation_reader::next_partition().

The implementation of next_partition() for the sstable reader has a
bug which may cause the following assertion failure:

  scylla: sstables/mp_row_consumer.hh:422: row_consumer::proceed sstables::mp_row_consumer_k_l::flush(): Assertion `!_ready' failed.

This happens when the sstable reader's buffer gets full when we reach
the partition end. The last fragment of the partition won't be pushed
into the buffer but will stay in the _ready variable. When
next_partition() is called in this state, _ready will not be cleared
and the fragment will be carried over to the next partition. This will
cause assertion failure when the reader attempts to emit the first
fragment of the next partition.

The fix is to clear _ready when entering a partition, just like we
clear _range_tombstones there.

Fixes #7553.
Message-Id: <1604534702-12777-1-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit fb9b5cae05)
2020-11-08 14:32:58 +02:00
Avi Kivity
db6303dba0 Merge "Fix TWCS compaction aggressiveness due to data segregation" from Raphael
"
After data segregation feature, anything that cause out-of-order writes,
like read repair, can result in small updates to past time windows.
This causes compaction to be very aggressive because whenever a past time
window is updated like that, that time window is recompacted into a
single SSTable.
Users expect that once a window is closed, it will no longer be written
to, but that has changed since the introduction of the data segregation
future. We didn't anticipate the write amplification issues that the
feature would cause. To fix this problem, let's perform size-tiered
compaction on the windows that are no longer active and were updated
because data was segregated. The current behavior where the last active
window is merged into one file is kept. But thereafter, that same
window will only be compacted using STCS.

Fixes #6928.
"

* 'fix_twcs_agressiveness_after_data_segregation_v2' of github.com:raphaelsc/scylla:
  compaction/twcs: improve further debug messages
  compaction/twcs: Improve debug log which shows all windows
  test: Check that TWCS properly performs size-tiered compaction on past windows
  compaction/twcs: Make task estimation take into account the size-tiered behavior
  compaction/stcs: Export static function that estimates pending tasks
  compaction/stcs: Make get_buckets() static
  compact/twcs: Perform size-tiered compaction on past time windows
  compaction/twcs: Make strategy easier to extend by removing duplicated knowledge
  compaction/twcs: Make newest_bucket() non-static
  compaction/twcs: Move TWCS implementation into source file

(cherry picked from commit 6f986df458)
2020-11-05 20:32:42 +02:00
Glauber Costa
964cbb95a7 twcs: move implementations to its own file
LCS and SCTS already have their own files, reducing the clutter in
compaction_strategy.cc. Do the same for TWCS. I am doing this in
preparation to add more functions.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20200611230906.409023-6-glauber@scylladb.com>
(cherry picked from commit b0a0c207c3)

Prerequisite for #6928.
2020-11-05 20:20:30 +02:00
Avi Kivity
b34a1d9576 Merge 'Move temporaries to value view' from Piotr S
"
Issue https://github.com/scylladb/scylla/issues/7019 describes a problem of an ever-growing map of temporary values stored in query_options. In order to mitigate this kind of problems, the storage for temporary values is moved from an external data structure to the value views itself. This way, the temporary lives only as long as it's accessible and is automatically destroyed once a request finishes. The downside is that each temporary is now allocated separately, while previously they were bundled in a single byte stream.

Tests: unit(dev)
Fixes https://github.com/scylladb/scylla/issues/7019
"

7055297649 ("cql3: remove query_options::linearize and _temporaries")
is reverted from this backport since linearize() is still used in
this branch.

* psarna-move_temporaries_to_value_view:
  cql3: remove query_options::linearize and _temporaries
  cql3: remove make_temporary helper function
  cql3: store temporaries in-place instead of in query_options
  cql3: add temporary_value to value view
  cql3: allow moving data out of raw_value
  cql3: split values.hh into a .cc file

(cherry picked from commit 2b308a973f)
2020-11-05 19:48:01 +02:00
Piotr Sarna
15ef930268 schema_tables: fix fixing old secondary index schemas
Old secondary index schemas did not have their idx_token column
marked as computed, and there already exists code which updates
them. Unfortunately, the fix itself contains an error and doesn't
fire if computed columns are not yet supported by the whole cluster,
which is a very common situation during upgrades.

Fixes #7515

Closes #7516

(cherry picked from commit b66c285f94)
2020-11-05 17:53:28 +02:00
Avi Kivity
fe57128fe0 Merge 'Fix ignoring cells after null in appending hash' from Piotr Sarna
"
This series fixes a bug in `appending_hash<row>` that caused it to ignore any cells after the first NULL. It also adds a cluster feature which starts using the new hashing only after the whole cluster is aware of it. The series comes with tests, which reproduce the issue.

Fixes #4567
Based on #4574
"

* psarna-fix_ignoring_cells_after_null_in_appending_hash:
  test: extend mutation_test for NULL values
  tests/mutation: add reproducer for #4567
  gms: add a cluster feature for fixed hashing
  digest: add null values to row digest
  mutation_partition: fix formatting
  appending_hash<row>: make publicly visible

(cherry picked from commit 0e03c979d2)
2020-11-04 20:45:06 +02:00
Yaron Kaikov
b80dab6d58 release: prepare for 4.1.9 2020-10-26 18:13:22 +02:00
Botond Dénes
04d52631b2 reader_permit: reader_resources: make true RAII class
Currently in all cases we first deduct the to-be-consumed resources,
then construct the `reader_resources` class to protect it (release it on
destruction). This is error prone as it relies on no exception being
thrown while constructing the `reader_resources`. Albeit the
`reader_resources` constructor is `noexcept` right now this might change
in the future and as the call sites relying on this are disconnected
from the declaration, the one modifying them might not notice.
To make this safe going forward, make the `reader_resources` a true RAII
class, consuming the units in its constructor and releasing them in its
destructor.

Refs: #7256

Tests: unit(dev)
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200922150625.1253798-1-bdenes@scylladb.com>
(cherry picked from commit a0107ba1c6)
Message-Id: <20200924081408.236353-1-bdenes@scylladb.com>
2020-10-19 15:04:53 +03:00
Takuya ASADA
dfc9f789cf install.sh: set LC_ALL=en_US.UTF-8 on python3 thunk
scylla-python3 causes segfault when non-default locale specified.
As workaround for this, we need to set LC_ALL=en_US.UTF_8 on python3 thunk.

Fixes #7408

Closes #7414

(cherry picked from commit ff129ee030)
2020-10-18 15:02:46 +03:00
Avi Kivity
c1236c02df Update seastar submodule
* seastar 88b6f0172c...d4df4fa6de (1):
  > append_challenged_posix_file_impl: allow destructing file with no queued work

Fixes #7285.
2020-10-12 15:13:17 +03:00
Gleb Natapov
0eb2f5c378 lwt: do not return unavailable exception from the 'learn' stage
Unavailable exception means that operation was not started and it can be
retried safely. If lwt fails in the learn stage though it most
certainly means that its effect will be observable already. The patch
returns timeout exception instead which means uncertainty.

Fixes #7258

Message-Id: <20201001130724.GA2283830@scylladb.com>
(cherry picked from commit 3e8dbb3c09)
2020-10-07 11:00:08 +02:00
Avi Kivity
0cc6d41ee6 Merge "materialized views: Fix undefined behavior on base table schema changes" from Tomasz
"
The view_info object, which is attached to the schema object of the
view, contains a data structure called
"base_non_pk_columns_in_view_pk". This data structure contains column
ids of the base table so is valid only for a particular version of the
base table schema. This data structure is used by materialized view
code to interpret mutations of the base table, those coming from base
table writes, or reads of the base table done as part of view updates
or view building.

The base table schema version of that data structure must match the
schema version of the mutation fragments, otherwise we hit undefined
behavior. This may include aborts, exceptions, segfaults, or data
corruption (e.g. writes landing in the wrong column in the view).

Before this patch, we could get schema version mismatch here after the
base table was altered. That's because the view schema did not change
when the base table was altered.

Another problem was that view building was using the current table's schema
to interpret the fragments and invoke view building. That's incorrect for two
reasons. First, fragments generated by a reader must be accessed only using
the reader's schema. Second, base_non_pk_columns_in_view_pk of the recorded
view ptrs may not longer match the current base table schema, which is used
to generate the view updates.

Part of the fix is to extract base_non_pk_columns_in_view_pk into a
third entity called base_dependent_view_info, which changes both on
base table schema changes and view schema changes.

It is managed by a shared pointer so that we can take immutable
snapshots of it, just like with schema_ptr. When starting the view
update, the base table schema_ptr and the corresponding
base_dependent_view_info have to match. So we must obtain them
atomically, and base_dependent_view_info cannot change during update.

Also, whenever the base table schema changes, we must update
base_dependent_view_infos of all attached views (atomically) so that
it matches the base table schema.

Fixes #7061.

Tests:

  - unit (dev)
  - [v1] manual (reproduced using scylla binary and cqlsh)
"

* tag 'mv-schema-mismatch-fix-v2' of github.com:tgrabiec/scylla:
  db: view: Refactor view_info::initialize_base_dependent_fields()
  tests: mv: Test dropping columns from base table
  db: view: Fix incorrect schema access during view building after base table schema changes
  schema: Call on_internal_error() when out of range id is passed to column_at()
  db: views: Fix undefined behavior on base table schema changes
  db: views: Introduce has_base_non_pk_columns_in_view_pk()

(cherry picked from commit 3daa49f098)
2020-10-06 16:49:08 +03:00
Juliusz Stasiewicz
1ecc447f42 tracing: Fix error on slow batches
`trace_keyspace_helper::make_slow_query_mutation_data` expected a
"query" key in its parameters, which does not appear in case of
e.g. batches of prepared statements. This is example of failing
`record.parameters`:
```
...{"query[0]" : "INSERT INTO ks.tbl (pk, i) values (?, ?);"},
{"query[1]" : "INSERT INTO ks.tbl (pk, i) values (?, ?);"}...
```

In such case Scylla recorded no trace and said:
```
ERROR 2020-09-28 10:09:36,696 [shard 3] trace_keyspace_helper - No
"query" parameter set for a session requesting a slow_query_log record
```

Fix here is to leave query empty if not found. The users can still
retrieve the query contents from existing info.

Fixes #5843

Closes #7293

(cherry picked from commit 0afa738a8f)
2020-10-04 18:04:42 +03:00
Tomasz Grabiec
7f3ffbc1c8 Merge "evictable_reader: validate buffer on reader recreation" from Botond
This series backports the evictable reader validation patchset (merged
as 97c99ea9f to master) to 4.1.

I only had to do changes to the tests.

Tests: unit(dev), some exception safety tests are failing with or
without my patchset

* https://github.com/denesb/scylla.git denesb/evictable-reader-validate-buffer/backport-4.1:
  mutation_reader_test: add unit test for evictable reader self-validation
  evictable_reader: validate buffer after recreation the underlying
  evictable_reader: update_next_position(): only use peek'd position on partition boundary
  mutation_reader_test: add unit test for evictable reader range tombstone trimming
  evictable_reader: trim range tombstones to the read clustering range
  position_in_partition_view: add position_in_partition_view before_key() overload
  flat_mutation_reader: add buffer() accessor
2020-10-02 11:50:29 +02:00
Botond Dénes
6a02d120ec mutation_reader_test: add unit test for evictable reader self-validation
Add both positive (where the validation should succeed) and negative
(where the validation should fail) tests, covering all validation cases.

(cherry picked from commit 076c27318b)
2020-10-02 09:45:20 +03:00
Botond Dénes
d820997452 evictable_reader: validate buffer after recreation the underlying
The reader recreation mechanism is a very delicate and error-prone one,
as proven by the countless bugs it had. Most of these bugs were related
to the recreated reader not continuing the read from the expected
position, inserting out-of-order fragments into the stream.
This patch adds a defense mechanism against such bugs by validating the
start position of the recreated reader. Several things are checked:
* The partition is the expected one -- the one we were in the middle of
  or the next if we stopped at partition boundaries.
* The partition is in the read range.
* The first fragment in the partition is the expected one -- has a
  an equal or larger position than the next expected fragment.
* The fragment is in the clustering range as defined by the slice.

As these validations are only done on the slow-path of recreating an
evicted reader, no performance impact is expected.

(cherry picked from commit 0b0ae18a14)
2020-10-02 09:38:04 +03:00
Botond Dénes
e1e57d224b evictable_reader: update_next_position(): only use peek'd position on partition boundary
`evictable_reader::update_next_position()` is used to record the position the
reader will continue from, in the next buffer fill. This position is used to
create the partition slice when the underlying reader is evicted and has
to be recreated. There is an optimization in this method -- if the
underlying's buffer is not empty we peek at the first fragment in it and
use it as the next position. This is however problematic for buffer
validation on reader recreation (introduced in the next patch), because
using the next row's position as the next pos will allow for range
tombstones to be emitted with before_key(next_pos.key()), which will
trigger the validation. Instead of working around this, just drop this
optimization for mid-partition positions, it is inconsequential anyway.
We keep it for where it is important, when we detect that we are at a
partition boundary. In this case we can avoid reading the current
partition altogether when recreating the reader.

(cherry picked from commit 91020eef73)
2020-10-02 09:38:04 +03:00
Botond Dénes
763e063356 mutation_reader_test: add unit test for evictable reader range tombstone trimming
(cherry picked from commit d1b0573e1c)
2020-10-02 09:37:57 +03:00
Botond Dénes
a8f966aafa evictable_reader: trim range tombstones to the read clustering range
Currently mutation sources are allowed to emit range tombstones that are
out-of the clustering read range if they are relevant to it. For example
a read of a clustering range [ck100, +inf), might start with:

    range_tombstone{start={ck1, -1}, end={ck200, 1}},
    clustering_row{ck100}

The range tombstone is relevant to the range and the first row of the
range so it is emitted as first, but its position (start) is outside the
read range. This is normally fine, but it poses a problem for evictable
reader. When the underlying reader is evicted and has to be recreated
from a certain clustering position, this results in out-of-order
mutation fragments being inserted into the middle of the stream. This is
not fine anymore as the monotonicity guarantee of the stream is
violated. The real solution would be to require all mutation sources to
trim range tombstones to their read range, but this is a lot of work.
Until that is done, as a workaround we do this trimming in the evictable
reader itself.

(cherry picked from commit 4f2e7a18e2)
2020-10-02 08:59:55 +03:00
Botond Dénes
1a3c8a0ec5 position_in_partition_view: add position_in_partition_view before_key() overload
(cherry picked from commit d7d93aef49)
2020-10-02 08:59:55 +03:00
Botond Dénes
268821223c flat_mutation_reader: add buffer() accessor
To allow outsiders to inspect the contents of the reader's buffer.

(cherry picked from commit ab59e7c725)
2020-10-02 08:59:55 +03:00
Tomasz Grabiec
6c43a0dc29 schema: Fix race in schema version recalculation leading to stale schema version in gossip
Migration manager installs several feature change listeners:

    if (this_shard_id() == 0) {
        _feature_listeners.push_back(_feat.cluster_supports_view_virtual_columns().when_enabled(update_schema));
        _feature_listeners.push_back(_feat.cluster_supports_digest_insensitive_to_expiry().when_enabled(update_schema));
        _feature_listeners.push_back(_feat.cluster_supports_cdc().when_enabled(update_schema));
        _feature_listeners.push_back(_feat.cluster_supports_per_table_partitioners().when_enabled(update_schema));
    }

They will call update_schema_version_and_announce() when features are enabled, which does this:

    return update_schema_version(proxy, features).then([] (utils::UUID uuid) {
        return announce_schema_version(uuid);
    });

So it first updates the schema version and then publishes it via
gossip in announce_schema_version(). It is possible that the
announce_schema_version() part of the first schema change will be
deferred and will execute after the other four calls to
update_schema_version_and_announce(). It will install the old schema
version in gossip instead of the more recent one.

The fix is to serialize schema digest calculation and publishing.

Fixes #7200

(cherry picked from commit 1a57d641d1)
2020-10-01 18:18:21 +02:00
Yaron Kaikov
8399aac6bc release: prepare for 4.1.8 2020-09-28 20:25:06 +03:00
Avi Kivity
b1a70d0ad4 Update sesatar submodule
* seastar 15cd93729f...88b6f0172c (1):
  > lz4_fragmented_compressor: Fix buffer requirements

Fixes #6925.
2020-09-23 11:55:54 +03:00
Yaron Kaikov
2251a1c577 release: prepare for 4.1.7 2020-09-17 21:30:34 +03:00
Nadav Har'El
f8c7c485d2 alternator: fix corruption of PutItem operation in case of contention
This patch fixes a bug noted in issue #7218 - where PutItem operations
sometimes lose part of the item's data - some attributes were lost,
and the name of other attributes replaced by empty strings. The problem
happened when the write-isolation policy was LWT and there was contention
of writes to the same partition (not necessarily the same item).

To use CAS (a.k.a. LWT), Alternator builds an alternator::rmw_operation
object with an apply() function which takes the old contents of the item
(if needed) and a timestamp, and builds a mutation that the CAS should
apply. In the case of the PutItem operation, we wrongly assumed that apply()
will be called only once - so as an optimization the strings saved in the
put_item_operation were moved into the returned mutation. But this
optimization is wrong - when there is contention, apply() may be called
again when the changed proposed by the previous one was not accepted by
the Paxos protocol.

The fix is to change the one place where put_item_operation *moved* strings
out of the saved operations into the mutations, to be a copy. But to prevent
this sort of bug from reoccuring in future code, this patch enlists the
compiler to help us verify that it can't happen: The apply() function is
marked "const" - it can use the information in the operation to build the
mutation, but it can never modify this information or move things out of it,
so it will be fine to call this function twice.

The single output field that apply() does write (_return_attributes) is
marked "mutable" to allow the const apply() to write to it anyway. Because
apply() might be called twice, it is important that if some apply()
implementation sometimes sets _return_attributes, then it must always
set it (even if to the default, empty, value) on every call to apply().

The const apply() means that the compiler verfies for us that I didn't
forget to fix additional wrong std::move()s. Additionally, a test I wrote
to easily reproduce issue #7218 (which I will submit as a dtest later)
passes after this fix.

Fixes #7218.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200916064906.333420-1-nyh@scylladb.com>
(cherry picked from commit 5e8bdf6877)
2020-09-16 21:26:59 +03:00
Benny Halevy
d60bed1953 test: cql_query_test: test_cache_bypass: use table stats
test is currently flaky since system reads can happen
in the background and disturb the global row cache stats.

Use the table's row_cache stats instead.

Fixes #6773

Test: cql_query_test.test_cache_bypass(dev, debug)

Credit-to: Botond Dénes <bdenes@scylladb.com>
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20200811140521.421813-1-bhalevy@scylladb.com>
(cherry picked from commit 6deba1d0b4)
2020-09-16 18:19:30 +03:00
Dejan Mircevski
259203a394 cql3: Fix NULL reference in get_column_defs_for_filtering
There was a typo in get_column_defs_for_filtering(): it checked the
wrong pointer before dereferencing.  Add a test exposing the NULL
dereference and fix the typo.

Tests: unit (dev)

Fixes #7198.

Signed-off-by: Dejan Mircevski <dejan@scylladb.com>
(cherry picked from commit 9d02f10c71)
2020-09-16 15:47:04 +03:00
Avi Kivity
5f284633d4 reconcilable_result_builder: don't aggrevate out-of-memory condition during recovery
Consider an unpaged query that consumes all of available memory, despite
fea5067dfa which limits them (perhaps the
user raised the limit, or this is a system query). Eventually we will see a
bad_alloc which will abort the query and destroy this reconcilable_result_builder.

During destruction, we first destroy _memory_accounter, and then _result.
Destroying _memory_accounter resumes some continuations which can then
allocate memory synchronously when increasing the task queue to accomodate
them. We will then crash. Had we not crashed, we would immediately afterwards
release _result, freeing all the memory that we would ever need.

Fix by making _result the last member, so it is freed first.

Fixes #7240.

(cherry picked from commit 9421cfded4)
2020-09-16 15:40:58 +03:00
Asias He
66cc4be8f6 storage_service: Fix a TOKENS update race for replace operation
In commit 7d86a3b208 (storage_service:
Make replacing node take writes), application state of TOKENS of the
replacing node is added into gossip and propagated to the cluster after
the initial start of gossip service. This can cause a race below

1. The replacing node replaces the old dead node with the same ip address
2. The replacing node starts gossip without application state of the TOKENS
3. Other nodes in the cluster replace the application states of old dead node's
   version with the new replacing node's version
4. replacing node dies
5. replace operation is performed again, the TOKENS application state is
   not preset and replace operation fails.

To fix, we can always add TOKENS application state when the
gossip service starts.

Fixes: #7166
Backports: 4.1 and 4.2
(cherry picked from commit 3ba6e3d264)
2020-09-10 13:13:58 +03:00
Avi Kivity
9ca6aa5535 Merge "Fix repair stalls in get_sync_boundary and apply_rows_on_master_in_thread" from Asias
"
This path set fixes stalls in repair that are caused by std::list merge and clear operations during test_latency_read_with_nemesis test.

Fixes #6940
Fixes #6975
Fixes #6976
"

* 'fix_repair_list_stall_merge_clear_v2' of github.com:asias/scylla:
  repair: Fix stall in apply_rows_on_master_in_thread and apply_rows_on_follower
  repair: Use clear_gently in get_sync_boundary to avoid stall
  utils: Add clear_gently
  repair: Use merge_to_gently to merge two lists
  utils: Add merge_to_gently

(cherry picked from commit 4547949420)
2020-09-10 13:13:54 +03:00
Avi Kivity
6e63db8c72 repair: apply_rows_on_follower(): remove copy of repair_rows list
We copy a list, which was reported to generate a 15ms stall.

This is easily fixed by moving it instead, which is safe since this is
the last use of the variable.

Fixes #7115.

(cherry picked from commit 6ff12b7f79)
2020-09-10 11:53:29 +03:00
Avi Kivity
803da18727 Update seastar submodule
* seastar 18275cbc0e...15cd93729f (1):
  > core/reactor: complete_timers(): restore previous scheduling group

Fixes #7184.
2020-09-07 11:33:06 +03:00
Raphael S. Carvalho
165d89860e compaction: Prevent non-regular compaction from picking compacting SSTables
After 8014c7124, cleanup can potentially pick a compacting SSTable.
Upgrade and scrub can also pick a compacting SSTable.
The problem is that table::candidates_for_compaction() was badly named.
It misleads the user into thinking that the SSTables returned are perfect
candidates for compaction, but manager still need to filter out the
compacting SSTables from the returned set. So it's being renamed.

When the same SSTable is compacted in parallel, the strategy invariant
can be broken like overlapping being introduced in LCS, and also
some deletion failures as more than one compaction process would try
to delete the same files.

Let's fix scrub, cleanup and ugprade by calling the manager function
which gets the correct candidates for compaction.

Fixes #6938.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20200811200135.25421-1-raphaelsc@scylladb.com>
(cherry picked from commit 11df96718a)
2020-09-06 18:40:56 +03:00
Takuya ASADA
4a5116a0ae aws: update enhanced networking supported instance list
Sync enhanced networking supported instance list to latest one.

Reference: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/enhanced-networking.html

Fixes #6991

(cherry picked from commit 7cccb018b8)
2020-09-06 18:21:28 +03:00
Yaron Kaikov
6d9ff622df release: prepare for 4.1.6 2020-08-30 21:34:46 +03:00
Nadav Har'El
65bc33c921 redis: fix another use-after-free crash in "exists" command
Never trust Occam's Razor - it turns out that the use-after-free bug in the
"exists" command was caused by two separate bugs. We fixed one in commit
9636a33993, but there is a second one fixed in
this patch.

The problem fixed here was that a "service_permit" object, which is designed to
be copied around from place to place (it contains a shared pointer, so is cheap
to copy), was saved by reference, and the reference was to a function argument
and was destroyed prematurely.

This time I tested *many times* that that test_strings.py passes on both dev and
debug builds.

Note that test/run/redis still fails in a debug build, but due to a different
problem.

Fixes #6469

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Reviewed-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20200825183313.120331-1-nyh@scylladb.com>
(cherry picked from commit 868194cd17)
2020-08-27 12:25:03 +03:00
Nadav Har'El
5e90f06ca2 redis: fix use-after-free crash in "exists" command
A missing "&" caused the key stored in a long-living command to be copied
and the copy quickly freed - and then used after freed.
This caused the test test_strings.py::test_exists_multiple_existent_key for
this feature to frequently crash.

Fixes #6469

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200823190141.88816-1-nyh@scylladb.com>
(cherry picked from commit 9636a33993)
2020-08-27 12:25:03 +03:00
Asias He
2036de3245 compaction_manager: Avoid stall in perform_cleanup
The following stall was seen during a cleanup operation:

scylla: Reactor stalled for 16262 ms on shard 4.

| std::_MakeUniq<locator::tokens_iterator_impl>::__single_object std::make_unique<locator::tokens_iterator_impl, locator::tokens_iterator_impl&>(locator::tokens_iterator_impl&) at /usr/include/fmt/format.h:1158
|  (inlined by) locator::token_metadata::tokens_iterator::tokens_iterator(locator::token_metadata::tokens_iterator const&) at ./locator/token_metadata.cc:1602
| locator::simple_strategy::calculate_natural_endpoints(dht::token const&, locator::token_metadata&) const at simple_strategy.cc:?
|  (inlined by) locator::simple_strategy::calculate_natural_endpoints(dht::token const&, locator::token_metadata&) const at ./locator/simple_strategy.cc:56
| locator::abstract_replication_strategy::get_ranges(gms::inet_address, locator::token_metadata&) const at /usr/include/fmt/format.h:1158
| locator::abstract_replication_strategy::get_ranges(gms::inet_address) const at /usr/include/fmt/format.h:1158
| service::storage_service::get_ranges_for_endpoint(seastar::basic_sstring<char, unsigned int, 15u, true> const&, gms::inet_address const&) const at /usr/include/fmt/format.h:1158
| service::storage_service::get_local_ranges(seastar::basic_sstring<char, unsigned int, 15u, true> const&) const at /usr/include/fmt/format.h:1158
|  (inlined by) operator() at ./sstables/compaction_manager.cc:691
|  (inlined by) _M_invoke at /usr/include/c++/9/bits/std_function.h:286
| std::function<std::vector<seastar::lw_shared_ptr<sstables::sstable>, std::allocator<seastar::lw_shared_ptr<sstables::sstable> > > (table const&)>::operator()(table const&) const at /usr/include/fmt/format.h:1158
|  (inlined by) compaction_manager::rewrite_sstables(table*, sstables::compaction_options, std::function<std::vector<seastar::lw_shared_ptr<sstables::sstable>, std::allocator<seastar::lw_shared_ptr<sstables::sstable> > > (table const&)>) at ./sstables/compaction_manager.cc:604
| compaction_manager::perform_cleanup(table*) at /usr/include/fmt/format.h:1158

To fix, we furturize the function to get local ranges and sstables.

In addition, this patch removes the dependency to global storage_service object.

Fixes #6662

(cherry picked from commit 07e253542d)
2020-08-27 12:25:03 +03:00
Raphael S. Carvalho
0924e4d92f sstables: optimize procedure that checks if a sstable needs cleanup
needs_cleanup() returns true if a sstable needs cleanup.

Turns out it's very slow because it iterates through all the local
ranges for all sstables in the set, making its complexity:
	O(num_sstables * local_ranges)

We can optimize it by taking into account that abstract_replication_strategy
documents that get_ranges() will return a list of ranges that is sorted
and non-overlapping. Compaction for cleanup already takes advantage of that
when checking if a given partition can be actually purged.

So needs_cleanup() can be optimized into O(num_sstables * log(local_ranges)).

With num_sstables=1000, RF=3, then local_ranges=256(num_tokens)*3, it means
the max # of checks performed will go from 768000 to ~9584.

Fixes #6730.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20200629171355.45118-2-raphaelsc@scylladb.com>
(cherry picked from commit cf352e7c14)
2020-08-27 12:25:03 +03:00
Raphael S. Carvalho
b8313775c5 sstables: export needs_cleanup()
May be needed elsewhere, like in an unit test.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20200629171355.45118-1-raphaelsc@scylladb.com>
(cherry picked from commit a9eebdc778)
2020-08-27 12:25:02 +03:00
Asias He
ec0002a67f abstract_replication_strategy: Add get_ranges_in_thread
Add a version that runs inside a seastar thread. The benefit is that
get_ranges can yield to avoid stalls.

Refs #6662

(cherry picked from commit 94995acedb)
2020-08-27 12:24:55 +03:00
Asias He
ebdf5f9e55 gossip: Fix race between shutdown message handler and apply_state_locally
1. The node1 is shutdown
2. The node1 sends shutdown message to node2
3. The node2 receives gossip shutdown message but the handler yields
4. The node1 is restarted
5. The node1 sends new gossip endpoint_state to node2, node2 applies the state
   in apply_state_locally and calls gossiper::handle_major_state_change
   and then calls gossiper::mark_alive
6. The shutdown message handler in step 3 resumes and sets status of node1 to SHUTDOWN
7. The gossiper::mark_alive fiber in step 5 resumes and calls gossiper::real_mark_alive,
   node2 will skip to mark node1 as alive because the status of node1 is
   SHUTDOWN. As a result, node1 is alive but it is not marked as UP by node2.

To fix, we serialize the two operations.

Fixes #7032

(cherry picked from commit e6ceec1685)
2020-08-27 11:15:59 +03:00
Nadav Har'El
32c0e4f110 alternator test: configurable temporary directory
The test/alternator/run script creates a temporary directory for the Scylla
database in /tmp. The assumption was that this is the fastest disk (usually
even a ramdisk) on the test machine, and we didn't need anything else from
it.

But it turns out that on some systems, /tmp is actually a slow disk, so
this patch adds a way to configure the temporary directory - if the TMPDIR
environment variable exists, it is used instead of /tmp. As before this
patch, a temporary subdirectry is created in $TMPDIR, and this subdirectory
is automatically deleted when the test ends.

The test.py script already passes an appropriate TMPDIR (testlog/$mode),
which after this patch the Alternator test will use instead of /tmp.

Fixes #6750

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200713193023.788634-1-nyh@scylladb.com>
(cherry picked from commit 8e3be5e7d6)
2020-08-26 19:37:38 +03:00
Nadav Har'El
5f48444a98 alternator: fix order conditions on binary attributes
We implemented the order operators (LT, GT, LE, GE, BETWEEN) incorrectly
for binary attributes: DynamoDB requires that the bytes be treated as
unsigned for the purpose of order (so byte 128 is higher than 127), but
our implementation uses Scylla's "bytes" type which has signed bytes.

The solution is simple - we can continue to use the "bytes" type, but
we need to use its compare_unsigned() function, not its "<" operator.

This bug affected conditional operations ("Expected" and
"ConditionExpression") and also filters ("QueryFilter", "ScanFilter",
"FilterExpression"). The bug did *not* affect Query's key conditions
("KeyConditions", "KeyConditionExpression") because those already
used Scylla's key comparison functions - which correctly compare binary
blobs as unsigned bytes (in fact, this is why we have the
compare_unsigned() function).

The patch also adds tests that reproduce the bugs in conditional
operations, and show that the bug did not exist in key conditions.

Fixes #6573

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200603084257.394136-1-nyh@scylladb.com>
(cherry picked from commit f6b1f45d69)
Manually removed tests in test_key_conditions.py that did not exist in this branch
2020-08-26 18:47:28 +03:00
Avi Kivity
8930ea5407 Merge "Unregister RPC verbs on stop" from Pavel E
"
There are 5 services, that register their RPC handlers in messaging
service, but quite a few of them unregister them on stop.

Unregistering is somewhat critical, not just because it makes the
code look clean, but also because unregistration does wait for the
message processing to complete, thus avoiding use-after-free's in
the handlers.

In particular, several handlers call service::get_schema_for_write()
which, in turn, may end up in service::maybe_sync() calling for
the local migration manager instance. All those handlers' processing
must be waited for before stopping the migration manager.

The set brings the RPC handlers unregistration in sync with the
registration part.

tests: unit (dev)
       dtest (dev: simple_boot_shutdown, repair)
       start-stop by hands (dev)
fixes: #6904
"

* 'br-rpc-unregister-verbs' of https://github.com/xemul/scylla:
  main: Add missing calls to unregister RPC hanlers
  messaging: Add missing per-service unregistering methods
  messaging: Add missing handlers unregistration helpers
  streaming: Do not use db->invoke_on_all in vain
  storage_proxy: Detach rpc unregistration from stop
  main: Shorten call to storage_proxy::init_messaging_service

(cherry picked from commit 01b838e291)
2020-08-26 14:42:17 +03:00
Raphael S. Carvalho
311cd6403c cql3/statements: verify that counter column cannot be added into non-counter table
A check, to validate that counter column cannot be added into non-counter table,
is missing for alter table statement. Validation is performed when building new
schema, but it's limited to checking that a schema will not contain both counter
and non-counter columns.

Due to lack of validation, the added counter column could be incorrectly
persisted to the schema, but this results in a crash when setting the new
schema to its table. On restart, it can be confirmed that the schema change
was indeed persisted when describing the table.
This problem is fixed by doing proper validation for the alter table statement,
which consists of making sure a new counter column cannot be added to a
non-counter table.

The test cdc_disallow_cdc_for_counters_test is adjusted because one of its tests
was built on the assumption that counter column can be added into a non-counter
table.

Fixes #7065.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20200824155709.34743-1-raphaelsc@scylladb.com>
(cherry picked from commit 1c29f0a43d)
2020-08-25 18:45:30 +03:00
Takuya ASADA
b71821435a dist/debian: disable debuginfo compression on .deb
Since older binutils on some distribution does not able to handle
compressed debuginfo generated on Fedora, we need to disable it.
However, debian packager force debuginfo compression since debian/compat = 9,
we have to uncompress them after compressed automatically.

Fixes #6982

(cherry picked from commit 75c2362c95)
2020-08-23 19:02:57 +03:00
Botond Dénes
cd29e2643c scylla-gdb.py: find_db(): don't return current shard's database for shard=0
The `shard` parameter of `find_db()` is optional and is defaulted to
`None`. When missing, the current shard's database instance is returned.
The problem is that the if condition checking this uses `not shard`,
which also evaluates to `True` if `shard == 0`, resulting in returning
the current shard's database instance for shard 0. Change the condition
to `shard is None` to avoid this.

Fixes: #7016
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200812091546.1704016-1-bdenes@scylladb.com>
(cherry picked from commit 4cfab59eb1)
2020-08-23 18:56:26 +03:00
Avi Kivity
59aa1834a7 Merge "repair: row_level: prevent deadlocks when repairing homogenous nodes" from Botond
"
This series backports the series "repair: row_level: prevent deadlocks
when repairing homogenous nodes" (merged as a9c7a1a86) to branch-4.1.
"

Fixes #6272

* 'repair-row-level-evictable-local-reader/branch-4.1' of https://github.com/denesb/scylla:
  repair: row_level: destroy reader on EOS or error
  repair: row_level: use evictable_reader for local reads
  mutation_reader: expose evictable_reader
  mutation_reader: evictable_reader: add auto_pause flag
  mutation_reader: make evictable_reader a flat_mutation_reader
  mutation_reader: s/inactive_shard_read/inactive_evictable_reader/
  mutation_reader: move inactive_shard_reader code up
  mutation_reader: fix indentation
  mutation_reader: shard_reader: extract remote_reader as evictable_reader
  mutation_reader: reader_lifecycle_policy: make semaphore() available early
2020-08-23 18:06:12 +03:00
Botond Dénes
436b305286 view_update_generator: fix race between registering and processing sstables
fea83f6 introduced a race between processing (and hence removing)
sstables from `_sstables_with_tables` and registering new ones. This
manifested in sstables that were added concurrently with processing a
batch for the same sstables being dropped and the semaphore units
associated with them not returned. This resulted in repairs being
blocked indefinitely as the units of the semaphore were effectively
leaked.

This patch fixes this by moving the contents of `_sstables_with_tables`
to a local variable before starting the processing. A unit test
reproducing the problem is also added.

Fixes: #6892

Tests: unit(dev)
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200817160913.2296444-1-bdenes@scylladb.com>
(cherry picked from commit 22a6493716)
2020-08-23 18:04:29 +03:00
Botond Dénes
1d85051e8d repair: row_level: destroy reader on EOS or error
To avoid having to make it an optional with all the additional checks,
we just replace it with an empty reader instead, this also also achieves
the desired effect of releasing the read permit and all the associated
resources early.

(cherry picked from commit fbbc86e18c)
2020-08-20 16:10:16 +03:00
Botond Dénes
3f52d8733b repair: row_level: use evictable_reader for local reads
Row level repair, when using a local reader, is prone to deadlocking on
the streaming reader concurrency semaphore. This has been observed to
happen with at least two participating nodes, running more concurrent
repairs than the maximum allowed amount of reads by the concurrency
semaphore. In this situation, it is possible that two repair instances,
competing for the last available permits on both nodes, get a permit on
one of the nodes and get queued on the other one respectively. As
neither will let go of the permit it already acquired, nor give up
waiting on the failed-to-acquired permit, a deadlock happens.

To prevent this, we make the local repair reader evictable. For this we
reuse the newly exposed evictable reader.
The repair reader is paused after the repair buffer is filled, which is
currently 32MB, so the cost of a possible reader recreation is amortized
over 32MB read.

The repair reader is said to be local, when it can use the shard-local
partitioner. This is the case if the participating nodes are homogenous
(their shard configuration is identical), that is the repair instance
has to read just from one shard. A non-local reader uses the multishard
reader, which already makes its shard readers evictable and hence is not
prone to the deadlock described here.

(cherry picked from commit 080f00b99a)
2020-08-20 16:10:16 +03:00
Botond Dénes
eece444547 mutation_reader: expose evictable_reader
Expose functions for the outside world to create evictable readers. We
expose two functions, which create an evictable reader with
`auto_pause::yes` and `auto_pause::no` respectively. The function
creating the latter also returns a handle in addition to the reader,
which can be used to pause the reader.

(cherry picked from commit 542d9c3711)
2020-08-20 16:10:16 +03:00
Botond Dénes
2ab51c4055 mutation_reader: evictable_reader: add auto_pause flag
Currently the evictable reader unconditionally pauses the underlying
reader after each use (`fill_buffer()` or `fast_forward_to()` call).
This is fine for current users (the multishard reader), but the future
user we are doing all this refactoring for -- repair -- will want to
control when the underlying reader is paused "manually". Both these
behaviours can easily be supported in a single implementation, so we
add an `auto_pause` flag to allow the creator of the evictable reader
to control this.

(cherry picked from commit 1cc31deff9)
2020-08-20 16:10:16 +03:00
Botond Dénes
4a1a1feb55 mutation_reader: make evictable_reader a flat_mutation_reader
The `evictable_reader` class is almost a proper flat mutation reader
already, it roughly offers the same interface. This patch makes this
formal: changing the class to inherit from `flat_mutation_reader::impl`,
and implement all virtual methods. This also entails a departure from
using the lifecycle policy to pause/resume and create readers, instead
using more general building blocks like the reader concurrency semaphore
and a mutation source.

(cherry picked from commit af9e1c23e1)
2020-08-20 16:10:16 +03:00
Botond Dénes
76995933e0 mutation_reader: s/inactive_shard_read/inactive_evictable_reader/
Rename `inactive_shard_read` to `inactive_evictable_reader` to reflect
that the fact that the evictable reader is going to be of general use,
not specific to the multishard reader.

(cherry picked from commit 4485864ada)
2020-08-20 16:10:16 +03:00
Botond Dénes
f840263fdd mutation_reader: move inactive_shard_reader code up
It will be used by the `evictable_reader` code too in the next patches.

(cherry picked from commit b6ed054c08)
2020-08-20 16:10:16 +03:00
Botond Dénes
b4887ce4a5 mutation_reader: fix indentation
Deferred from the previous patch.

(cherry picked from commit e3ea1c9080)
2020-08-20 16:10:16 +03:00
Botond Dénes
849e12bf2e mutation_reader: shard_reader: extract remote_reader as evictable_reader
We want to make the evictable reader mechanism used in the multishard
reader pipeline available for general (re)use, as a standalone
flat mutation reader implementation. The first step is extracting
`shard_reader::remote_reader` the class implementing this logic into a
top-level class, also renamed to `evictable_reader`.

(cherry picked from commit f9d1916499)
2020-08-20 16:10:16 +03:00
Botond Dénes
f124f97f99 mutation_reader: reader_lifecycle_policy: make semaphore() available early
Currently all reader lifecycle policy implementations assume that
`semaphore()` will only be called after at least one call to
`make_reader()`. This assumption will soon not hold, so make sure
`semaphore()` can be called at any time, including before any calls are
made to `make_reader()`.

(cherry picked from commit 63309f925c)
2020-08-20 16:10:16 +03:00
Botond Dénes
4ee0b489cf table: get_sstables_by_partition_key(): don't make a copy of selected sstables
Currently we assign the reference to the vector of selected sstables to
`auto sst`. This makes a copy and we pass this local variable to
`do_for_each()`, which will result in a use-after-free if the latter
defers.
Fix by not making a copy and instead just keep the reference.

Fixes: #7060

Tests: unit(dev)
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200818091241.2341332-1-bdenes@scylladb.com>
(cherry picked from commit 78f94ba36a)
2020-08-19 00:02:01 +03:00
Yaron Kaikov
382dcb9d34 release: prepare for 4.1.5 2020-08-17 21:55:06 +03:00
Calle Wilund
07b7df9171 cdc::log: Missing "preimage" check in row deletion pre-image
Fixes #6561

Pre-image generation in row deletion case only checked if we had a pre-image
result set row. But that can be from post-image. Also check actual existance
of the pre-image CK.
Message-Id: <20200608132804.23541-1-calle@scylladb.com>

(cherry picked from commit 5105e9f5e1)
2020-08-12 13:52:45 +03:00
Nadav Har'El
7fa3a988e3 Update Seastar submodule
> http: add "Expect: 100-continue" handling

Fixes #6844
2020-08-11 13:16:16 +03:00
Asias He
7b23574224 repair: Switch to btree_set for repair_hash.
In one of the longevity tests, we observed 1.3s reactor stall which came from
repair_meta::get_full_row_hashes_source_op. It traced back to a call to
std::unordered_set::insert() which triggered big memory allocation and
reclaim.

I measured std::unordered_set, absl::flat_hash_set, absl::node_hash_set
and absl::btree_set. The absl::btree_set was the only one that seastar
oversized allocation checker did not warn in my tests where around 300K
repair hashes were inserted into the container.

- unordered_set:
hash_sets=295634, time=333029199 ns

- flat_hash_set:
hash_sets=295634, time=312484711 ns

- node_hash_set:
hash_sets=295634, time=346195835 ns

- btree_set:
hash_sets=295634, time=341379801 ns

The btree_set is a bit slower than unordered_set but it does not have
huge memory allocation. I do not measure real difference of total time
to finish repair of the same dataset with unordered_set and btree_set.

To fix, switch to absl btree_set container.

Fixes #6190

(cherry picked from commit 67f6da6466)
(cherry picked from commit a27188886a)
2020-08-11 12:34:26 +03:00
Rafael Ávila de Espíndola
ac207c892b build: Link with abseil
It is a pity we have to list so many libraries, but abseil doesn't
provide a .pc file.

Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
(cherry picked from commit 7d1f6725dd)

Ref #6190.
2020-08-11 12:34:26 +03:00
Rafael Ávila de Espíndola
a023b3bb7a Add abseil as a submodule
This adds the https://abseil.io library as a submodule. The patch
series that follows needs a hash table that supports heterogeneous
lookup, and abseil has a really good hash table that supports that
(https://abseil.io/blog/20180927-swisstables).

The library is still not available in Fedora, but it is fairly easy to
use it directly from a submodule.

Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
(cherry picked from commit 383a9c6da9)

Ref #6190
2020-08-11 12:34:26 +03:00
Rafael Ávila de Espíndola
0b9db42d9c cofigure: Don't overwrite seastar_cflags
The variable seastar_cflags was being used for flags passed to seastar
and for flags extracted from the seastar.pc file.

This introduces a new variable for the flags extracted from the
seastar.pc file.

Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
(cherry picked from commit 2ad09aefb6)

Ref #6190.
2020-08-11 12:34:26 +03:00
Calle Wilund
df8d4482c5 database: Do not assert on replay positions if truncate does not flush
Fixes #6995

In c2c6c71 the assert on replay positions in flushed sstables discarded by
truncate was broken, by the fact that we no longer flush all sstables
unless auto snapshot is enabled.

This means the low_mark assertion does not hold, because we maybe/probably
never got around to creating the sstables that would hold said mark.

Note that the (old) change to not create sstables and then just delete
them is in itself good. But in that case we should not try to verify
the rp mark.

(cherry picked from commit 9620755c7f)
2020-08-10 23:33:39 +03:00
Avi Kivity
442d7bf9ff Update seastar submodule
* seastar c9c1dc5fa7...1337f1158b (1):
  > memory: fix small aligned free memory corruption

Fixes #6831
2020-08-09 18:37:32 +03:00
Avi Kivity
bc6422d16d Merge 'hinted handoff: fix commitlog memory leak' from Piotr D
"
When commitlog is recreated in hints manager, only shutdown() method is
called, but not release(). Because of that, some internal commitlog
objects (`segment_manager` and `segment`s) may be left pointing to each
other through shared_ptr reference cycles, which may result in memory
leak when the parent commitlog object is destroyed.

This PR prevents memory leaks that may happen this way by calling
release() after shutdown() from the hints manager.

Fixes: #6409, Fixes #6776
"

* piodul-fix-commitlog-memory-leak-in-hinted-handoff:
  hinted handoff: disable warnings about segments left on disk
  hinted handoff: release memory on commitlog termination

(cherry picked from commit 4c221855a1)
2020-08-09 17:25:57 +03:00
Yaron Kaikov
76f4bc4c6f release: prepare for 4.1.4 2020-08-09 08:49:19 +03:00
Tomasz Grabiec
dc4efb0a1e thrift: Fix crash on unsorted column names in SlicePredicate
The column names in SlicePredicate can be passed in arbitrary order.
We converted them to clustering ranges in read_command preserving the
original order. As a result, the clustering ranges in read command may
appear out of order. This violates storage engine's assumptions and
lead to undefined behavior.

It was seen manifesting as a SIGSEGV or an abort in sstable reader
when executing a get_slice() thrift verb:

scylla: sstables/consumer.hh:476: seastar::future<> data_consumer::continuous_data_consumer<StateProcessor>::fast_forward_to(size_t, size_t) [with StateProcessor = sstables::data_consume_rows_context_m; size_t = long unsigned int]: Assertion `end >= _stream_position.position' failed.

Fixes #6486.

Tests:

   - added a new dtest to thrift_tests.py which reproduces the problem

Message-Id: <1596725657-15802-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit bfd129cffe)
2020-08-08 19:48:25 +03:00
Rafael Ávila de Espíndola
f699d23f0b alternator: Fix use after return
Avoid a copy of timeout so that we don't end up with a reference to a
stack allocated variable.

Fixes #6897

Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200721184939.111665-1-espindola@scylladb.com>
(cherry picked from commit e83e91e352)
2020-08-03 22:36:37 +03:00
Nadav Har'El
d5e5a6fe48 alternator: fix Expected's "NULL" operator with missing AttributeValueList
The "NULL" operator in Expected (old-style conditional operations) doesn't
have any parameters, so we insisted that the AttributeValueList be empty.
However, we forgot to allow it to also be missing - a possibility which
DynamoDB allows.

This patch adds a test to reproduce this case (the test passes on DyanmoDB,
fails on Alternator before this patch, and succeeds after this patch), and
a fix.

Fixes #6816.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200709161254.618755-1-nyh@scylladb.com>
(cherry picked from commit f549d147ea)
2020-08-03 20:42:15 +03:00
Takuya ASADA
5a43c6ec81 scylla_util.py: always use relocatable CLI tools
On some CLI tools, command options may different between latest version
vs older version.
To maximize compatibility of setup scripts, we should always use
relocatable CLI tools instead of distribution version of the tool.

Related #6954

(cherry picked from commit a19a62e6f6)
2020-08-03 10:41:57 +03:00
Takuya ASADA
2aae8bb206 create-relocatable-package.py: add lsblk for relocatable CLI tools
We need latest version of lsblk that supported partition type UUID.

Fixes #6954

(cherry picked from commit 6ba2a6c42e)
2020-08-03 10:41:52 +03:00
Juliusz Stasiewicz
c206399379 aggregate_fcts: Use per-type comparators for dynamic types
For collections and UDTs the `MIN()` and `MAX()` functions are
generated on the fly. Until now they worked by comparing just the
byte representations of arguments.

This patch uses specific per-type comparators to provide semantically
sensible, dynamically created aggregates.

Fixes #6768

(cherry picked from commit 5b438e79be)
2020-08-03 10:26:15 +03:00
Calle Wilund
787b324916 cql3::lists: Fix setter_by_uuid not handing null value
Fixes #6828

When using the scylla list index from UUID extension,
null values were not handled properly causing throws
from underlying layer.

(cherry picked from commit 3b74b9585f)
2020-08-03 10:20:14 +03:00
Takuya ASADA
dfe90a69f5 scylla_post_install.sh: generate memory.conf for CentOS7
On CentOS7, systemd does not support percentage-based parameter.
To apply memory parameter on CentOS7, we need to override the parameter
in bytes, instead of percentage.

Fixes #6783

(cherry picked from commit 3a25e7285b)
2020-07-30 16:41:24 +03:00
Tomasz Grabiec
d03d6f41c2 commitlog: Fix use-after-free on mutation object during replay
The mutation object may be freed prematurely during commitlog replay
in the schema upgrading path. We will hit the problem if the memtable
is full and apply_in_memory() needs to defer.

This will typically manifest as a segfault.

Fixes #6953

Introduced in 79935df

Tests:
  - manual using scylla binary. Reproduced the problem then verified the fix makes it go away

Message-Id: <1596044010-27296-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 3486eba1ce)
2020-07-30 16:36:55 +03:00
Avi Kivity
0e86f1bf66 dist: debian: do not require root during package build
Debian package builds provide a root environment for the installation
scripts, since that's what typical installation scripts expect. To
avoid providing actual root, a "fakeroot" system is used where syscalls
are intercepted and any effect that requires root (like chown) is emulated.

However, fakeroot sporadically fails for us, aborting the package build.
Since our install scripts don't really require root (when operating in
the --packaging mode), we can just tell dpkg-buildpackage that we don't
need fakeroot. This ought to fix the sporadic failures.

As a side effect, package builds are faster.

Fixes #6655.

(cherry picked from commit b608af870b)
2020-07-29 16:03:33 +03:00
Takuya ASADA
392a007b3a scylla_setup: skip boot partition
On GCE, /dev/sda14 reported as unused disk but it's BIOS boot partition,
should not use for scylla data partition, also cannot use for it since it's
too small.

It's better to exclude such partiotion from unsed disk list.

Fixes #6636

(cherry picked from commit d7de9518fe)
2020-07-29 09:50:19 +03:00
Asias He
254b898cd8 repair: Fix race between create_writer and wait_for_writer_done
We saw scylla hit user after free in repair with the following procedure during tests:

- n1 and n2 in the cluster

- n2 ran decommission

- n2 sent data to n1 using repair

- n2 was killed forcely

- n1 tried to remove repair_meta for n1

- n1 hit use after free on repair_meta object

This was what happened on n1:

1) data was received -> do_apply_rows was called -> yield before create_writer() was called

2) repair_meta::stop() was called -> wait_for_writer_done() / do_wait_for_writer_done was called
   with _writer_done[node_idx] not engaged

3) step 1 resumed, create_writer() was called and _repair_writer object was referenced

4) repair_meta::stop() finished, repair_meta object and its member _repair_writer was destroyed

5) The fiber created by create_writer() at step 3 hit use after free on _repair_writer object

To fix, we should call wait_for_writer_done() after any pending
operations were done which were protected by repair_meta::_gate. This
prevents wait for writer done finishes before the writer is in the
process of being created.

Fixes: #6853
Fixes: #6868
Backports: 4.0, 4.1, 4.2
(cherry picked from commit e6f640441a)
2020-07-29 09:50:15 +03:00
Raphael S. Carvalho
6fb84ed7e0 sstable: index_reader: Make sure streams are all properly closed on failure
Turns out the fix f591c9c710 wasn't enough to make sure all input streams
are properly closed on failure.
It only closes the main input stream that belongs to context, but it misses
all the input streams that can be opened in the consumer for promote index
reading. Consumer stores a list of indexes, where each of them has its own
input stream. On failure, we need to make sure that every single one of
them is properly closed before destroying the indexes as that could cause
memory corruption due to read ahead.

Fixes #6924.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20200727182214.377140-1-raphaelsc@scylladb.com>
(cherry picked from commit 0d70efa58e)
2020-07-29 09:48:48 +03:00
Yaron Kaikov
9002592ee0 release: prepare for 4.1.3 2020-07-29 08:26:06 +03:00
Botond Dénes
5d6a7272e7 sstables: clamp estimated_partitions to [1, +inf) in writers
In some cases estimated number of partitions can be 0, which is albeit a
legit estimation result, breaks many low-level sstable writer code, so
some of these have assertions to ensure estimated partitions is > 0.
To avoid hitting this assert all users of the sstable writers do the
clamping, to ensure estimated partitions is at least 1. However leaving
this to the callers is error prone as #6913 has shown it. As this
clamping is standard practice, it is better to do it in the writers
themselves, avoiding this problem altogether. This is exactly what this
patch does. It also adds two unit tests, one that reproduces the crash
in #6913, and another one that ensures all sstable writers are fine with
estimated partitions being 0 now. Call sites previously doing the
clamping are changed to not do it, it is unnecessary now as the writer
does it itself.

Fixes #6913

Tests: unit(dev)
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200724120227.267184-1-bdenes@scylladb.com>
[avi: adjust sstable_datafile_test's use of compaction_descriptor and make_permit]
(cherry picked from commit fe127a2155)
2020-07-28 09:55:34 +03:00
Piotr Sarna
96625fa54b Merge 'view_update_generator: use partitioned sstable set'
from Botond.

Recently it was observed (#6603) that since 4e6400293ea, the staging
reader is reading from a lot of sstables (200+). This consumes a lot of
memory, and after this reaches a certain threshold -- the entire memory
amount of the streaming reader concurrency semaphore -- it can cause a
deadlock within the view update generation. To reduce this memory usage,
we exploit the fact that the staging sstables are usually disjoint, and
use the partitioned sstable set to create the staging reader. This
should ensure that only the minimum number of sstable readers will be
opened at any time.

Refs: #6603
Fixes: #6707

Tests: unit(dev)

* 'view-update-generator-use-partitioned-set/v1' of https://github.com/denesb/scylla:
  db/view: view_update_generator: use partitioned sstable set
  sstables: make_partitioned_sstable_set(): return an sstable_set

(cherry picked from commit e4b74356bb)
2020-07-21 15:41:46 +03:00
Raphael S. Carvalho
4f5f404619 table: Fix Staging SSTables being incorrectly added or removed from the backlog tracker
Staging SSTables can be incorrectly added or removed from the backlog tracker,
after an ALTER TABLE or TRUNCATE, because the add and removal don't take
into account if the SSTable requires view building, so a Staging SSTable can
be added to the tracker after a ALTER table, or removed after a TRUNCATE,
even though not added previously, potentially causing the backlog to
become negative.

Fixes #6798.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20200716180737.944269-1-raphaelsc@scylladb.com>
(cherry picked from commit b67066cae2)
2020-07-21 12:57:28 +03:00
Asias He
cd4502ee64 repair: Relax size check of get_row_diff and set_diff
In case a row hash conflict, a hash in set_diff will get more than one
row from get_row_diff.

For example,

Node1 (Repair master):
row1  -> hash1
row2  -> hash2
row3  -> hash3
row3' -> hash3

Node2 (Repair follower):
row1  -> hash1
row2  -> hash2

We will have set_diff = {hash3} between node1 and node2, while
get_row_diff({hash3}) will return two rows: row3 and row3'. And the
error below was observed:

   repair - Got error in row level repair: std::runtime_error
   (row_diff.size() != set_diff.size())

In this case, node1 should send both row3 and row3' to peer node
instead of fail the whole repair. Because node2 does not have row3 or
row3', otherwise node1 won't send row with hash3 to node1 in the first
place.

Refs: #6252
(cherry picked from commit a00ab8688f)
2020-07-15 14:49:09 +03:00
Hagit Segev
3e6c6d5f58 release: prepare for 4.1.2 2020-07-14 23:56:02 +03:00
Avi Kivity
564b4c32b0 Update seastar submodule
* seastar 78f626af6c...c9c1dc5fa7 (2):
  > futures: Add a test for a broken promise in a parallel_for_each
  > future: Call set_to_broken_promise earlier

Fixes #6749 (probably).
2020-07-13 20:17:54 +03:00
Dmitry Kropachev
dfafc4e1a9 dist/common/scripts/scylla-housekeeping: wrap urllib.request with try ... except
We could hit "cannot serialize '_io.BufferedReader' object" when request get 404 error from the server
	Now you will get legit error message in the case.

	Fixes #6690

(cherry picked from commit de82b3efae)
2020-07-09 18:25:16 +03:00
Dejan Mircevski
db286c5ca4 cql/restrictions: Handle WHERE a>0 AND a<0
WHERE clauses with start point above the end point were handled
incorrectly.  When the slice bounds are transformed to interval
bounds, the resulting interval is interpreted as wrap-around (because
start > end), so it contains all values above 0 and all values below
0.  This is clearly incorrect, as the user's intent was to filter out
all possible values of a.

Fix it by explicitly short-circuiting to false when start > end.  Add
a test case.

Fixes #5799.

Tests: unit (dev)

Signed-off-by: Dejan Mircevski <dejan@scylladb.com>
(cherry picked from commit 921dbd0978)
2020-07-08 13:21:00 +03:00
Botond Dénes
519fcd4729 db/view: view_update_generator: re-balance wait/signal on the register semaphore
The view update generator has a semaphore to limit concurrency. This
semaphore is waited on in `register_staging_sstable()` and later the
unit is returned after the sstable is processed in the loop inside
`start()`.
This was broken by 4e64002, which changed the loop inside `start()` to
process sstables in per table batches, however didn't change the
`signal()` call to return the amount of units according to the number of
sstables processed. This can cause the semaphore units to dry up, as the
loop can process multiple sstables per table but return just a single
unit. This can also block callers of `register_staging_sstable()`
indefinitely as some waiters will never be released as under the right
circumstances the units on the semaphore can permanently go below 0.
In addition to this, 4e64002 introduced another bug: table entries from
the `_sstables_with_tables` are never removed, so they are processed
every turn. If the sstable list is empty, there won't be any update
generated but due to the unconditional `signal()` described above, this
can cause the units on the semaphore to grow to infinity, allowing
future staging sstables producers to register a huge amount of sstables,
causing memory problems due to the amount of sstable readers that have
to be opened (#6603, #6707).
Both outcomes are equally bad. This patch fixes both issues and modifies
the `test_view_update_generator` unit test to reproduce them and hence
to verify that this doesn't happen in the future.

Fixes: #6774
Refs: #6707
Refs: #6603

Tests: unit(dev)
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200706135108.116134-1-bdenes@scylladb.com>
(cherry picked from commit 5ebe2c28d1)
2020-07-08 12:00:12 +03:00
Juliusz Stasiewicz
9bcbcbbcf2 counters: Read the state under timeout
Counter update is a RMW operation. Until now the "Read" part was
not guarded by a timeout, which is changed in this patch.

Fixes #5069

(cherry picked from commit e04fd9f774)
2020-07-07 20:45:01 +03:00
Takuya ASADA
c622e5bfab scylla_setup: don't add same disk device twice
We shouldn't accept adding same disk twice for RAID prompt.

Fixes #6711

(cherry picked from commit 835e76fdfc)
2020-07-07 13:08:22 +03:00
Nadav Har'El
905643bbc2 docker: add option to start Alternator with HTTPS
We already have a docker image option to enable alternator on an unencrypted
port, "--alternator-port", but we forgot to also allow the similar option
for enabling alternator on an encrypted (HTTPS) port: "--alternator-https-port"
so this patch adds the missing option, and documents how to use it.

Note that using this option is not enough. When this option is used,
Alternator also requires two files, /etc/scylla/scylla.crt and
/etc/scylla/scylla.key, to be inserted into the image. These files should
contain the SSL certificate, and key, respectively. If these files are
missing, you will get an error in the log about the missing file.

Fixes #6583.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200621125219.12274-1-nyh@scylladb.com>
(cherry picked from commit e4eca5211a)
2020-07-06 08:22:22 +02:00
Juliusz Stasiewicz
d396a298d6 cdc: Fix segfault when stream ID key is too short
When a token is calculated for stream_id, we check that the key is
exactly 16 bytes long. If it's not - `minimum_token` is returned
and client receives empty result.

This used to be the expected behavior for empty keys; now it's
extended to keys of any incorrect length.

Fixes #6570

(cherry picked from commit 8628ede009)
2020-07-05 15:09:44 +03:00
Asias He
1d9bbbc957 boot_strapper: Ignore node to be replaced explicitly as stream source
After commit 7d86a3b208 (storage_service:
Make replacing node take writes), during replace operation, tokens in
_token_metadata for node being replaced are updated only after the replace
operation is finished. As a result, in range_streamer::add_ranges, the
node being replaced will be considered as a source to stream data from.

Before commit 7d86a3b208, the node being
replaced will not be considered as a source node because it is already
replaced by the replacing node before the replace operation is finished.
This is the reason why it works in the past.

To fix, filter out the node being replaced as a source node explicitly.

Tests: replace_first_boot_test and replace_stopped_node_test
Backports: 4.1
Fixes: #6728
(cherry picked from commit e338028b7e22b0a80be7f80c337c52f958bfe1d7)
2020-07-01 14:35:28 +03:00
Raphael S. Carvalho
4f1878803e compaction: Fix the 2x disk space requirement in SSTable upgrade
SSTable upgrade is requiring 2x the space of input SSTables because
we aren't releasing references of the SSTables that were already
upgraded. So if we're upgrading 1TB, it means that up to 2TB may be
required for the upgrade operation to succeed.

That can be fixed by moving all input SSTables when rewrite_sstables()
asks for the set of SSTables to be compacted, so allowing their space
to be released as soon as there is no longer any ref to them.

Spotted while auditting code.

Fixes #6682.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20200619205701.92891-1-raphaelsc@scylladb.com>
(cherry picked from commit 52180f91d4)
2020-07-01 12:36:52 +03:00
Avi Kivity
c5e2fad1c8 Merge "Fix handling of decimals with negative scales" from Rafael
"
Before this series scylla would effectively infinite loop when, for
example, casting a decimal with a negative scale to float.

Fixes #6720
"

* 'espindola/fix-decimal-issue' of https://github.com/espindola/scylla:
  big_decimal: Add a test for a corner case
  big_decimal: Correctly handle negative scales
  big_decimal: Add a as_rational member function
  big_decimal: Move constructors out of line

(cherry picked from commit 3e2eeec83a)
2020-06-29 12:05:39 +03:00
Hagit Segev
abd0fa52c0 release: prepare for 4.1.1 2020-06-25 08:06:32 +03:00
Piotr Sarna
dfa464c35b alternator: fix propagating tags
Updating tags was erroneously done locally, which means that
the schema change was not propagated to other nodes.
The new code announces new schema globally.

Fixes #6513
Branches: 4.0,4.1
Tests: unit(dev)
       dtest(alternator_tests.AlternatorTest.test_update_condition_expression_and_write_isolation)
Message-Id: <3a816c4ecc33c03af4f36e51b11f195c231e7ce1.1592935039.git.sarna@scylladb.com>

(cherry picked from commit f4e8cfe03b)
2020-06-24 13:56:09 +03:00
Avi Kivity
be29b35c4b Merge 'range_streamer: Handle table of RF 1 in get_range_fetch_map' from Asias
"
After "Make replacing node take writes" series, with repair based node
operations disabled, we saw the replace operation fail like:

```
[shard 0] init - Startup failed: std::runtime_error (unable to find
sufficient sources for streaming range (9203926935651910749, +inf) in
keyspace system_auth)
```
The reason is the system_auth keyspace has default RF of 1. It is
impossible to find a source node to stream from for the ranges owned by
the replaced node.

In the past, the replace operation with keyspace of RF 1 passes, because
the replacing node calls token_metadata.update_normal_tokens(tokens,
ip_of_replacing_node) before streaming. We saw:

```
[shard 0] range_streamer - Bootstrap : keyspace system_auth range
(-9021954492552185543, -9016289150131785593] exists on {127.0.0.6}
```

Node 127.0.0.6 is the replacing node 127.0.0.5. The source node check in
range_streamer::get_range_fetch_map will pass if the source is the node
itself. However, it will not stream from the node itself. As a result,
the system_auth keyspace will not get any data.

After the "Make replacing node take writes" series, the replacing node
calls token_metadata.update_normal_tokens(tokens, ip_of_replacing_node)
after the streaming finishes. We saw:

```
[shard 0] range_streamer - Bootstrap : keyspace system_auth range
(-9049647518073030406, -9048297455405660225] exists on {127.0.0.5}
```

Since 127.0.0.5 was dead, the source node check failed, so the bootstrap
operation.

Ta fix, we ignore the table of RF 1 when it is unable to find a source
node to stream.

Fixes #6351
"

* asias-fix_bootstrap_with_rf_one_in_range_streamer:
  range_streamer: Handle table of RF 1 in get_range_fetch_map
  streaming: Use separate streaming reason for replace operation

(cherry picked from commit 9afd599d7c)
2020-06-23 13:53:03 +03:00
Asias He
97b7024c0c streaming: Do not send end of stream in case of error
Current sender sends stream_mutation_fragments_cmd::end_of_stream to
receiver when an error is received from a peer node. To be safe, send
stream_mutation_fragments_cmd::error instead of
stream_mutation_fragments_cmd::end_of_stream to prevent end_of_stream to
be written into the sstable when a partition is not closed yet.

In addition, use mutation_fragment_stream_validator to valid the
mutation fragments emitted from the reader, e.g., check if
partition_start and partition_end are paired when the reader is done. If
not, fail the stream session and send
stream_mutation_fragments_cmd::error instead of
stream_mutation_fragments_cmd::end_of_stream to isolate the problematic
sstables on the sender node.

Refs: #6478
(cherry picked from commit a521c429e1)
2020-06-23 12:47:35 +03:00
Alejo Sanchez
194ff1d226 lwt: validate before constructing metadata
LWT batches conditions can't span multiple tables.
This was detected in batch_statement::validate() called in ::prepare().
But ::cas_result_set_metadata() was built in the constructor,
causing a bitset assert/crash in a reported scenario.
This patch moves validate() to the constructor before building metadata.

Closes #6332

Tested with https://github.com/scylladb/scylla-dtest/pull/1465

[avi: adjust spelling of exception message to 4.1 spelling]

Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
(cherry picked from commit d1521e6721)
2020-06-21 18:20:41 +03:00
Gleb Natapov
b8f7fb35e1 cql transport: do not log broken pipe error when a client closes its side of a connection abruptly
Fixes #5661

Message-Id: <20200615075958.GL335449@scylladb.com>
(cherry picked from commit 7ca937778d)
2020-06-21 13:08:58 +03:00
Amnon Heiman
f7d53ff607 api/storage_service.cc: stream result of token_range
The get token range API can become big which can cause large allocation
and stalls.

This patch replace the implementation so it would stream the results
using the http stream capabilities instead of serialization and sending
one big buffer.

Fixes #6297

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
(cherry picked from commit 7c4562d532)
2020-06-21 12:57:15 +03:00
Rafael Ávila de Espíndola
eb190643f8 configure: Reduce the dynamic linker path size
gdb has a SO_NAME_MAX_PATH_SIZE of 512, so we use that as the path
size.

Fixes: #6494

Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200528202741.398695-2-espindola@scylladb.com>
(cherry picked from commit aa778ec152)
2020-06-21 12:26:51 +03:00
Piotr Sarna
3f8345f1b8 alternator: fix the return type of PutItem
Even if there are no attributes to return from PutItem requests,
we should return a valid JSON object, not an empty string.

Fixes #6568
Tests: unit(dev)

(cherry picked from commit 8fc3ca855e)
2020-06-21 12:21:19 +03:00
Piotr Sarna
891a3fa243 alternator: fix returning UnprocessedKeys unconditionally
Client libraries (e.g. PynamoDB) expect the UnprocessedKeys
and UnprocessedItems attributes to appear in the response
unconditionally - it's hereby added, along with a simple test case.

Fixes #6569
Tests: unit(dev)

(cherry picked from commit 3aff52f56e)
2020-06-21 12:19:18 +03:00
Tomasz Grabiec
db31542805 row_cache: Fix undefined behavior on key linearization
This is relevant only when using partition or clustering keys which
have a representation in memory which is larger than 12.8 KB (10% of
LSA segment size).

There are several places in code (cache, background garbage
collection) which may need to linearize keys because of performing key
comparison, but it's not done safely:

 1) the code does not run with the LSA region locked, so pointers may
get invalidated on linearization if it needs to reclaim memory. This
is fixed by running the code inside an allocating section.

 2) LSA region is locked, but the scope of
with_linearized_managed_bytes() encloses the allocating section. If
allocating section needs to reclaim, linearization context will
contain invalidated pointers. The fix is to reorder the scopes so
that linearization context lives within an allocating section.

Example of 1 can be found in
range_populating_reader::handle_end_of_stream() where it performs a
lookup:

  auto prev = std::prev(it);
  if (prev->key().equal(*_cache._schema, *_last_key->_key)) {
     it->set_continuous(true);

but handle_end_of_stream() is not invoked under allocating section.

Example of 2 can be found in mutation_cleaner_impl::merge_some() where
it does:

  return with_linearized_managed_bytes([&] {
  ...
    return _worker_state->alloc_section(region, [&] {

Fixes #6637.
Refs #6108.

Tests:

  - unit (all)

Message-Id: <1592218544-9435-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit e81fc1f095)
2020-06-21 11:56:31 +03:00
Yaron Kaikov
b443b2574a release: prepare for 4.1.0 2020-06-18 14:42:57 +03:00
Asias He
2ee321d88e gossip: Do not send shutdown message when a node is in unknown status
When a replacing node is in early boot up and is not in HIBERNATE sate
yet, if the node is killed by a user, the node will wrongly send a
shutdown message to other nodes. This is because UNKNOWN is not in
SILENT_SHUTDOWN_STATES, so in gossiper::do_stop_gossiping, the node will
send shutdown message. Other nodes in the cluster will call
storage_service::handle_state_normal for this node, since NORMAL and
SHUTDOWN status share the same status handler. As a result, other nodes
will incorrectly think the node is part of the cluster and the replace
operation is finished.

Such problem was seen in replace_node_no_hibernate_state_test dtest:

   n1, n2 are in the cluster
   n2 is dead
   n3 is started to replace n2, but n3 is killed in the middle
   n3 announces SHUTDOWN status wrongly
   n1 runs storage_service::handle_state_normal for n3
   n1 get tokens for n3 which is empty, because n3 hasn't gossip tokens yet
   n1 skips update normal tokens for n3,  but think n3 has replaced n2
   n4 starts to replace n2
   n4 checks the tokens for n2 in storage_service::join_token_ring (Cannot
      replace token {} which does not exist!) or
      storage_service::prepare_replacement_info (Cannot replace_address {}
      because it doesn't exist in gossip)

To fix, we add UNKNOWN into SILENT_SHUTDOWN_STATES and avoid sending
shutdown message.

Tests: replace_address_test.py:TestReplaceAddress.replace_node_no_hibernate_state_test
Fixes: #6436
(cherry picked from commit dddde33512)
2020-06-16 15:03:48 +03:00
Avi Kivity
4563f4b992 tools: toolchain: regenerate for gnutls 3.6.14
CVE-2020-13777.

Fixes #6627.

Toolchain source image registry disambiguated due to tighter podman defaults.
2020-06-15 07:49:21 +03:00
Kamil Braun
81dc8eeec7 cdc: rename CDC description tables
Commit 968177da04 has changed the schema
of cdc_topology_description and cdc_description tables in the
system_distributed keyspace.

Unfortunately this was a backwards-incompatible change: these tables
would always be created, irrespective of whether or not "experimental"
was enabled. They just wouldn't be populated with experimental=off.

If the user now tries to upgrade Scylla from a version before this change
to a version after this change, it will work as long as CDC is protected
b the experimental flag and the flag is off.

However, if we drop the flag, or if the user turns experimental on,
weird things will happen, such as nodes refusing to start because they
try to populate cdc_topology_description while assuming a different schema
for this table.

The simplest fix for this problem is to rename the tables. This fix must
get merged in before CDC goes out of experimental.
If the user upgrades his cluster from a pre-rename version, he will simply
have two garbage tables that he is free to delete after upgrading.

sstables and digests need to be regenerated for schema_digest_test since
this commit effectively adds new tables to the system_distributed keyspace.
This doesn't result in schema disagreement because the table is
announced to all nodes through the migration manager.

(cherry picked from commit d89b7a0548)
Fixes #6537.
2020-06-14 09:15:36 +03:00
Raphael S. Carvalho
2d72f7d8e5 compaction: Disable garbage collected writer if interposer consumer is used
GC writer, used for incremental compaction, cannot be currently used if interposer
consumer is used. That's because compaction assumes that GC writer will be operated
only by a single compaction writer at a given point in time.
With interposer consumer, multiple writers will concurrently operate on the same
GC writer, leading to race condition which potentially result in use-after-free.

Let's disable GC writer if interposer consumer is enabled. We're not losing anything
because GC writer is currently only needed on strategies which don't implement an
interposer consumer. Resharding will always disable GC writer, which is the expected
behavior because it doesn't support incremental compaction yet.
The proper fix, which allows GC writer and interposer consumer to work together,
will require more time to implement and test, and for that reason, I am postponing
it as #6472 is a showstopper for the current release.

Fixes #6472.

tests: mode(dev).

[Raphael: Fixed compilation failure in unit test test_bug_6472 for backport]

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Reviewed-by: Glauber Costa <glauber@scylladb.com>
(cherry picked from commit 097a5e9e07)
Message-Id: <20200610203928.86717-1-raphaelsc@scylladb.com>
2020-06-11 13:21:56 +03:00
Takuya ASADA
c6ee86b512 aws: update enhanced networking supported instance list
Sync enhanced networking supported instance list to latest one.

Reference: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/enhanced-networking.html

Fixes #6540

(cherry picked from commit 969c4258cf)
2020-06-09 16:02:09 +03:00
Hagit Segev
67348cd6e8 release: prepare for 4.1.rc2 2020-06-08 16:37:36 +03:00
Israel Fruchter
44cc4843f1 fix "scylla_coredump_setup: Remove the coredump create by the check"
In 28c3d4 `out()` was used without `shell=True` and was the spliting of arguments
failed cause of the complex commands in the cmd (pipe and such)

Fixes #6159

(cherry picked from commit a2bb48f44b)
2020-06-04 20:54:51 +03:00
Israel Fruchter
f1f5586bf6 scylla_coredump_setup: Remove the coredump create by the check
We generate a coredump as part of "scylla_coredump_setup" to verify that
coredumps are working. However, we need to *remove* that test coredump
to avoid people and test infrastructure reporting those coredumps.

Fixes #6159

(cherry picked from commit 28c3d4f8e8)
2020-06-03 16:52:51 +03:00
Amos Kong
3a447cd755 active the coredump directory mount during coredump setup
Currently we use a systemd mount (var-lib-systemd-coredump.mount) to mount
default coredump directory (/var/lib/systemd/coredump) to
(/var/lib/scylla/coredump). The /var/lib/scylla had been mounted to a big
storage, so we will have enough space for coredump after the mount.

Currently in coredump_setup, we only enabled var-lib-systemd-coredump.mount,
but not start it. The directory won't be mounted after coredump_setup, so the
coredump will still be saved to default coredump directory.
The mount will only effect after reboot.

Fixes #6566

(cherry picked from commit abf246f6e5)
2020-06-03 09:25:59 +03:00
Pekka Enberg
176aa91be5 Revert "scylla_coredump_setup: Fix incorrect coredump directory mount"
This reverts commit e77dad3adf because its
incorrect.

Amos explains:

"Quote from https://www.freedesktop.org/software/systemd/man/systemd.mount.html

 What=

   Takes an absolute path of a device node, file or other resource to
   mount. See mount(8) for details. If this refers to a device node, a
   dependency on the respective device unit is automatically created.

 Where=

   Takes an absolute path of a file or directory for the mount point; in
   particular, the destination cannot be a symbolic link. If the mount
   point does not exist at the time of mounting, it is created as
   directory.

 So the mount point is '/var/lib/systemd/coredump' and
 '/var/lib/scylla/coredump' is the file to mount, because /var/lib/scylla
 had mounted a second big storage, which has enough space for Huge
 coredumps.

 Bentsi or other touched problem with old scylla-master AMI, a coredump
 occurred but not successfully saved to disk for enospc.  The directory
 /var/lib/systemd/coredump wasn't mounted to /var/lib/scylla/coredump.
 They WRONGLY thought the wrong mount was caused by the config problem,
 so he posted a fix.

 Actually scylla-ami-setup / coredump wasn't executed on that AMI, err:
 unit scylla-ami-setup.service not found Because
 'scylla-ami-setup.service' config file doesn't exist or is invalid.

 Details of my testing: https://github.com/scylladb/scylla/issues/6300#issuecomment-637324507

 So we need to revert Bentsi's patch, it changed the right config to wrong."

(cherry picked from commit 9d9d54c804)
2020-06-03 09:25:49 +03:00
Avi Kivity
4a3eff17ff Revert "Revert "config: Do not enable repair based node operations by default""
This reverts commit 71d0d58f8c. Repair-based
node operations are still not ready.
2020-06-02 18:08:03 +03:00
Nadav Har'El
2e00f6d0a1 alternator: fix support for bytes type in Query's KeyConditions
Our parsing of values in a KeyConditions paramter of Query was done naively.
As a result, we got bizarre error messages "condition not met: false" when
these values had incorrect type (this is issue #6490). Worse - the naive
conversion did not decode base64-encoded bytes value as needed, so
KeyConditions on bytes-typed keys did not work at all.

This patch fixes these bugs by using our existing utility function
get_key_from_typed_value(), which takes care of throwing sensible errors
when types don't match, and decoding base64 as needed.

Unfortunately, we didn't have test coverage for many of the KeyConditions
features including bytes keys, which is why this issue escaped detection.
A patch will follow with much more comprehensive tests for KeyConditions,
which also reproduce this issue and verify that it is fixed.

Refs #6490
Fixes #6495

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200524141800.104950-1-nyh@scylladb.com>
(cherry picked from commit 6b38126a8f)
2020-05-31 13:53:45 +03:00
Nadav Har'El
bf509c3b16 alternator: add mandatory configurable write isolation mode
Alternator supports four ways in which write operations can use quorum
writes or LWT or both, which we called "write isolation policies".

Until this patch, Alternator defaulted to the most generally safe policy,
"always_use_lwt". This default could have been overriden for each table
separately, but there was no way to change this default for all tables.
This patch adds a "--alternator-write-isolation" configuration option which
allows changing the default.

Moreover, @dorlaor asked that users must *explicitly* choose this default
mode, and not get "always_use_lwt" without noticing. The previous default,
"always_use_lwt" supports any workload correctly but because it uses LWT
for all writes it may be disappointingly slow for users who run write-only
workloads (including most benchmarks) - such users might find the slow
writes so disappointing that they will drop Scylla. Conversely, a default
of "forbid_rmw" will be faster and still correct, but will fail on workloads
which need read-modify-write operations - and suprise users that need these
operations. So Dor asked that that *none* of the write modes be made the
default, and users must make an informed choice between the different write
modes, rather than being disappointed by a default choice they weren't
aware of.

So after this patch, Scylla refuses to boot if Alternator is enabled but
a "--alternator-write-isolation" option is missing.

The patch also modifies the relevant documentation, adds the same option to
our docker image, and the modifies the test-running script
test/alternator/run to run Scylla with the old default mode (always_use_lwt),
which we need because we want to test RMW operations as well.

Fixes #6452

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200524160338.108417-1-nyh@scylladb.com>
(cherry picked from commit c3da9f2bd4)
2020-05-31 13:42:11 +03:00
Avi Kivity
84ef30752f Update seastar submodule
* seastar e708d1df3a...78f626af6c (1):
  > reactor: don't mlock all memory at once

Fixes #6460.
2020-05-31 13:34:42 +03:00
Avi Kivity
f1b71ec216 Point seastar submodule at scylla-seastar.git
This allows us to backport seastar patches to the 4.1 branch.
2020-05-31 13:34:42 +03:00
Piotr Sarna
93ed536fba alternator: wait for schema agreement after table creation
In order to be sure that all nodes acknowledged that a table was
created, the CreateTable request will now only return after
seeing that schema agreement was reached.
Rationale: alternator users check if the table was created by issuing
a DescribeTable request, and assume that the table was correctly
created if it returns nonempty results. However, our current
implementation of DescribeTable returns local results, which is
not enough to judge if all the other nodes acknowledge the new table.
CQL drivers are reported to always wait for schema agreement after
issuing DDL-changing requests, so there should be no harm in waiting
a little longer for alternator's CreateTable as well.

Fixes #6361
Tests: alternator(local)

(cherry picked from commit 5f2eadce09)
2020-05-31 13:18:11 +03:00
Nadav Har'El
ab3da4510c docs, alternator: improve description of status of global tables support
The existing text did not explain what happens if additional DCs are added
to the cluster, so this patch improves the explanation of the status of
our support for global tables, including that issue.

Fixes #6353

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200513175908.21642-1-nyh@scylladb.com>
(cherry picked from commit f3fd976120)
2020-05-31 13:13:13 +03:00
Asias He
bb8fcbff68 repair: Abort the queue in write_end_of_stream in case of error
In write_end_of_stream, it does:

1) Write write_partition_end
2) Write empty mutation_fragment_opt

If 1) fails, 2) will be skipped, the consumer of the queue will wait for
the empty mutation_fragment_opt forever.

Found this issue when injecting random exceptions between 1) and 2).

Refs #6272
Refs #6248

(cherry picked from commit b744dba75a)
2020-05-27 20:11:30 +03:00
Hagit Segev
af43d0c62d release: prepare for 4.1.rc1 2020-05-26 18:57:30 +03:00
Amnon Heiman
8c8c266f67 storage_service: get_range_to_address_map prevent use after free
The implementation of get_range_to_address_map has a default behaviour,
when getting an empty keypsace, it uses the first non-system keyspace
(first here is basically, just a keyspace).

The current implementation has two issues, first, it uses a reference to
a string that is held on a stack of another function. In other word,
there's a use after free that is not clear why we never hit.

The second, it calls get_non_system_keyspaces twice. Though this is not
a bug, it's redundant (get_non_system_keyspaces uses a loop, so calling
that function does have a cost).

This patch solves both issues, by chaning the implementation to hold a
string instead of a reference to a string.

Second, it stores the results from get_non_system_keyspaces and reuse
them it's more efficient and holds the returned values on the local
stack.

Fixes #6465

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
(cherry picked from commit 69a46d4179)
2020-05-25 12:48:11 +03:00
Nadav Har'El
6d1301d93c alternator: better error messages when 'forbid_rmw' mode is on
When the 'forbid_rmw' write isolation policy is selected, read-modify-write
are intentionally forbidden. The error message in this case used to say:

	"Read-modify-write operations not supported"

Which can lead users to believe that this operation isn't supported by this
version of Alternator - instead of realizing that this is in fact a
configurable choice.

So in this patch we just change the error message to say:

	"Read-modify-write operations are disabled by 'forbid_rmw' write isolation policy. Refer to https://github.com/scylladb/scylla/blob/master/docs/alternator/alternator.md#write-isolation-policies for more information."

Fixes #6421.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200518125538.8347-1-nyh@scylladb.com>
(cherry picked from commit 5ef9854e86)
2020-05-25 08:49:48 +03:00
Tomasz Grabiec
be545d6d5d sstables: index_reader: Fix overflow when calculating promoted index end
When index file is larger than 4GB, offset calculation will overflow
uint32_t and _promoted_index_end will be too small.

As a result, promoted_index_size calculation will underflow and the
rest of the page will be interpretd as a promoted index.

The partitions which are in the remainder of the index page will not
be found by single-partition queries.

Data is not lost.

Introduced in 6c5f8e0eda.

Fixes #6040
Message-Id: <20200521174822.8350-1-tgrabiec@scylladb.com>

(cherry picked from commit a6c87a7b9e)
2020-05-24 09:45:42 +03:00
Rafael Ávila de Espíndola
a1c15f0690 repair: Make sure sinks are always closed
In a recent next failure I got the following backtrace

    function=function@entry=0x270360 "seastar::rpc::sink_impl<Serializer, Out>::~sink_impl() [with Serializer = netw::serializer; Out = {repair_row_on_wire_with_cmd}]") at assert.c:101
    at ./seastar/include/seastar/core/shared_ptr.hh:463
    at repair/row_level.cc:2059

This patch changes a few functions to use finally to make sure the sink
is always closed.

Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200515202803.60020-1-espindola@scylladb.com>
(cherry picked from commit 311fbe2f0a)

Ref #6414
2020-05-20 09:00:10 +03:00
Asias He
4d68c53389 repair: Fix race between write_end_of_stream and apply_rows
Consider: n1, n2, n1 is the repair master, n2 is the repair follower.

=== Case 1 ===
1) n1 sends missing rows {r1, r2} to n2
2) n2 runs apply_rows_on_follower to apply rows, e.g., {r1, r2}, r1
   is written to sstable, r2 is not written yet, r1 belongs to
   partition 1, r2 belongs to partition 2. It yields after row r1 is
   written.
   data: partition_start, r1
3) n1 sends repair_row_level_stop to n2 because error has happened on n1
4) n2 calls wait_for_writer_done() which in turn calls write_end_of_stream()
   data: partition_start, r1, partition_end
5) Step 2 resumes to apply the rows.
   data: partition_start, r1, partition_end, partition_end, partition_start, r2

=== Case 2 ===
1) n1 sends missing rows {r1, r2} to n2
2) n2 runs apply_rows_on_follower to apply rows, e.g., {r1, r2}, r1
   is written to sstable, r2 is not written yet, r1 belongs to partition
   1, r2 belongs to partition 2. It yields after partition_start for r2
   is written but before _partition_opened is set to true.
   data: partition_start, r1, partition_end, partition_start
3) n1 sends repair_row_level_stop to n2 because error has happened on n1
4) n2 calls wait_for_writer_done() which in turn calls write_end_of_stream().
   Since _partition_opened[node_idx] is false, partition_end is skipped,
   end_of_stream is written.
   data: partition_start, r1, partition_end, partition_start, end_of_stream

This causes unbalanced partition_start and partition_end in the stream
written to sstables.

To fix, serialize the write_end_of_stream and apply_rows with a semaphore.

Fixes: #6394
Fixes: #6296
Fixes: #6414
(cherry picked from commit b2c4d9fdbc)
2020-05-20 08:07:53 +03:00
Piotr Dulikowski
7d1f352be2 hinted handoff: don't keep positions of old hints in rps_set
When sending hints from one file, rps_set field in send_one_file_ctx
keeps track of commitlog positions of hints that are being currently
sent, or have failed to be sent. At the end of the operation, if sending
of some hints failed, we will choose position of the earliest hint that
failed to be sent, and will retry sending that file later, starting from
that position. This position is stored in _last_not_complete_rp.

Usually, this set has a bounded size, because we impose a limit of at
most 128 hints being sent concurrently. Because we do not attempt to
send any more hints after a failure is detected, rps_set should not have
more than 128 elements at a time.

Due to a bug, commitlog positions of old hints (older than
gc_grace_seconds of the destination table) were inserted into rps_set
but not removed after checking their age. This could cause rps_set to
grow very large when replaying a file with old hints.

Moreover, if the file mixed expired and non-expired hints (which could
happen if it had hints to two tables with different gc_grace_seconds),
and sending of some non-expired hints failed, then positions of expired
hints could influence calculation _last_not_complete_rp, and more hints
than necessary would be resent on the next retry.

This simple patch removes commitlog position of a hint from rps_set when
it is detected to be too old.

Fixes #6422

(cherry picked from commit 85d5c3d5ee)
2020-05-20 08:05:51 +03:00
Piotr Dulikowski
0fe5335447 hinted handoff: remove discarded hint positions from rps_set
Related commit: 85d5c3d

When attempting to send a hint, an exception might occur that results in
that hint being discarded (e.g. keyspace or table of the hint was
removed).

When such an exception is thrown, position of the hint will already be
stored in rps_set. We are only allowed to retain positions of hints that
failed to be sent and needed to be retried later. Dropping a hint is not
an error, therefore its position should be removed from rps_set - but
current logic does not do that.

Because of that bug, hint files with many discardable hints might cause
rps_set to grow large when the file is replayed. Furthermore, leaving
positions of such hints in rps_set might cause more hints than necessary
to be re-sent if some non-discarded hints fail to be sent.

This commit fixes the problem by removing positions of discarded hints
from rps_set.

Fixes #6433

(cherry picked from commit 0c5ac0da98)
2020-05-20 08:03:20 +03:00
Avi Kivity
8a026b8b14 Revert "compaction_manager: allow early aborts through abort sources."
This reverts commit e8213fb5c3. It results
in an assertion failure in remove_index_file_test.

Fixes #6413.

(cherry picked from commit 5b971397aa)
2020-05-13 18:26:34 +03:00
Yaron Kaikov
0760107b9f release: prepare for 4.1.rc0 2020-05-11 11:32:01 +03:00
1446 changed files with 5556 additions and 1615 deletions

5
.gitmodules vendored
View File

@@ -1,6 +1,6 @@
[submodule "seastar"]
path = seastar
url = ../seastar
url = ../scylla-seastar
ignore = dirty
[submodule "swagger-ui"]
path = swagger-ui
@@ -12,3 +12,6 @@
[submodule "zstd"]
path = zstd
url = ../zstd
[submodule "abseil"]
path = abseil
url = ../abseil-cpp

View File

@@ -1,7 +1,7 @@
#!/bin/sh
PRODUCT=scylla
VERSION=666.development
VERSION=4.1.11
if test -f version
then

1
abseil Submodule

Submodule abseil added at 2069dc796a

View File

@@ -129,7 +129,7 @@ future<std::string> get_key_from_roles(cql3::query_processor& qp, std::string us
auth::meta::roles_table::qualified_name(), auth::meta::roles_table::role_col_name);
auto cl = auth::password_authenticator::consistency_for_user(username);
auto timeout = auth::internal_distributed_timeout_config();
auto& timeout = auth::internal_distributed_timeout_config();
return qp.execute_internal(query, cl, timeout, {sstring(username)}, true).then_wrapped([username = std::move(username)] (future<::shared_ptr<cql3::untyped_result_set>> f) {
auto res = f.get0();
auto salted_hash = std::optional<sstring>();

View File

@@ -141,6 +141,11 @@ struct nonempty : public size_check {
// Check that array has the expected number of elements
static void verify_operand_count(const rjson::value* array, const size_check& expected, const rjson::value& op) {
if (!array && expected(0)) {
// If expected() allows an empty AttributeValueList, it is also fine
// that it is missing.
return;
}
if (!array || !array->IsArray()) {
throw api_error("ValidationException", "With ComparisonOperator, AttributeValueList must be given and an array");
}
@@ -365,31 +370,35 @@ bool check_compare(const rjson::value* v1, const rjson::value& v2, const Compara
struct cmp_lt {
template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs < rhs; }
// We cannot use the normal comparison operators like "<" on the bytes
// type, because they treat individual bytes as signed but we need to
// compare them as *unsigned*. So we need a specialization for bytes.
bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) < 0; }
static constexpr const char* diagnostic = "LT operator";
};
struct cmp_le {
// bytes only has <, so we cannot use <=.
template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs < rhs || lhs == rhs; }
template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs <= rhs; }
bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) <= 0; }
static constexpr const char* diagnostic = "LE operator";
};
struct cmp_ge {
// bytes only has <, so we cannot use >=.
template <typename T> bool operator()(const T& lhs, const T& rhs) const { return rhs < lhs || lhs == rhs; }
template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs >= rhs; }
bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) >= 0; }
static constexpr const char* diagnostic = "GE operator";
};
struct cmp_gt {
// bytes only has <, so we cannot use >.
template <typename T> bool operator()(const T& lhs, const T& rhs) const { return rhs < lhs; }
template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs > rhs; }
bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) > 0; }
static constexpr const char* diagnostic = "GT operator";
};
// True if v is between lb and ub, inclusive. Throws if lb > ub.
template <typename T>
bool check_BETWEEN(const T& v, const T& lb, const T& ub) {
if (ub < lb) {
if (cmp_lt()(ub, lb)) {
throw api_error("ValidationException",
format("BETWEEN operator requires lower_bound <= upper_bound, but {} > {}", lb, ub));
}

View File

@@ -573,29 +573,66 @@ static bool validate_legal_tag_chars(std::string_view tag) {
return std::all_of(tag.begin(), tag.end(), &is_legal_tag_char);
}
static const std::unordered_set<std::string_view> allowed_write_isolation_values = {
"f", "forbid", "forbid_rmw",
"a", "always", "always_use_lwt",
"o", "only_rmw_uses_lwt",
"u", "unsafe", "unsafe_rmw",
};
static void validate_tags(const std::map<sstring, sstring>& tags) {
static const std::unordered_set<std::string_view> allowed_values = {
"f", "forbid", "forbid_rmw",
"a", "always", "always_use_lwt",
"o", "only_rmw_uses_lwt",
"u", "unsafe", "unsafe_rmw",
};
auto it = tags.find(rmw_operation::WRITE_ISOLATION_TAG_KEY);
if (it != tags.end()) {
std::string_view value = it->second;
elogger.warn("Allowed values count {} {}", value, allowed_values.count(value));
if (allowed_values.count(value) == 0) {
if (allowed_write_isolation_values.count(value) == 0) {
throw api_error("ValidationException",
format("Incorrect write isolation tag {}. Allowed values: {}", value, allowed_values));
format("Incorrect write isolation tag {}. Allowed values: {}", value, allowed_write_isolation_values));
}
}
}
static rmw_operation::write_isolation parse_write_isolation(std::string_view value) {
if (!value.empty()) {
switch (value[0]) {
case 'f':
return rmw_operation::write_isolation::FORBID_RMW;
case 'a':
return rmw_operation::write_isolation::LWT_ALWAYS;
case 'o':
return rmw_operation::write_isolation::LWT_RMW_ONLY;
case 'u':
return rmw_operation::write_isolation::UNSAFE_RMW;
}
}
// Shouldn't happen as validate_tags() / set_default_write_isolation()
// verify allow only a closed set of values.
return rmw_operation::default_write_isolation;
}
// This default_write_isolation is always overwritten in main.cc, which calls
// set_default_write_isolation().
rmw_operation::write_isolation rmw_operation::default_write_isolation =
rmw_operation::write_isolation::LWT_ALWAYS;
void rmw_operation::set_default_write_isolation(std::string_view value) {
if (value.empty()) {
throw std::runtime_error("When Alternator is enabled, write "
"isolation policy must be selected, using the "
"'--alternator-write-isolation' option. "
"See docs/alternator/alternator.md for instructions.");
}
if (allowed_write_isolation_values.count(value) == 0) {
throw std::runtime_error(format("Invalid --alternator-write-isolation "
"setting '{}'. Allowed values: {}.",
value, allowed_write_isolation_values));
}
default_write_isolation = parse_write_isolation(value);
}
// FIXME: Updating tags currently relies on updating schema, which may be subject
// to races during concurrent updates of the same table. Once Scylla schema updates
// are fixed, this issue will automatically get fixed as well.
enum class update_tags_action { add_tags, delete_tags };
static future<> update_tags(const rjson::value& tags, schema_ptr schema, std::map<sstring, sstring>&& tags_map, update_tags_action action) {
static future<> update_tags(service::migration_manager& mm, const rjson::value& tags, schema_ptr schema, std::map<sstring, sstring>&& tags_map, update_tags_action action) {
if (action == update_tags_action::add_tags) {
for (auto it = tags.Begin(); it != tags.End(); ++it) {
const rjson::value& key = (*it)["Key"];
@@ -622,24 +659,12 @@ static future<> update_tags(const rjson::value& tags, schema_ptr schema, std::ma
}
validate_tags(tags_map);
std::stringstream serialized_tags;
serialized_tags << '{';
for (auto& tag_entry : tags_map) {
serialized_tags << format("'{}':'{}',", tag_entry.first, tag_entry.second);
}
std::string serialized_tags_str = serialized_tags.str();
if (!tags_map.empty()) {
serialized_tags_str[serialized_tags_str.size() - 1] = '}'; // trims the last ',' delimiter
} else {
serialized_tags_str.push_back('}');
}
sstring req = format("ALTER TABLE \"{}\".\"{}\" WITH {} = {}",
schema->ks_name(), schema->cf_name(), tags_extension::NAME, serialized_tags_str);
return db::execute_cql(std::move(req)).discard_result();
schema_builder builder(schema);
builder.set_extensions(schema::extensions_map{{sstring(tags_extension::NAME), ::make_shared<tags_extension>(std::move(tags_map))}});
return mm.announce_column_family_update(builder.build(), false, std::vector<view_ptr>(), false);
}
static future<> add_tags(service::storage_proxy& proxy, schema_ptr schema, rjson::value& request_info) {
static future<> add_tags(service::migration_manager& mm, service::storage_proxy& proxy, schema_ptr schema, rjson::value& request_info) {
const rjson::value* tags = rjson::find(request_info, "Tags");
if (!tags || !tags->IsArray()) {
return make_exception_future<>(api_error("ValidationException", format("Cannot parse tags")));
@@ -649,7 +674,7 @@ static future<> add_tags(service::storage_proxy& proxy, schema_ptr schema, rjson
}
std::map<sstring, sstring> tags_map = get_tags_of_table(schema);
return update_tags(rjson::copy(*tags), schema, std::move(tags_map), update_tags_action::add_tags);
return update_tags(mm, rjson::copy(*tags), schema, std::move(tags_map), update_tags_action::add_tags);
}
future<executor::request_return_type> executor::tag_resource(client_state& client_state, service_permit permit, rjson::value request) {
@@ -661,7 +686,7 @@ future<executor::request_return_type> executor::tag_resource(client_state& clien
return api_error("AccessDeniedException", "Incorrect resource identifier");
}
schema_ptr schema = get_table_from_arn(_proxy, std::string_view(arn->GetString(), arn->GetStringLength()));
add_tags(_proxy, schema, request).get();
add_tags(_mm, _proxy, schema, request).get();
return json_string("");
});
}
@@ -682,7 +707,7 @@ future<executor::request_return_type> executor::untag_resource(client_state& cli
schema_ptr schema = get_table_from_arn(_proxy, std::string_view(arn->GetString(), arn->GetStringLength()));
std::map<sstring, sstring> tags_map = get_tags_of_table(schema);
update_tags(*tags, schema, std::move(tags_map), update_tags_action::delete_tags).get();
update_tags(_mm, *tags, schema, std::move(tags_map), update_tags_action::delete_tags).get();
return json_string("");
});
}
@@ -710,6 +735,17 @@ future<executor::request_return_type> executor::list_tags_of_resource(client_sta
return make_ready_future<executor::request_return_type>(make_jsonable(std::move(ret)));
}
static future<> wait_for_schema_agreement(db::timeout_clock::time_point deadline) {
return do_until([deadline] {
if (db::timeout_clock::now() > deadline) {
throw std::runtime_error("Unable to reach schema agreement");
}
return service::get_local_migration_manager().have_schema_agreement();
}, [] {
return seastar::sleep(500ms);
});
}
future<executor::request_return_type> executor::create_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request) {
_stats.api_operations.create_table++;
elogger.trace("Creating table {}", request);
@@ -903,9 +939,11 @@ future<executor::request_return_type> executor::create_table(client_state& clien
}).then([this, table_info = std::move(table_info), schema] () mutable {
future<> f = make_ready_future<>();
if (rjson::find(table_info, "Tags")) {
f = add_tags(_proxy, schema, table_info);
f = add_tags(_mm, _proxy, schema, table_info);
}
return f.then([table_info = std::move(table_info), schema] () mutable {
return f.then([] {
return wait_for_schema_agreement(db::timeout_clock::now() + 10s);
}).then([table_info = std::move(table_info), schema] () mutable {
rjson::value status = rjson::empty_object();
supplement_table_info(table_info, *schema);
rjson::set(status, "TableDescription", std::move(table_info));
@@ -933,15 +971,24 @@ class attribute_collector {
void add(bytes&& name, atomic_cell&& cell) {
collected.emplace(std::move(name), std::move(cell));
}
void add(const bytes& name, atomic_cell&& cell) {
collected.emplace(name, std::move(cell));
}
public:
attribute_collector() : collected(attrs_type()->get_keys_type()->as_less_comparator()) { }
void put(bytes&& name, bytes&& val, api::timestamp_type ts) {
add(std::move(name), atomic_cell::make_live(*bytes_type, ts, std::move(val), atomic_cell::collection_member::yes));
void put(bytes&& name, const bytes& val, api::timestamp_type ts) {
add(std::move(name), atomic_cell::make_live(*bytes_type, ts, val, atomic_cell::collection_member::yes));
}
void put(const bytes& name, const bytes& val, api::timestamp_type ts) {
add(name, atomic_cell::make_live(*bytes_type, ts, val, atomic_cell::collection_member::yes));
}
void del(bytes&& name, api::timestamp_type ts) {
add(std::move(name), atomic_cell::make_dead(ts, gc_clock::now()));
}
void del(const bytes& name, api::timestamp_type ts) {
add(name, atomic_cell::make_dead(ts, gc_clock::now()));
}
collection_mutation_description to_mut() {
collection_mutation_description ret;
for (auto&& e : collected) {
@@ -1021,7 +1068,7 @@ public:
put_or_delete_item(const rjson::value& item, schema_ptr schema, put_item);
// put_or_delete_item doesn't keep a reference to schema (so it can be
// moved between shards for LWT) so it needs to be given again to build():
mutation build(schema_ptr schema, api::timestamp_type ts);
mutation build(schema_ptr schema, api::timestamp_type ts) const;
const partition_key& pk() const { return _pk; }
const clustering_key& ck() const { return _ck; }
};
@@ -1050,7 +1097,7 @@ put_or_delete_item::put_or_delete_item(const rjson::value& item, schema_ptr sche
}
}
mutation put_or_delete_item::build(schema_ptr schema, api::timestamp_type ts) {
mutation put_or_delete_item::build(schema_ptr schema, api::timestamp_type ts) const {
mutation m(schema, _pk);
// If there's no clustering key, a tombstone should be created directly
// on a partition, not on a clustering row - otherwise it will look like
@@ -1072,7 +1119,7 @@ mutation put_or_delete_item::build(schema_ptr schema, api::timestamp_type ts) {
for (auto& c : *_cells) {
const column_definition* cdef = schema->get_column_definition(c.column_name);
if (!cdef) {
attrs_collector.put(std::move(c.column_name), std::move(c.value), ts);
attrs_collector.put(c.column_name, c.value, ts);
} else {
row.cells().apply(*cdef, atomic_cell::make_live(*cdef->type, ts, std::move(c.value)));
}
@@ -1195,22 +1242,9 @@ rmw_operation::write_isolation rmw_operation::get_write_isolation_for_schema(sch
const auto& tags = get_tags_of_table(schema);
auto it = tags.find(WRITE_ISOLATION_TAG_KEY);
if (it == tags.end() || it->second.empty()) {
// By default, fall back to always enforcing LWT
return write_isolation::LWT_ALWAYS;
}
switch (it->second[0]) {
case 'f':
return write_isolation::FORBID_RMW;
case 'a':
return write_isolation::LWT_ALWAYS;
case 'o':
return write_isolation::LWT_RMW_ONLY;
case 'u':
return write_isolation::UNSAFE_RMW;
default:
// In case of an incorrect tag, fall back to the safest option: LWT_ALWAYS
return write_isolation::LWT_ALWAYS;
return default_write_isolation;
}
return parse_write_isolation(it->second);
}
// shard_for_execute() checks whether execute() must be called on a specific
@@ -1241,11 +1275,6 @@ std::optional<shard_id> rmw_operation::shard_for_execute(bool needs_read_before_
// PutItem, DeleteItem). All these return nothing by default, but can
// optionally return Attributes if requested via the ReturnValues option.
static future<executor::request_return_type> rmw_operation_return(rjson::value&& attributes) {
// As an optimization, in the simple and common case that nothing is to be
// returned, quickly return an empty result:
if (attributes.IsNull()) {
return make_ready_future<executor::request_return_type>(json_string(""));
}
rjson::value ret = rjson::empty_object();
if (!attributes.IsNull()) {
rjson::set(ret, "Attributes", std::move(attributes));
@@ -1261,7 +1290,7 @@ future<executor::request_return_type> rmw_operation::execute(service::storage_pr
stats& stats) {
if (needs_read_before_write) {
if (_write_isolation == write_isolation::FORBID_RMW) {
throw api_error("ValidationException", "Read-modify-write operations not supported");
throw api_error("ValidationException", "Read-modify-write operations are disabled by 'forbid_rmw' write isolation policy. Refer to https://github.com/scylladb/scylla/blob/master/docs/alternator/alternator.md#write-isolation-policies for more information.");
}
stats.reads_before_write++;
if (_write_isolation == write_isolation::UNSAFE_RMW) {
@@ -1370,7 +1399,7 @@ public:
check_needs_read_before_write(_condition_expression) ||
_returnvalues == returnvalues::ALL_OLD;
}
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) override {
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const override {
std::unordered_set<std::string> used_attribute_values;
std::unordered_set<std::string> used_attribute_names;
if (!verify_expected(_request, previous_item) ||
@@ -1382,6 +1411,7 @@ public:
// efficient than throwing an exception.
return {};
}
_return_attributes = {};
if (_returnvalues == returnvalues::ALL_OLD && previous_item) {
// previous_item is supposed to have been created with
// describe_item(), so has the "Item" attribute:
@@ -1448,7 +1478,7 @@ public:
check_needs_read_before_write(_condition_expression) ||
_returnvalues == returnvalues::ALL_OLD;
}
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) override {
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const override {
std::unordered_set<std::string> used_attribute_values;
std::unordered_set<std::string> used_attribute_names;
if (!verify_expected(_request, previous_item) ||
@@ -1460,6 +1490,7 @@ public:
// efficient than throwing an exception.
return {};
}
_return_attributes = {};
if (_returnvalues == returnvalues::ALL_OLD && previous_item) {
rjson::value* item = rjson::find(*previous_item, "Item");
if (item) {
@@ -1543,7 +1574,7 @@ public:
virtual ~put_or_delete_item_cas_request() = default;
virtual std::optional<mutation> apply(foreign_ptr<lw_shared_ptr<query::result>> qr, const query::partition_slice& slice, api::timestamp_type ts) override {
std::optional<mutation> ret;
for (put_or_delete_item& mutation_builder : _mutation_builders) {
for (const put_or_delete_item& mutation_builder : _mutation_builders) {
// We assume all these builders have the same partition.
if (ret) {
ret->apply(mutation_builder.build(schema, ts));
@@ -2367,7 +2398,7 @@ public:
update_item_operation(service::storage_proxy& proxy, rjson::value&& request);
virtual ~update_item_operation() = default;
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) override;
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const override;
bool needs_read_before_write() const;
};
@@ -2431,7 +2462,7 @@ update_item_operation::needs_read_before_write() const {
}
std::optional<mutation>
update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) {
update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const {
std::unordered_set<std::string> used_attribute_values;
std::unordered_set<std::string> used_attribute_names;
if (!verify_expected(_request, previous_item) ||
@@ -2811,6 +2842,7 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
[] (std::vector<std::tuple<std::string, std::optional<rjson::value>>> responses) {
rjson::value response = rjson::empty_object();
rjson::set(response, "Responses", rjson::empty_object());
rjson::set(response, "UnprocessedKeys", rjson::empty_object());
for (auto& t : responses) {
if (!response["Responses"].HasMember(std::get<0>(t).c_str())) {
rjson::set_with_string_name(response["Responses"], std::get<0>(t), rjson::empty_array());
@@ -3080,7 +3112,7 @@ static dht::partition_range calculate_pk_bound(schema_ptr schema, const column_d
if (attrs.Size() != 1) {
throw api_error("ValidationException", format("Only a single attribute is allowed for a hash key restriction: {}", attrs));
}
bytes raw_value = pk_cdef.type->from_string(attrs[0][type_to_string(pk_cdef.type)].GetString());
bytes raw_value = get_key_from_typed_value(attrs[0], pk_cdef);
partition_key pk = partition_key::from_singular(*schema, pk_cdef.type->deserialize(raw_value));
auto decorated_key = dht::decorate_key(*schema, pk);
if (op != comparison_operator_type::EQ) {
@@ -3105,7 +3137,7 @@ static query::clustering_range calculate_ck_bound(schema_ptr schema, const colum
if (attrs.Size() != expected_attrs_size) {
throw api_error("ValidationException", format("{} arguments expected for a sort key restriction: {}", expected_attrs_size, attrs));
}
bytes raw_value = ck_cdef.type->from_string(attrs[0][type_to_string(ck_cdef.type)].GetString());
bytes raw_value = get_key_from_typed_value(attrs[0], ck_cdef);
clustering_key ck = clustering_key::from_single_value(*schema, raw_value);
switch (op) {
case comparison_operator_type::EQ:
@@ -3119,7 +3151,7 @@ static query::clustering_range calculate_ck_bound(schema_ptr schema, const colum
case comparison_operator_type::GT:
return query::clustering_range::make_starting_with(query::clustering_range::bound(ck, false));
case comparison_operator_type::BETWEEN: {
bytes raw_upper_limit = ck_cdef.type->from_string(attrs[1][type_to_string(ck_cdef.type)].GetString());
bytes raw_upper_limit = get_key_from_typed_value(attrs[1], ck_cdef);
clustering_key upper_limit = clustering_key::from_single_value(*schema, raw_upper_limit);
return query::clustering_range::make(query::clustering_range::bound(ck), query::clustering_range::bound(upper_limit));
}
@@ -3132,9 +3164,7 @@ static query::clustering_range calculate_ck_bound(schema_ptr schema, const colum
if (!ck_cdef.type->is_compatible_with(*utf8_type)) {
throw api_error("ValidationException", format("BEGINS_WITH operator cannot be applied to type {}", type_to_string(ck_cdef.type)));
}
std::string raw_upper_limit_str = attrs[0][type_to_string(ck_cdef.type)].GetString();
bytes raw_upper_limit = ck_cdef.type->from_string(raw_upper_limit_str);
return get_clustering_range_for_begins_with(std::move(raw_upper_limit), ck, schema, ck_cdef.type);
return get_clustering_range_for_begins_with(std::move(raw_value), ck, schema, ck_cdef.type);
}
default:
throw api_error("ValidationException", format("Unknown primary key bound passed: {}", int(op)));

View File

@@ -63,6 +63,10 @@ public:
static write_isolation get_write_isolation_for_schema(schema_ptr schema);
static write_isolation default_write_isolation;
public:
static void set_default_write_isolation(std::string_view mode);
protected:
// The full request JSON
rjson::value _request;
@@ -83,7 +87,11 @@ protected:
// When _returnvalues != NONE, apply() should store here, in JSON form,
// the values which are to be returned in the "Attributes" field.
// The default null JSON means do not return an Attributes field at all.
rjson::value _return_attributes;
// This field is marked "mutable" so that the const apply() can modify
// it (see explanation below), but note that because apply() may be
// called more than once, if apply() will sometimes set this field it
// must set it (even if just to the default empty value) every time.
mutable rjson::value _return_attributes;
public:
// The constructor of a rmw_operation subclass should parse the request
// and try to discover as many input errors as it can before really
@@ -96,7 +104,12 @@ public:
// conditional expression, apply() should return an empty optional.
// apply() may throw if it encounters input errors not discovered during
// the constructor.
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) = 0;
// apply() may be called more than once in case of contention, so it must
// not change the state saved in the object (issue #7218 was caused by
// violating this). We mark apply() "const" to let the compiler validate
// this for us. The output-only field _return_attributes is marked
// "mutable" above so that apply() can still write to it.
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const = 0;
// Convert the above apply() into the signature needed by cas_request:
virtual std::optional<mutation> apply(foreign_ptr<lw_shared_ptr<query::result>> qr, const query::partition_slice& slice, api::timestamp_type ts) override;
virtual ~rmw_operation() = default;

View File

@@ -54,26 +54,22 @@ static sstring validate_keyspace(http_context& ctx, const parameters& param) {
throw bad_param_exception("Keyspace " + param["keyspace"] + " Does not exist");
}
static std::vector<ss::token_range> describe_ring(const sstring& keyspace) {
std::vector<ss::token_range> res;
for (auto d : service::get_local_storage_service().describe_ring(keyspace)) {
ss::token_range r;
r.start_token = d._start_token;
r.end_token = d._end_token;
r.endpoints = d._endpoints;
r.rpc_endpoints = d._rpc_endpoints;
for (auto det : d._endpoint_details) {
ss::endpoint_detail ed;
ed.host = det._host;
ed.datacenter = det._datacenter;
if (det._rack != "") {
ed.rack = det._rack;
}
r.endpoint_details.push(ed);
static ss::token_range token_range_endpoints_to_json(const dht::token_range_endpoints& d) {
ss::token_range r;
r.start_token = d._start_token;
r.end_token = d._end_token;
r.endpoints = d._endpoints;
r.rpc_endpoints = d._rpc_endpoints;
for (auto det : d._endpoint_details) {
ss::endpoint_detail ed;
ed.host = det._host;
ed.datacenter = det._datacenter;
if (det._rack != "") {
ed.rack = det._rack;
}
res.push_back(r);
r.endpoint_details.push(ed);
}
return res;
return r;
}
using ks_cf_func = std::function<future<json::json_return_type>(http_context&, std::unique_ptr<request>, sstring, std::vector<sstring>)>;
@@ -192,13 +188,13 @@ void set_storage_service(http_context& ctx, routes& r) {
return make_ready_future<json::json_return_type>(res);
});
ss::describe_any_ring.set(r, [&ctx](const_req req) {
return describe_ring("");
ss::describe_any_ring.set(r, [&ctx](std::unique_ptr<request> req) {
return make_ready_future<json::json_return_type>(stream_range_as_array(service::get_local_storage_service().describe_ring(""), token_range_endpoints_to_json));
});
ss::describe_ring.set(r, [&ctx](const_req req) {
auto keyspace = validate_keyspace(ctx, req.param);
return describe_ring(keyspace);
ss::describe_ring.set(r, [&ctx](std::unique_ptr<request> req) {
auto keyspace = validate_keyspace(ctx, req->param);
return make_ready_future<json::json_return_type>(stream_range_as_array(service::get_local_storage_service().describe_ring(keyspace), token_range_endpoints_to_json));
});
ss::get_host_id_map.set(r, [&ctx](const_req req) {
@@ -273,8 +269,8 @@ void set_storage_service(http_context& ctx, routes& r) {
for (auto cf : column_families) {
column_families_vec.push_back(&db.find_column_family(keyspace, cf));
}
return parallel_for_each(column_families_vec, [&cm] (column_family* cf) {
return cm.perform_cleanup(cf);
return parallel_for_each(column_families_vec, [&cm, &db] (column_family* cf) {
return cm.perform_cleanup(db, cf);
});
}).then([]{
return make_ready_future<json::json_return_type>(0);

View File

@@ -40,7 +40,8 @@ static dht::token to_token(int64_t value) {
}
static dht::token to_token(bytes_view key) {
if (key.empty()) {
// Key should be 16 B long, of which first 8 B are used for token calculation
if (key.size() != 2*sizeof(int64_t)) {
return dht::minimum_token();
}
return to_token(stream_id::token_from_bytes(key));

View File

@@ -130,7 +130,7 @@ bool should_propose_first_generation(const gms::inet_address& me, const gms::gos
*/
future<db_clock::time_point> get_local_streams_timestamp();
/* Generate a new set of CDC streams and insert it into the distributed cdc_topology_description table.
/* Generate a new set of CDC streams and insert it into the distributed cdc_generations table.
* Returns the timestamp of this new generation.
*
* Should be called when starting the node for the first time (i.e., joining the ring).
@@ -159,9 +159,9 @@ db_clock::time_point make_new_cdc_generation(
std::optional<db_clock::time_point> get_streams_timestamp_for(const gms::inet_address& endpoint, const gms::gossiper&);
/* Inform CDC users about a generation of streams (identified by the given timestamp)
* by inserting it into the cdc_description table.
* by inserting it into the cdc_streams table.
*
* Assumes that the cdc_topology_description table contains this generation.
* Assumes that the cdc_generations table contains this generation.
*
* Returning from this function does not mean that the table update was successful: the function
* might run an asynchronous task in the background.

View File

@@ -1146,7 +1146,7 @@ public:
if (r.row().deleted_at()) {
touched_parts.set<stats::part_type::ROW_DELETE>();
cdc_op = operation::row_delete;
if (pirow) {
if (pirow && pikey) {
for (const column_definition& column: _schema->regular_columns()) {
assert(pirow->has(column.name_as_text()));
auto& cdef = *_log_schema->get_column_definition(log_data_column_name_bytes(column.name()));

View File

@@ -140,6 +140,9 @@ public:
uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate);
reader_consumer make_interposer_consumer(const mutation_source_metadata& ms_meta, reader_consumer end_consumer);
// Returns whether or not interposer consumer is used by a given strategy.
bool use_interposer_consumer() const;
};
// Creates a compaction_strategy object from one of the strategies available.

View File

@@ -381,6 +381,7 @@ scylla_tests = set([
'test/boost/view_schema_ckey_test',
'test/boost/vint_serialization_test',
'test/boost/virtual_reader_test',
'test/boost/stall_free_test',
'test/manual/ec2_snitch_test',
'test/manual/gce_snitch_test',
'test/manual/gossip',
@@ -540,6 +541,7 @@ scylla_core = (['database.cc',
'sstables/compaction_strategy.cc',
'sstables/size_tiered_compaction_strategy.cc',
'sstables/leveled_compaction_strategy.cc',
'sstables/time_window_compaction_strategy.cc',
'sstables/compaction_manager.cc',
'sstables/integrity_checked_file_impl.cc',
'sstables/prepended_input_stream.cc',
@@ -1265,9 +1267,9 @@ def query_seastar_flags(pc_file, link_static_cxx=False):
return cflags, libs
for mode in build_modes:
seastar_cflags, seastar_libs = query_seastar_flags(pc[mode], link_static_cxx=args.staticcxx)
modes[mode]['seastar_cflags'] = seastar_cflags
modes[mode]['seastar_libs'] = seastar_libs
seastar_pc_cflags, seastar_pc_libs = query_seastar_flags(pc[mode], link_static_cxx=args.staticcxx)
modes[mode]['seastar_cflags'] = seastar_pc_cflags
modes[mode]['seastar_libs'] = seastar_pc_libs
# We need to use experimental features of the zstd library (to use our own allocators for the (de)compression context),
# which are available only when the library is linked statically.
@@ -1288,6 +1290,46 @@ def configure_zstd(build_dir, mode):
os.makedirs(zstd_build_dir, exist_ok=True)
subprocess.check_call(zstd_cmd, shell=False, cwd=zstd_build_dir)
def configure_abseil(build_dir, mode):
abseil_build_dir = os.path.join(build_dir, mode, 'abseil')
abseil_cflags = seastar_cflags + ' ' + modes[mode]['cxx_ld_flags']
cmake_mode = MODE_TO_CMAKE_BUILD_TYPE[mode]
abseil_cmake_args = [
'-DCMAKE_BUILD_TYPE={}'.format(cmake_mode),
'-DCMAKE_INSTALL_PREFIX={}'.format(build_dir + '/inst'), # just to avoid a warning from absl
'-DCMAKE_C_COMPILER={}'.format(args.cc),
'-DCMAKE_CXX_COMPILER={}'.format(args.cxx),
'-DCMAKE_CXX_FLAGS_{}={}'.format(cmake_mode.upper(), abseil_cflags),
]
abseil_cmd = ['cmake', '-G', 'Ninja', os.path.relpath('abseil', abseil_build_dir)] + abseil_cmake_args
os.makedirs(abseil_build_dir, exist_ok=True)
subprocess.check_call(abseil_cmd, shell=False, cwd=abseil_build_dir)
abseil_libs = ['absl/' + lib for lib in [
'container/libabsl_hashtablez_sampler.a',
'container/libabsl_raw_hash_set.a',
'synchronization/libabsl_synchronization.a',
'synchronization/libabsl_graphcycles_internal.a',
'debugging/libabsl_stacktrace.a',
'debugging/libabsl_symbolize.a',
'debugging/libabsl_debugging_internal.a',
'debugging/libabsl_demangle_internal.a',
'time/libabsl_time.a',
'time/libabsl_time_zone.a',
'numeric/libabsl_int128.a',
'hash/libabsl_city.a',
'hash/libabsl_hash.a',
'base/libabsl_malloc_internal.a',
'base/libabsl_spinlock_wait.a',
'base/libabsl_base.a',
'base/libabsl_dynamic_annotations.a',
'base/libabsl_raw_logging_internal.a',
'base/libabsl_exponential_biased.a',
'base/libabsl_throw_delegate.a']]
args.user_cflags += " " + pkg_config('jsoncpp', '--cflags')
args.user_cflags += ' -march=' + args.target
libs = ' '.join([maybe_static(args.staticyamlcpp, '-lyaml-cpp'), '-latomic', '-llz4', '-lz', '-lsnappy', pkg_config('jsoncpp', '--libs'),
@@ -1318,6 +1360,7 @@ if any(filter(thrift_version.startswith, thrift_boost_versions)):
for pkg in pkgs:
args.user_cflags += ' ' + pkg_config(pkg, '--cflags')
libs += ' ' + pkg_config(pkg, '--libs')
args.user_cflags += '-I abseil'
user_cflags = args.user_cflags + ' -fvisibility=hidden'
user_ldflags = args.user_ldflags + ' -fvisibility=hidden'
if args.staticcxx:
@@ -1348,6 +1391,9 @@ else:
for mode in build_modes:
configure_zstd(outdir, mode)
for mode in build_modes:
configure_abseil(outdir, mode)
# configure.py may run automatically from an already-existing build.ninja.
# If the user interrupts configure.py in the middle, we need build.ninja
# to remain in a valid state. So we write our output to a temporary
@@ -1485,6 +1531,8 @@ with open(buildfile_tmp, 'w') as f:
objs.extend(['$builddir/' + mode + '/' + artifact for artifact in [
'libdeflate/libdeflate.a',
'zstd/lib/libzstd.a',
] + [
'abseil/' + x for x in abseil_libs
]])
objs.append('$builddir/' + mode + '/gen/utils/gz/crc_combine_table.o')
if binary in tests:
@@ -1638,6 +1686,12 @@ with open(buildfile_tmp, 'w') as f:
f.write(' subdir = build/{mode}/zstd\n'.format(**locals()))
f.write(' target = libzstd.a\n'.format(**locals()))
for lib in abseil_libs:
f.write('build build/{mode}/abseil/{lib}: ninja\n'.format(**locals()))
f.write(' pool = submodule_pool\n')
f.write(' subdir = build/{mode}/abseil\n'.format(**locals()))
f.write(' target = {lib}\n'.format(**locals()))
mode = 'dev' if 'dev' in modes else modes[0]
f.write('build checkheaders: phony || {}\n'.format(' '.join(['$builddir/{}/{}.o'.format(mode, hh) for hh in headers])))

View File

@@ -267,10 +267,13 @@ public:
}
};
/// The same as `impl_max_function_for' but without knowledge of `Type'.
/// The same as `impl_max_function_for' but without compile-time dependency on `Type'.
class impl_max_dynamic_function final : public aggregate_function::aggregate {
data_type _io_type;
opt_bytes _max;
public:
impl_max_dynamic_function(data_type io_type) : _io_type(std::move(io_type)) {}
virtual void reset() override {
_max = {};
}
@@ -278,12 +281,11 @@ public:
return _max.value_or(bytes{});
}
virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
if (!values[0]) {
if (values.empty() || !values[0]) {
return;
}
const auto val = *values[0];
if (!_max || *_max < val) {
_max = val;
if (!_max || _io_type->less(*_max, *values[0])) {
_max = values[0];
}
}
};
@@ -298,10 +300,13 @@ public:
};
class max_dynamic_function final : public native_aggregate_function {
data_type _io_type;
public:
max_dynamic_function(data_type io_type) : native_aggregate_function("max", io_type, { io_type }) {}
max_dynamic_function(data_type io_type)
: native_aggregate_function("max", io_type, { io_type })
, _io_type(std::move(io_type)) {}
virtual std::unique_ptr<aggregate> new_aggregate() override {
return std::make_unique<impl_max_dynamic_function>();
return std::make_unique<impl_max_dynamic_function>(_io_type);
}
};
@@ -358,10 +363,13 @@ public:
}
};
/// The same as `impl_min_function_for' but without knowledge of `Type'.
/// The same as `impl_min_function_for' but without compile-time dependency on `Type'.
class impl_min_dynamic_function final : public aggregate_function::aggregate {
data_type _io_type;
opt_bytes _min;
public:
impl_min_dynamic_function(data_type io_type) : _io_type(std::move(io_type)) {}
virtual void reset() override {
_min = {};
}
@@ -369,12 +377,11 @@ public:
return _min.value_or(bytes{});
}
virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
if (!values[0]) {
if (values.empty() || !values[0]) {
return;
}
const auto val = *values[0];
if (!_min || val < *_min) {
_min = val;
if (!_min || _io_type->less(*values[0], *_min)) {
_min = values[0];
}
}
};
@@ -389,10 +396,13 @@ public:
};
class min_dynamic_function final : public native_aggregate_function {
data_type _io_type;
public:
min_dynamic_function(data_type io_type) : native_aggregate_function("min", io_type, { io_type }) {}
min_dynamic_function(data_type io_type)
: native_aggregate_function("min", io_type, { io_type })
, _io_type(std::move(io_type)) {}
virtual std::unique_ptr<aggregate> new_aggregate() override {
return std::make_unique<impl_min_dynamic_function>();
return std::make_unique<impl_min_dynamic_function>(_io_type);
}
};

View File

@@ -88,16 +88,13 @@ static data_value castas_fctn_simple(data_value from) {
template<typename ToType>
static data_value castas_fctn_from_decimal_to_float(data_value from) {
auto val_from = value_cast<big_decimal>(from);
boost::multiprecision::cpp_int ten(10);
boost::multiprecision::cpp_rational r = val_from.unscaled_value();
r /= boost::multiprecision::pow(ten, val_from.scale());
return static_cast<ToType>(r);
return static_cast<ToType>(val_from.as_rational());
}
static utils::multiprecision_int from_decimal_to_cppint(const data_value& from) {
const auto& val_from = value_cast<big_decimal>(from);
boost::multiprecision::cpp_int ten(10);
return boost::multiprecision::cpp_int(val_from.unscaled_value() / boost::multiprecision::pow(ten, val_from.scale()));
auto r = val_from.as_rational();
return utils::multiprecision_int(numerator(r)/denominator(r));
}
template<typename ToType>

View File

@@ -357,7 +357,12 @@ lists::setter_by_uuid::execute(mutation& m, const clustering_key_prefix& prefix,
collection_mutation_description mut;
mut.cells.reserve(1);
mut.cells.emplace_back(to_bytes(*index), params.make_cell(*ltype->value_comparator(), *value, atomic_cell::collection_member::yes));
if (!value) {
mut.cells.emplace_back(to_bytes(*index), params.make_dead_cell());
} else {
mut.cells.emplace_back(to_bytes(*index), params.make_cell(*ltype->value_comparator(), *value, atomic_cell::collection_member::yes));
}
m.set_cell(prefix, column, mut.serialize(*ltype));
}

View File

@@ -417,7 +417,7 @@ std::vector<const column_definition*> statement_restrictions::get_column_defs_fo
_clustering_columns_restrictions->num_prefix_columns_that_need_not_be_filtered();
for (auto&& cdef : _clustering_columns_restrictions->get_column_defs()) {
::shared_ptr<single_column_restriction> restr;
if (single_pk_restrs) {
if (single_ck_restrs) {
auto it = single_ck_restrs->restrictions().find(cdef);
if (it != single_ck_restrs->restrictions().end()) {
restr = dynamic_pointer_cast<single_column_restriction>(it->second);
@@ -688,6 +688,11 @@ static query::range<bytes_view> to_range(const term_slice& slice, const query_op
extract_bound(statements::bound::END));
}
static bool contains_without_wraparound(
const query::range<bytes_view>& range, bytes_view value, const serialized_tri_compare& cmp) {
return !range.is_wrap_around(cmp) && range.contains(value, cmp);
}
bool single_column_restriction::slice::is_satisfied_by(const schema& schema,
const partition_key& key,
const clustering_key_prefix& ckey,
@@ -702,13 +707,13 @@ bool single_column_restriction::slice::is_satisfied_by(const schema& schema,
return false;
}
return cell_value->with_linearized([&] (bytes_view cell_value_bv) {
return to_range(_slice, options, _column_def.name_as_text()).contains(
return contains_without_wraparound(to_range(_slice, options, _column_def.name_as_text()),
cell_value_bv, _column_def.type->as_tri_comparator());
});
}
bool single_column_restriction::slice::is_satisfied_by(bytes_view data, const query_options& options) const {
return to_range(_slice, options, _column_def.name_as_text()).contains(
return contains_without_wraparound(to_range(_slice, options, _column_def.name_as_text()),
data, _column_def.type->underlying_type()->as_tri_comparator());
}

View File

@@ -207,6 +207,9 @@ void alter_table_statement::add_column(const schema& schema, const table& cf, sc
"because a collection with the same name and a different type has already been used in the past", column_name));
}
}
if (type->is_counter() && !schema.is_counter()) {
throw exceptions::configuration_exception(format("Cannot add a counter column ({}) in a non counter column family", column_name));
}
cfm.with_column(column_name.name(), type, is_static ? column_kind::static_column : column_kind::regular_column);
@@ -222,7 +225,7 @@ void alter_table_statement::add_column(const schema& schema, const table& cf, sc
schema_builder builder(view);
if (view->view_info()->include_all_columns()) {
builder.with_column(column_name.name(), type);
} else if (view->view_info()->base_non_pk_columns_in_view_pk().empty()) {
} else if (!view->view_info()->has_base_non_pk_columns_in_view_pk()) {
db::view::create_virtual_column(builder, column_name.name(), type);
}
view_updates.push_back(view_ptr(builder.build()));

View File

@@ -68,6 +68,7 @@ batch_statement::batch_statement(int bound_terms, type type_,
, _has_conditions(boost::algorithm::any_of(_statements, [] (auto&& s) { return s.statement->has_conditions(); }))
, _stats(stats)
{
validate();
if (has_conditions()) {
// A batch can be created not only by raw::batch_statement::prepare, but also by
// cql_server::connection::process_batch, which doesn't call any methods of
@@ -448,7 +449,6 @@ batch_statement::prepare(database& db, cql_stats& stats) {
prep_attrs->collect_marker_specification(bound_names);
cql3::statements::batch_statement batch_statement_(bound_names.size(), _type, std::move(statements), std::move(prep_attrs), stats);
batch_statement_.validate();
std::vector<uint16_t> partition_key_bind_indices;
if (!have_multiple_cfs && batch_statement_.get_statements().size() > 0) {

View File

@@ -113,11 +113,11 @@ make_flush_controller(const db::config& cfg, seastar::scheduling_group sg, const
inline
std::unique_ptr<compaction_manager>
make_compaction_manager(const db::config& cfg, database_config& dbcfg, abort_source& as) {
make_compaction_manager(const db::config& cfg, database_config& dbcfg) {
if (cfg.compaction_static_shares() > 0) {
return std::make_unique<compaction_manager>(dbcfg.compaction_scheduling_group, service::get_local_compaction_priority(), dbcfg.available_memory, cfg.compaction_static_shares(), as);
return std::make_unique<compaction_manager>(dbcfg.compaction_scheduling_group, service::get_local_compaction_priority(), dbcfg.available_memory, cfg.compaction_static_shares());
}
return std::make_unique<compaction_manager>(dbcfg.compaction_scheduling_group, service::get_local_compaction_priority(), dbcfg.available_memory, as);
return std::make_unique<compaction_manager>(dbcfg.compaction_scheduling_group, service::get_local_compaction_priority(), dbcfg.available_memory);
}
lw_shared_ptr<keyspace_metadata>
@@ -161,7 +161,7 @@ void keyspace::remove_user_type(const user_type ut) {
utils::UUID database::empty_version = utils::UUID_gen::get_name_UUID(bytes{});
database::database(const db::config& cfg, database_config dbcfg, service::migration_notifier& mn, gms::feature_service& feat, locator::token_metadata& tm, abort_source& as)
database::database(const db::config& cfg, database_config dbcfg, service::migration_notifier& mn, gms::feature_service& feat, locator::token_metadata& tm)
: _stats(make_lw_shared<db_stats>())
, _cl_stats(std::make_unique<cell_locker_stats>())
, _cfg(cfg)
@@ -198,7 +198,7 @@ database::database(const db::config& cfg, database_config dbcfg, service::migrat
, _mutation_query_stage()
, _apply_stage("db_apply", &database::do_apply)
, _version(empty_version)
, _compaction_manager(make_compaction_manager(_cfg, dbcfg, as))
, _compaction_manager(make_compaction_manager(_cfg, dbcfg))
, _enable_incremental_backups(cfg.incremental_backups())
, _querier_cache(_read_concurrency_sem, dbcfg.available_memory * 0.04)
, _large_data_handler(std::make_unique<db::cql_table_large_data_handler>(_cfg.compaction_large_partition_warning_threshold_mb()*1024*1024,
@@ -1324,7 +1324,7 @@ future<mutation> database::do_apply_counter_update(column_family& cf, const froz
// counter state for each modified cell...
tracing::trace(trace_state, "Reading counter values from the CF");
return counter_write_query(m_schema, cf.as_mutation_source(), m.decorated_key(), slice, trace_state)
return counter_write_query(m_schema, cf.as_mutation_source(), m.decorated_key(), slice, trace_state, timeout)
.then([this, &cf, &m, m_schema, timeout, trace_state] (auto mopt) {
// ...now, that we got existing state of all affected counter
// cells we can look for our shard in each of them, increment
@@ -1823,7 +1823,11 @@ future<> database::truncate(const keyspace& ks, column_family& cf, timestamp_fun
// TODO: indexes.
// Note: since discard_sstables was changed to only count tables owned by this shard,
// we can get zero rp back. Changed assert, and ensure we save at least low_mark.
assert(low_mark <= rp || rp == db::replay_position());
// #6995 - the assert below was broken in c2c6c71 and remained so for many years.
// We nowadays do not flush tables with sstables but autosnapshot=false. This means
// the low_mark assertion does not hold, because we maybe/probably never got around to
// creating the sstables that would create them.
assert(!should_flush || low_mark <= rp || rp == db::replay_position());
rp = std::max(low_mark, rp);
return truncate_views(cf, truncated_at, should_flush).then([&cf, truncated_at, rp] {
// save_truncation_record() may actually fail after we cached the truncation time
@@ -2001,9 +2005,10 @@ flat_mutation_reader make_multishard_streaming_reader(distributed<database>& db,
reader_concurrency_semaphore* semaphore;
};
distributed<database>& _db;
utils::UUID _table_id;
std::vector<reader_context> _contexts;
public:
explicit streaming_reader_lifecycle_policy(distributed<database>& db) : _db(db), _contexts(smp::count) {
streaming_reader_lifecycle_policy(distributed<database>& db, utils::UUID table_id) : _db(db), _table_id(table_id), _contexts(smp::count) {
}
virtual flat_mutation_reader create_reader(
schema_ptr schema,
@@ -2032,7 +2037,12 @@ flat_mutation_reader make_multishard_streaming_reader(distributed<database>& db,
});
}
virtual reader_concurrency_semaphore& semaphore() override {
return *_contexts[this_shard_id()].semaphore;
const auto shard = this_shard_id();
if (!_contexts[shard].semaphore) {
auto& cf = _db.local().find_column_family(_table_id);
_contexts[shard].semaphore = &cf.streaming_read_concurrency_semaphore();
}
return *_contexts[shard].semaphore;
}
};
auto ms = mutation_source([&db] (schema_ptr s,
@@ -2043,7 +2053,8 @@ flat_mutation_reader make_multishard_streaming_reader(distributed<database>& db,
tracing::trace_state_ptr trace_state,
streamed_mutation::forwarding,
mutation_reader::forwarding fwd_mr) {
return make_multishard_combining_reader(make_shared<streaming_reader_lifecycle_policy>(db), std::move(s), pr, ps, pc,
auto table_id = s->id();
return make_multishard_combining_reader(make_shared<streaming_reader_lifecycle_policy>(db, table_id), std::move(s), pr, ps, pc,
std::move(trace_state), fwd_mr);
});
auto&& full_slice = schema->full_slice();

View File

@@ -55,6 +55,7 @@
#include <limits>
#include <cstddef>
#include "schema_fwd.hh"
#include "db/view/view.hh"
#include "db/schema_features.hh"
#include "gms/feature.hh"
#include "timestamp.hh"
@@ -903,7 +904,7 @@ public:
lw_shared_ptr<const sstable_list> get_sstables_including_compacted_undeleted() const;
const std::vector<sstables::shared_sstable>& compacted_undeleted_sstables() const;
std::vector<sstables::shared_sstable> select_sstables(const dht::partition_range& range) const;
std::vector<sstables::shared_sstable> candidates_for_compaction() const;
std::vector<sstables::shared_sstable> non_staging_sstables() const;
std::vector<sstables::shared_sstable> sstables_need_rewrite() const;
size_t sstables_count() const;
std::vector<uint64_t> sstable_count_per_level() const;
@@ -1008,8 +1009,9 @@ public:
return *_config.sstables_manager;
}
// Reader's schema must be the same as the base schema of each of the views.
future<> populate_views(
std::vector<view_ptr>,
std::vector<db::view::view_and_base>,
dht::token base_token,
flat_mutation_reader&&);
@@ -1026,7 +1028,7 @@ private:
const io_priority_class& io_priority, query::partition_slice::option_set custom_opts) const;
std::vector<view_ptr> affected_views(const schema_ptr& base, const mutation& update) const;
future<> generate_and_propagate_view_updates(const schema_ptr& base,
std::vector<view_ptr>&& views,
std::vector<db::view::view_and_base>&& views,
mutation&& m,
flat_mutation_reader_opt existings) const;
@@ -1427,7 +1429,7 @@ public:
void set_enable_incremental_backups(bool val) { _enable_incremental_backups = val; }
future<> parse_system_tables(distributed<service::storage_proxy>&, distributed<service::migration_manager>&);
database(const db::config&, database_config dbcfg, service::migration_notifier& mn, gms::feature_service& feat, locator::token_metadata& tm, abort_source& as);
database(const db::config&, database_config dbcfg, service::migration_notifier& mn, gms::feature_service& feat, locator::token_metadata& tm);
database(database&&) = delete;
~database();

View File

@@ -290,7 +290,7 @@ future<> db::batchlog_manager::replay_all_failed_batches() {
mutation m(schema, key);
auto now = service::client_state(service::client_state::internal_tag()).get_timestamp();
m.partition().apply_delete(*schema, clustering_key_prefix::make_empty(), tombstone(now, gc_clock::now()));
return _qp.proxy().mutate_locally(m);
return _qp.proxy().mutate_locally(m, db::commitlog::force_sync::no);
});
};

View File

@@ -521,7 +521,7 @@ public:
_segment_manager->totals.total_size_on_disk -= size_on_disk();
_segment_manager->totals.total_size -= (size_on_disk() + _buffer.size_bytes());
_segment_manager->add_file_to_delete(_file_name, _desc);
} else {
} else if (_segment_manager->cfg.warn_about_segments_left_on_disk_after_shutdown) {
clogger.warn("Segment {} is dirty and is left on disk.", *this);
}
}

View File

@@ -137,6 +137,7 @@ public:
bool reuse_segments = true;
bool use_o_dsync = false;
bool warn_about_segments_left_on_disk_after_shutdown = true;
const db::extensions * extensions = nullptr;
};

View File

@@ -299,7 +299,7 @@ future<> db::commitlog_replayer::impl::process(stats* s, commitlog::buffer_and_r
mutation m(cf.schema(), fm.decorated_key(*cf.schema()));
converting_mutation_partition_applier v(cm, *cf.schema(), m.partition());
fm.partition().accept(cm, v);
return do_with(std::move(m), [&db, &cf] (mutation m) {
return do_with(std::move(m), [&db, &cf] (const mutation& m) {
return db.apply_in_memory(m, cf, db::rp_handle(), db::no_timeout);
});
} else {

View File

@@ -681,7 +681,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
, replace_address(this, "replace_address", value_status::Used, "", "The listen_address or broadcast_address of the dead node to replace. Same as -Dcassandra.replace_address.")
, replace_address_first_boot(this, "replace_address_first_boot", value_status::Used, "", "Like replace_address option, but if the node has been bootstrapped successfully it will be ignored. Same as -Dcassandra.replace_address_first_boot.")
, override_decommission(this, "override_decommission", value_status::Used, false, "Set true to force a decommissioned node to join the cluster")
, enable_repair_based_node_ops(this, "enable_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, true, "Set true to use enable repair based node operations instead of streaming based")
, enable_repair_based_node_ops(this, "enable_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, false, "Set true to use enable repair based node operations instead of streaming based")
, ring_delay_ms(this, "ring_delay_ms", value_status::Used, 30 * 1000, "Time a node waits to hear from other nodes before joining the ring in milliseconds. Same as -Dcassandra.ring_delay_ms in cassandra.")
, shadow_round_ms(this, "shadow_round_ms", value_status::Used, 300 * 1000, "The maximum gossip shadow round time. Can be used to reduce the gossip feature check time during node boot up.")
, fd_max_interval_ms(this, "fd_max_interval_ms", value_status::Used, 2 * 1000, "The maximum failure_detector interval time in milliseconds. Interval larger than the maximum will be ignored. Larger cluster may need to increase the default.")
@@ -736,6 +736,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
, alternator_https_port(this, "alternator_https_port", value_status::Used, 0, "Alternator API HTTPS port")
, alternator_address(this, "alternator_address", value_status::Used, "0.0.0.0", "Alternator API listening address")
, alternator_enforce_authorization(this, "alternator_enforce_authorization", value_status::Used, false, "Enforce checking the authorization header for every request in Alternator")
, alternator_write_isolation(this, "alternator_write_isolation", value_status::Used, "", "Default write isolation policy for Alternator")
, abort_on_ebadf(this, "abort_on_ebadf", value_status::Used, true, "Abort the server on incorrect file descriptor access. Throws exception when disabled.")
, redis_port(this, "redis_port", value_status::Used, 0, "Port on which the REDIS transport listens for clients.")
, redis_ssl_port(this, "redis_ssl_port", value_status::Used, 0, "Port on which the REDIS TLS native transport listens for clients.")

View File

@@ -314,6 +314,8 @@ public:
named_value<uint16_t> alternator_https_port;
named_value<sstring> alternator_address;
named_value<bool> alternator_enforce_authorization;
named_value<sstring> alternator_write_isolation;
named_value<bool> abort_on_ebadf;
named_value<uint16_t> redis_port;

View File

@@ -61,7 +61,8 @@ namespace db {
logging::logger cl_logger("consistency");
size_t quorum_for(const keyspace& ks) {
return (ks.get_replication_strategy().get_replication_factor() / 2) + 1;
size_t replication_factor = ks.get_replication_strategy().get_replication_factor();
return replication_factor ? (replication_factor / 2) + 1 : 0;
}
size_t local_quorum_for(const keyspace& ks, const sstring& dc) {
@@ -72,8 +73,8 @@ size_t local_quorum_for(const keyspace& ks, const sstring& dc) {
if (rs.get_type() == replication_strategy_type::network_topology) {
const network_topology_strategy* nrs =
static_cast<const network_topology_strategy*>(&rs);
return (nrs->get_replication_factor(dc) / 2) + 1;
size_t replication_factor = nrs->get_replication_factor(dc);
return replication_factor ? (replication_factor / 2) + 1 : 0;
}
return quorum_for(ks);

View File

@@ -224,7 +224,9 @@ future<> manager::end_point_hints_manager::stop(drain should_drain) noexcept {
with_lock(file_update_mutex(), [this] {
if (_hints_store_anchor) {
hints_store_ptr tmp = std::exchange(_hints_store_anchor, nullptr);
return tmp->shutdown().finally([tmp] {});
return tmp->shutdown().finally([tmp] {
return tmp->release();
}).finally([tmp] {});
}
return make_ready_future<>();
}).handle_exception([&eptr] (auto e) { eptr = std::move(e); }).get();
@@ -326,6 +328,10 @@ future<db::commitlog> manager::end_point_hints_manager::add_store() noexcept {
// HH doesn't utilize the flow that benefits from reusing segments.
// Therefore let's simply disable it to avoid any possible confusion.
cfg.reuse_segments = false;
// HH leaves segments on disk after commitlog shutdown, and later reads
// them when commitlog is re-created. This is expected to happen regularly
// during standard HH workload, so no need to print a warning about it.
cfg.warn_about_segments_left_on_disk_after_shutdown = false;
return commitlog::create_commitlog(std::move(cfg)).then([this] (commitlog l) {
// add_store() is triggered every time hint files are forcefully flushed to I/O (every hints_flush_period).
@@ -352,7 +358,9 @@ future<> manager::end_point_hints_manager::flush_current_hints() noexcept {
return futurize_invoke([this] {
return with_lock(file_update_mutex(), [this]() -> future<> {
return get_or_load().then([] (hints_store_ptr cptr) {
return cptr->shutdown();
return cptr->shutdown().finally([cptr] {
return cptr->release();
}).finally([cptr] {});
}).then([this] {
// Un-hold the commitlog object. Since we are under the exclusive _file_update_mutex lock there are no
// other hints_store_ptr copies and this would destroy the commitlog shared value.
@@ -703,6 +711,7 @@ future<> manager::end_point_hints_manager::sender::send_one_hint(lw_shared_ptr<s
// Files are aggregated for at most manager::hints_timer_period therefore the oldest hint there is
// (last_modification - manager::hints_timer_period) old.
if (gc_clock::now().time_since_epoch() - secs_since_file_mod > gc_grace_sec - manager::hints_flush_period) {
ctx_ptr->rps_set.erase(rp);
return make_ready_future<>();
}
@@ -725,6 +734,7 @@ future<> manager::end_point_hints_manager::sender::send_one_hint(lw_shared_ptr<s
manager_logger.debug("send_hints(): {} at {}: {}", fname, rp, e.what());
++this->shard_stats().discarded;
}
ctx_ptr->rps_set.erase(rp);
return make_ready_future<>();
}).finally([units = std::move(units), ctx_ptr] {});
}).handle_exception([this, ctx_ptr] (auto eptr) {
@@ -810,7 +820,7 @@ void manager::end_point_hints_manager::sender::send_hints_maybe() noexcept {
int replayed_segments_count = 0;
try {
while (replay_allowed() && have_segments()) {
while (replay_allowed() && have_segments() && can_send()) {
if (!send_one_file(*_segments_to_replay.begin())) {
break;
}

View File

@@ -822,6 +822,14 @@ future<> merge_schema(distributed<service::storage_proxy>& proxy, gms::feature_s
});
}
future<> recalculate_schema_version(distributed<service::storage_proxy>& proxy, gms::feature_service& feat) {
return merge_lock().then([&proxy, &feat] {
return update_schema_version_and_announce(proxy, feat.cluster_schema_features());
}).finally([] {
return merge_unlock();
});
}
future<> merge_schema(distributed<service::storage_proxy>& proxy, std::vector<mutation> mutations, bool do_flush)
{
return merge_lock().then([&proxy, mutations = std::move(mutations), do_flush] () mutable {
@@ -2905,10 +2913,6 @@ future<> maybe_update_legacy_secondary_index_mv_schema(service::migration_manage
// format, where "token" is not marked as computed. Once we're sure that all indexes have their
// columns marked as computed (because they were either created on a node that supports computed
// columns or were fixed by this utility function), it's safe to remove this function altogether.
if (!db.features().cluster_supports_computed_columns()) {
return make_ready_future<>();
}
if (v->clustering_key_size() == 0) {
return make_ready_future<>();
}

View File

@@ -170,6 +170,13 @@ future<> merge_schema(distributed<service::storage_proxy>& proxy, gms::feature_s
future<> merge_schema(distributed<service::storage_proxy>& proxy, std::vector<mutation> mutations, bool do_flush);
// Recalculates the local schema version and publishes it in gossip.
//
// It is safe to call concurrently with recalculate_schema_version() and merge_schema() in which case it
// is guaranteed that the schema version we end up with after all calls will reflect the most recent state
// of feature_service and schema tables.
future<> recalculate_schema_version(distributed<service::storage_proxy>& proxy, gms::feature_service& feat);
future<std::set<sstring>> merge_keyspaces(distributed<service::storage_proxy>& proxy, schema_result&& before, schema_result&& after);
std::vector<mutation> make_create_keyspace_mutations(lw_shared_ptr<keyspace_metadata> keyspace, api::timestamp_type timestamp, bool with_tables_and_types_and_functions = true);

View File

@@ -72,7 +72,7 @@ schema_ptr view_build_status() {
}
/* An internal table used by nodes to exchange CDC generation data. */
schema_ptr cdc_topology_description() {
schema_ptr cdc_generations() {
thread_local auto schema = [] {
auto id = generate_legacy_id(system_distributed_keyspace::NAME, system_distributed_keyspace::CDC_TOPOLOGY_DESCRIPTION);
return schema_builder(system_distributed_keyspace::NAME, system_distributed_keyspace::CDC_TOPOLOGY_DESCRIPTION, {id})
@@ -108,7 +108,7 @@ schema_ptr cdc_desc() {
static std::vector<schema_ptr> all_tables() {
return {
view_build_status(),
cdc_topology_description(),
cdc_generations(),
cdc_desc(),
};
}
@@ -204,7 +204,7 @@ future<> system_distributed_keyspace::remove_view(sstring ks_name, sstring view_
false).discard_result();
}
/* We want to make sure that writes/reads to/from cdc_topology_description and cdc_description
/* We want to make sure that writes/reads to/from cdc_generations and cdc_streams
* are consistent: a read following an acknowledged write to the same partition should contact
* at least one of the replicas that the write contacted.
* Normally we would achieve that by always using CL = QUORUM,

View File

@@ -48,10 +48,10 @@ public:
static constexpr auto VIEW_BUILD_STATUS = "view_build_status";
/* Nodes use this table to communicate new CDC stream generations to other nodes. */
static constexpr auto CDC_TOPOLOGY_DESCRIPTION = "cdc_topology_description";
static constexpr auto CDC_TOPOLOGY_DESCRIPTION = "cdc_generations";
/* This table is used by CDC clients to learn about avaliable CDC streams. */
static constexpr auto CDC_DESC = "cdc_description";
static constexpr auto CDC_DESC = "cdc_streams";
/* Information required to modify/query some system_distributed tables, passed from the caller. */
struct context {

View File

@@ -130,17 +130,90 @@ const column_definition* view_info::view_column(const column_definition& base_de
return _schema.get_column_definition(base_def.name());
}
const std::vector<column_id>& view_info::base_non_pk_columns_in_view_pk() const {
void view_info::set_base_info(db::view::base_info_ptr base_info) {
_base_info = std::move(base_info);
}
// A constructor for a base info that can facilitate reads and writes from the materialized view.
db::view::base_dependent_view_info::base_dependent_view_info(schema_ptr base_schema, std::vector<column_id>&& base_non_pk_columns_in_view_pk)
: _base_schema{std::move(base_schema)}
, _base_non_pk_columns_in_view_pk{std::move(base_non_pk_columns_in_view_pk)}
, has_base_non_pk_columns_in_view_pk{!_base_non_pk_columns_in_view_pk.empty()}
, use_only_for_reads{false} {
}
// A constructor for a base info that can facilitate only reads from the materialized view.
db::view::base_dependent_view_info::base_dependent_view_info(bool has_base_non_pk_columns_in_view_pk)
: _base_schema{nullptr}
, has_base_non_pk_columns_in_view_pk{has_base_non_pk_columns_in_view_pk}
, use_only_for_reads{true} {
}
const std::vector<column_id>& db::view::base_dependent_view_info::base_non_pk_columns_in_view_pk() const {
if (use_only_for_reads) {
on_internal_error(vlogger, "base_non_pk_columns_in_view_pk(): operation unsupported when initialized only for view reads.");
}
return _base_non_pk_columns_in_view_pk;
}
void view_info::initialize_base_dependent_fields(const schema& base) {
const schema_ptr& db::view::base_dependent_view_info::base_schema() const {
if (use_only_for_reads) {
on_internal_error(vlogger, "base_schema(): operation unsupported when initialized only for view reads.");
}
return _base_schema;
}
db::view::base_info_ptr view_info::make_base_dependent_view_info(const schema& base) const {
std::vector<column_id> base_non_pk_columns_in_view_pk;
bool has_base_non_pk_columns_in_view_pk = false;
bool can_only_read_from_view = false;
for (auto&& view_col : boost::range::join(_schema.partition_key_columns(), _schema.clustering_key_columns())) {
if (view_col.is_computed()) {
// we are not going to find it in the base table...
continue;
}
auto* base_col = base.get_column_definition(view_col.name());
if (base_col && !base_col->is_primary_key()) {
_base_non_pk_columns_in_view_pk.push_back(base_col->id);
base_non_pk_columns_in_view_pk.push_back(base_col->id);
has_base_non_pk_columns_in_view_pk = true;
} else if (!base_col) {
// If we didn't find the column in the base column then it must have been deleted
// or not yet added (by alter command), this means it is for sure not a pk column
// in the base table. This can happen if the version of the base schema is not the
// one that the view was created with. Seting this schema as the base can't harm since
// if we got to such a situation then it means it is only going to be used for reading
// (computation of shadowable tombstones) and in that case the existence of such a column
// is the only thing that is of interest to us.
has_base_non_pk_columns_in_view_pk = true;
can_only_read_from_view = true;
// We can break the loop here since we have the info we wanted and the list
// of columns is not going to be reliable anyhow.
break;
}
}
if (can_only_read_from_view) {
return make_lw_shared<db::view::base_dependent_view_info>(has_base_non_pk_columns_in_view_pk);
} else {
return make_lw_shared<db::view::base_dependent_view_info>(base.shared_from_this(), std::move(base_non_pk_columns_in_view_pk));
}
}
bool view_info::has_base_non_pk_columns_in_view_pk() const {
// The base info is not always available, this is because
// the base info initialization is separate from the view
// info construction. If we are trying to get this info without
// initializing the base information it means that we have a
// schema integrity problem as the creator of owning view schema
// didn't make sure to initialize it with base information.
if (!_base_info) {
on_internal_error(vlogger, "Tried to perform a view query which is base info dependant without initializing it");
}
return _base_info->has_base_non_pk_columns_in_view_pk;
}
namespace db {
@@ -188,11 +261,11 @@ bool may_be_affected_by(const schema& base, const view_info& view, const dht::de
}
static bool update_requires_read_before_write(const schema& base,
const std::vector<view_ptr>& views,
const std::vector<view_and_base>& views,
const dht::decorated_key& key,
const rows_entry& update) {
for (auto&& v : views) {
view_info& vf = *v->view_info();
view_info& vf = *v.view->view_info();
if (may_be_affected_by(base, vf, key, update)) {
return true;
}
@@ -239,12 +312,14 @@ class view_updates final {
view_ptr _view;
const view_info& _view_info;
schema_ptr _base;
base_info_ptr _base_info;
std::unordered_map<partition_key, mutation_partition, partition_key::hashing, partition_key::equality> _updates;
public:
explicit view_updates(view_ptr view, schema_ptr base)
: _view(std::move(view))
explicit view_updates(view_and_base vab)
: _view(std::move(vab.view))
, _view_info(*_view->view_info())
, _base(std::move(base))
, _base(vab.base->base_schema())
, _base_info(vab.base)
, _updates(8, partition_key::hashing(*_view), partition_key::equality(*_view)) {
}
@@ -306,7 +381,7 @@ row_marker view_updates::compute_row_marker(const clustering_row& base_row) cons
// they share liveness information. It's true especially in the only case currently allowed by CQL,
// which assumes there's up to one non-pk column in the view key. It's also true in alternator,
// which does not carry TTL information.
const auto& col_ids = _view_info.base_non_pk_columns_in_view_pk();
const auto& col_ids = _base_info->base_non_pk_columns_in_view_pk();
if (!col_ids.empty()) {
auto& def = _base->regular_column_at(col_ids[0]);
// Note: multi-cell columns can't be part of the primary key.
@@ -537,7 +612,7 @@ void view_updates::delete_old_entry(const partition_key& base_key, const cluster
void view_updates::do_delete_old_entry(const partition_key& base_key, const clustering_row& existing, const clustering_row& update, gc_clock::time_point now) {
auto& r = get_view_row(base_key, existing);
const auto& col_ids = _view_info.base_non_pk_columns_in_view_pk();
const auto& col_ids = _base_info->base_non_pk_columns_in_view_pk();
if (!col_ids.empty()) {
// We delete the old row using a shadowable row tombstone, making sure that
// the tombstone deletes everything in the row (or it might still show up).
@@ -678,7 +753,7 @@ void view_updates::generate_update(
return;
}
const auto& col_ids = _view_info.base_non_pk_columns_in_view_pk();
const auto& col_ids = _base_info->base_non_pk_columns_in_view_pk();
if (col_ids.empty()) {
// The view key is necessarily the same pre and post update.
if (existing && existing->is_live(*_base)) {
@@ -932,11 +1007,16 @@ future<stop_iteration> view_update_builder::on_results() {
future<std::vector<frozen_mutation_and_schema>> generate_view_updates(
const schema_ptr& base,
std::vector<view_ptr>&& views_to_update,
std::vector<view_and_base>&& views_to_update,
flat_mutation_reader&& updates,
flat_mutation_reader_opt&& existings) {
auto vs = boost::copy_range<std::vector<view_updates>>(views_to_update | boost::adaptors::transformed([&] (auto&& v) {
return view_updates(std::move(v), base);
auto vs = boost::copy_range<std::vector<view_updates>>(views_to_update | boost::adaptors::transformed([&] (view_and_base v) {
if (base->version() != v.base->base_schema()->version()) {
on_internal_error(vlogger, format("Schema version used for view updates ({}) does not match the current"
" base schema version of the view ({}) for view {}.{} of {}.{}",
base->version(), v.base->base_schema()->version(), v.view->ks_name(), v.view->cf_name(), base->ks_name(), base->cf_name()));
}
return view_updates(std::move(v));
}));
auto builder = std::make_unique<view_update_builder>(base, std::move(vs), std::move(updates), std::move(existings));
auto f = builder->build();
@@ -946,18 +1026,18 @@ future<std::vector<frozen_mutation_and_schema>> generate_view_updates(
query::clustering_row_ranges calculate_affected_clustering_ranges(const schema& base,
const dht::decorated_key& key,
const mutation_partition& mp,
const std::vector<view_ptr>& views) {
const std::vector<view_and_base>& views) {
std::vector<nonwrapping_range<clustering_key_prefix_view>> row_ranges;
std::vector<nonwrapping_range<clustering_key_prefix_view>> view_row_ranges;
clustering_key_prefix_view::tri_compare cmp(base);
if (mp.partition_tombstone() || !mp.row_tombstones().empty()) {
for (auto&& v : views) {
// FIXME: #2371
if (v->view_info()->select_statement().get_restrictions()->has_unrestricted_clustering_columns()) {
if (v.view->view_info()->select_statement().get_restrictions()->has_unrestricted_clustering_columns()) {
view_row_ranges.push_back(nonwrapping_range<clustering_key_prefix_view>::make_open_ended_both_sides());
break;
}
for (auto&& r : v->view_info()->partition_slice().default_row_ranges()) {
for (auto&& r : v.view->view_info()->partition_slice().default_row_ranges()) {
view_row_ranges.push_back(r.transform(std::mem_fn(&clustering_key_prefix::view)));
}
}
@@ -1720,7 +1800,7 @@ public:
return stop_iteration::yes;
}
_fragments_memory_usage += cr.memory_usage(*_step.base->schema());
_fragments_memory_usage += cr.memory_usage(*_step.reader.schema());
_fragments.push_back(std::move(cr));
if (_fragments_memory_usage > batch_memory_max) {
// Although we have not yet completed the batch of base rows that
@@ -1740,10 +1820,14 @@ public:
_builder._as.check();
if (!_fragments.empty()) {
_fragments.push_front(partition_start(_step.current_key, tombstone()));
auto base_schema = _step.base->schema();
auto views = with_base_info_snapshot(_views_to_build);
auto reader = make_flat_mutation_reader_from_fragments(_step.reader.schema(), std::move(_fragments));
reader.upgrade_schema(base_schema);
_step.base->populate_views(
_views_to_build,
std::move(views),
_step.current_token(),
make_flat_mutation_reader_from_fragments(_step.base->schema(), std::move(_fragments))).get();
std::move(reader)).get();
_fragments.clear();
_fragments_memory_usage = 0;
}
@@ -1890,5 +1974,11 @@ future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_d
});
}
std::vector<db::view::view_and_base> with_base_info_snapshot(std::vector<view_ptr> vs) {
return boost::copy_range<std::vector<db::view::view_and_base>>(vs | boost::adaptors::transformed([] (const view_ptr& v) {
return db::view::view_and_base{v, v->view_info()->base_info()};
}));
}
} // namespace view
} // namespace db

View File

@@ -43,6 +43,46 @@ namespace db {
namespace view {
// Part of the view description which depends on the base schema version.
//
// This structure may change even though the view schema doesn't change, so
// it needs to live outside view_ptr.
struct base_dependent_view_info {
private:
schema_ptr _base_schema;
// Id of a regular base table column included in the view's PK, if any.
// Scylla views only allow one such column, alternator can have up to two.
std::vector<column_id> _base_non_pk_columns_in_view_pk;
public:
const std::vector<column_id>& base_non_pk_columns_in_view_pk() const;
const schema_ptr& base_schema() const;
// Indicates if the view hase pk columns which are not part of the base
// pk, it seems that !base_non_pk_columns_in_view_pk.empty() is the same,
// but actually there are cases where we can compute this boolean without
// succeeding to reliably build the former.
const bool has_base_non_pk_columns_in_view_pk;
// If base_non_pk_columns_in_view_pk couldn't reliably be built, this base
// info can't be used for computing view updates, only for reading the materialized
// view.
const bool use_only_for_reads;
// A constructor for a base info that can facilitate reads and writes from the materialized view.
base_dependent_view_info(schema_ptr base_schema, std::vector<column_id>&& base_non_pk_columns_in_view_pk);
// A constructor for a base info that can facilitate only reads from the materialized view.
base_dependent_view_info(bool has_base_non_pk_columns_in_view_pk);
};
// Immutable snapshot of view's base-schema-dependent part.
using base_info_ptr = lw_shared_ptr<const base_dependent_view_info>;
// Snapshot of the view schema and its base-schema-dependent part.
struct view_and_base {
view_ptr view;
base_info_ptr base;
};
/**
* Whether the view filter considers the specified partition key.
*
@@ -92,7 +132,7 @@ bool clustering_prefix_matches(const schema& base, const partition_key& key, con
future<std::vector<frozen_mutation_and_schema>> generate_view_updates(
const schema_ptr& base,
std::vector<view_ptr>&& views_to_update,
std::vector<view_and_base>&& views_to_update,
flat_mutation_reader&& updates,
flat_mutation_reader_opt&& existings);
@@ -100,7 +140,7 @@ query::clustering_row_ranges calculate_affected_clustering_ranges(
const schema& base,
const dht::decorated_key& key,
const mutation_partition& mp,
const std::vector<view_ptr>& views);
const std::vector<view_and_base>& views);
struct wait_for_all_updates_tag {};
using wait_for_all_updates = bool_class<wait_for_all_updates_tag>;
@@ -128,6 +168,13 @@ future<> mutate_MV(
*/
void create_virtual_column(schema_builder& builder, const bytes& name, const data_type& type);
/**
* Converts a collection of view schema snapshots into a collection of
* view_and_base objects, which are snapshots of both the view schema
* and the base-schema-dependent part of view description.
*/
std::vector<view_and_base> with_base_info_snapshot(std::vector<view_ptr>);
}
}

View File

@@ -36,19 +36,25 @@ future<> view_update_generator::start() {
_pending_sstables.wait().get();
}
// To ensure we don't race with updates, move the entire content
// into a local variable.
auto sstables_with_tables = std::exchange(_sstables_with_tables, {});
// If we got here, we will process all tables we know about so far eventually so there
// is no starvation
for (auto& t : _sstables_with_tables | boost::adaptors::map_keys) {
for (auto table_it = sstables_with_tables.begin(); table_it != sstables_with_tables.end(); table_it = sstables_with_tables.erase(table_it)) {
auto& [t, sstables] = *table_it;
schema_ptr s = t->schema();
// Copy what we have so far so we don't miss new updates
auto sstables = std::exchange(_sstables_with_tables[t], {});
vug_logger.trace("Processing {}.{}: {} sstables", s->ks_name(), s->cf_name(), sstables.size());
const auto num_sstables = sstables.size();
try {
// temporary: need an sstable set for the flat mutation reader, but the
// compaction_descriptor takes a vector. Soon this will become a compaction
// so the transformation to the SSTable set will not be needed.
auto ssts = make_lw_shared(t->get_compaction_strategy().make_sstable_set(s));
// Exploit the fact that sstables in the staging directory
// are usually non-overlapping and use a partitioned set for
// the read.
auto ssts = make_lw_shared(sstables::make_partitioned_sstable_set(s, make_lw_shared<sstable_list>(sstable_list{}), false));
for (auto& sst : sstables) {
ssts->insert(sst);
}
@@ -81,7 +87,7 @@ future<> view_update_generator::start() {
// Move from staging will be retried upon restart.
vug_logger.warn("Moving {} from staging failed: {}:{}. Ignoring...", s->ks_name(), s->cf_name(), std::current_exception());
}
_registration_sem.signal();
_registration_sem.signal(num_sstables);
}
// For each table, move the processed staging sstables into the table's base dir.
for (auto it = _sstables_to_move.begin(); it != _sstables_to_move.end(); ) {

View File

@@ -32,7 +32,10 @@
namespace db::view {
class view_update_generator {
public:
static constexpr size_t registration_queue_size = 5;
private:
database& _db;
seastar::abort_source _as;
future<> _started = make_ready_future<>();
@@ -51,6 +54,8 @@ public:
future<> start();
future<> stop();
future<> register_staging_sstable(sstables::shared_sstable sst, lw_shared_ptr<table> table);
ssize_t available_register_units() const { return _registration_sem.available_units(); }
private:
bool should_throttle() const;
};

View File

@@ -43,16 +43,29 @@
#include "log.hh"
#include "db/config.hh"
#include "database.hh"
#include "streaming/stream_reason.hh"
static logging::logger blogger("boot_strapper");
namespace dht {
future<> boot_strapper::bootstrap() {
future<> boot_strapper::bootstrap(streaming::stream_reason reason) {
blogger.debug("Beginning bootstrap process: sorted_tokens={}", _token_metadata.sorted_tokens());
auto streamer = make_lw_shared<range_streamer>(_db, _token_metadata, _abort_source, _tokens, _address, "Bootstrap", streaming::stream_reason::bootstrap);
streamer->add_source_filter(std::make_unique<range_streamer::failure_detector_source_filter>(gms::get_local_gossiper().get_unreachable_members()));
sstring description;
if (reason == streaming::stream_reason::bootstrap) {
description = "Bootstrap";
} else if (reason == streaming::stream_reason::replace) {
description = "Replace";
} else {
return make_exception_future<>(std::runtime_error("Wrong stream_reason provided: it can only be replace or bootstrap"));
}
auto streamer = make_lw_shared<range_streamer>(_db, _token_metadata, _abort_source, _tokens, _address, description, reason);
auto nodes_to_filter = gms::get_local_gossiper().get_unreachable_members();
if (reason == streaming::stream_reason::replace && _db.local().get_replace_address()) {
nodes_to_filter.insert(_db.local().get_replace_address().value());
}
blogger.debug("nodes_to_filter={}", nodes_to_filter);
streamer->add_source_filter(std::make_unique<range_streamer::failure_detector_source_filter>(nodes_to_filter));
auto keyspaces = make_lw_shared<std::vector<sstring>>(_db.local().get_non_system_keyspaces());
return do_for_each(*keyspaces, [this, keyspaces, streamer] (sstring& keyspace_name) {
auto& ks = _db.local().find_keyspace(keyspace_name);

View File

@@ -41,6 +41,7 @@
#include "dht/i_partitioner.hh"
#include <unordered_set>
#include "database_fwd.hh"
#include "streaming/stream_reason.hh"
#include <seastar/core/distributed.hh>
#include <seastar/core/abort_source.hh>
@@ -66,7 +67,7 @@ public:
, _token_metadata(tmd) {
}
future<> bootstrap();
future<> bootstrap(streaming::stream_reason reason);
/**
* if initialtoken was specified, use that (split on comma).

View File

@@ -91,7 +91,16 @@ range_streamer::get_range_fetch_map(const std::unordered_map<dht::token_range, s
}
if (!found_source) {
throw std::runtime_error(format("unable to find sufficient sources for streaming range {} in keyspace {}", range_, keyspace));
auto& ks = _db.local().find_keyspace(keyspace);
auto rf = ks.get_replication_strategy().get_replication_factor();
// When a replacing node replaces a dead node with keyspace of RF
// 1, it is expected that replacing node could not find a peer node
// that contains data to stream from.
if (_reason == streaming::stream_reason::replace && rf == 1) {
logger.warn("Unable to find sufficient sources to stream range {} for keyspace {} with RF = 1 for replace operation", range_, keyspace);
} else {
throw std::runtime_error(format("unable to find sufficient sources for streaming range {} in keyspace {}", range_, keyspace));
}
}
}

View File

@@ -146,7 +146,7 @@ private:
* here, we always exclude ourselves.
* @return
*/
static std::unordered_map<inet_address, dht::token_range_vector>
std::unordered_map<inet_address, dht::token_range_vector>
get_range_fetch_map(const std::unordered_map<dht::token_range, std::vector<inet_address>>& ranges_with_sources,
const std::unordered_set<std::unique_ptr<i_source_filter>>& source_filters,
const sstring& keyspace);

View File

@@ -28,7 +28,8 @@ namespace query {
enum class digest_algorithm : uint8_t {
none = 0, // digest not required
MD5 = 1,
xxHash = 2,// default algorithm
legacy_xxHash_without_null_digest = 2,
xxHash = 3, // default algorithm
};
}

View File

@@ -36,7 +36,7 @@ struct noop_hasher {
};
class digester final {
std::variant<noop_hasher, md5_hasher, xx_hasher> _impl;
std::variant<noop_hasher, md5_hasher, xx_hasher, legacy_xx_hasher_without_null_digest> _impl;
public:
explicit digester(digest_algorithm algo) {
@@ -47,6 +47,9 @@ public:
case digest_algorithm::xxHash:
_impl = xx_hasher();
break;
case digest_algorithm::legacy_xxHash_without_null_digest:
_impl = legacy_xx_hasher_without_null_digest();
break;
case digest_algorithm ::none:
_impl = noop_hasher();
break;

View File

@@ -42,6 +42,11 @@ if __name__ == '__main__':
if node_exporter_p.exists() or (bindir_p() / 'prometheus-node_exporter').exists():
if force:
print('node_exporter already installed, reinstalling')
try:
node_exporter = systemd_unit('node-exporter.service')
node_exporter.stop()
except:
pass
else:
print('node_exporter already installed, you can use `--force` to force reinstallation')
sys.exit(1)

View File

@@ -61,7 +61,15 @@ def sh_command(*args):
return out
def get_url(path):
return urllib.request.urlopen(path).read().decode('utf-8')
# If server returns any error, like 403, or 500 urllib.request throws exception, which is not serializable.
# When multiprocessing routines fail to serialize it, it throws ambiguous serialization exception
# from get_json_from_url.
# In order to see legit error we catch it from the inside of process, covert to string and
# pass it as part of return value
try:
return 0, urllib.request.urlopen(path).read().decode('utf-8')
except Exception as exc:
return 1, str(exc)
def get_json_from_url(path):
pool = mp.Pool(processes=1)
@@ -71,13 +79,16 @@ def get_json_from_url(path):
# to enforce a wallclock timeout.
result = pool.apply_async(get_url, args=(path,))
try:
retval = result.get(timeout=5)
status, retval = result.get(timeout=5)
except mp.TimeoutError as err:
pool.terminate()
pool.join()
raise
if status == 1:
raise RuntimeError(f'Failed to get "{path}" due to the following error: {retval}')
return json.loads(retval)
def get_api(path):
return get_json_from_url("http://" + api_address + path)

View File

@@ -65,8 +65,8 @@ Before=scylla-server.service
After=local-fs.target
[Mount]
What=/var/lib/systemd/coredump
Where=/var/lib/scylla/coredump
What=/var/lib/scylla/coredump
Where=/var/lib/systemd/coredump
Type=none
Options=bind
@@ -78,6 +78,7 @@ WantedBy=multi-user.target
makedirs('/var/lib/scylla/coredump')
systemd_unit.reload()
systemd_unit('var-lib-systemd-coredump.mount').enable()
systemd_unit('var-lib-systemd-coredump.mount').start()
if os.path.exists('/usr/lib/sysctl.d/50-coredump.conf'):
run('sysctl -p /usr/lib/sysctl.d/50-coredump.conf')
else:
@@ -99,6 +100,14 @@ WantedBy=multi-user.target
try:
run('coredumpctl --no-pager --no-legend info {}'.format(pid))
print('\nsystemd-coredump is working finely.')
# get last coredump generated by bash and remove it, ignore inaccessaible ones
corefile = out(cmd=r'coredumpctl -1 --no-legend dump 2>&1 | grep "bash" | '
r'grep -v "inaccessible" | grep "Storage:\|Coredump:"',
shell=True, exception=False)
if corefile:
corefile = corefile.split()[-1]
run('rm -f {}'.format(corefile))
except subprocess.CalledProcessError as e:
print('Does not able to detect coredump, failed to configure systemd-coredump.')
sys.exit(1)

View File

@@ -374,6 +374,9 @@ if __name__ == '__main__':
if not stat.S_ISBLK(os.stat(dsk).st_mode):
print('{} is not block device'.format(dsk))
continue
if dsk in selected:
print(f'{dsk} is already added')
continue
selected.append(dsk)
devices.remove(dsk)
disks = ','.join(selected)

View File

@@ -96,7 +96,7 @@ def curl(url, byte=False):
return res.read()
else:
return res.read().decode('utf-8')
except urllib.error.HTTPError:
except urllib.error.URLError:
logging.warn("Failed to grab %s..." % url)
time.sleep(5)
retries += 1
@@ -182,7 +182,7 @@ class aws_instance:
instance_size = self.instance_size()
if instance_class in ['c3', 'c4', 'd2', 'i2', 'r3']:
return 'ixgbevf'
if instance_class in ['c5', 'c5d', 'f1', 'g3', 'h1', 'i3', 'i3en', 'm5', 'm5d', 'p2', 'p3', 'r4', 'x1']:
if instance_class in ['a1', 'c5', 'c5a', 'c5d', 'c5n', 'c6g', 'c6gd', 'f1', 'g3', 'g4', 'h1', 'i3', 'i3en', 'inf1', 'm5', 'm5a', 'm5ad', 'm5d', 'm5dn', 'm5n', 'm6g', 'm6gd', 'p2', 'p3', 'r4', 'r5', 'r5a', 'r5ad', 'r5d', 'r5dn', 'r5n', 't3', 't3a', 'u-6tb1', 'u-9tb1', 'u-12tb1', 'u-18tn1', 'u-24tb1', 'x1', 'x1e', 'z1d']:
return 'ena'
if instance_class == 'm4':
if instance_size == '16xlarge':
@@ -219,7 +219,7 @@ class aws_instance:
def ebs_disks(self):
"""Returns all EBS disks"""
return set(self._disks["ephemeral"])
return set(self._disks["ebs"])
def public_ipv4(self):
"""Returns the public IPv4 address of this instance"""
@@ -327,9 +327,7 @@ class scylla_cpuinfo:
return len(self._cpu_data["system"])
# When a CLI tool is not installed, use relocatable CLI tool provided by Scylla
scylla_env = os.environ.copy()
scylla_env['PATH'] = '{}:{}'.format(scylla_env['PATH'], scyllabindir())
def run(cmd, shell=False, silent=False, exception=True):
stdout = subprocess.DEVNULL if silent else None
@@ -441,6 +439,19 @@ def dist_ver():
return platform.dist()[1]
SYSTEM_PARTITION_UUIDS = [
'21686148-6449-6e6f-744e-656564454649', # BIOS boot partition
'c12a7328-f81f-11d2-ba4b-00a0c93ec93b', # EFI system partition
'024dee41-33e7-11d3-9d69-0008c781f39f' # MBR partition scheme
]
def get_partition_uuid(dev):
return out(f'lsblk -n -oPARTTYPE {dev}')
def is_system_partition(dev):
uuid = get_partition_uuid(dev)
return (uuid in SYSTEM_PARTITION_UUIDS)
def is_unused_disk(dev):
# dev is not in /sys/class/block/, like /dev/nvme[0-9]+
if not os.path.isdir('/sys/class/block/{dev}'.format(dev=dev.replace('/dev/', ''))):
@@ -448,7 +459,8 @@ def is_unused_disk(dev):
try:
fd = os.open(dev, os.O_EXCL)
os.close(fd)
return True
# dev is not reserved for system
return not is_system_partition(dev)
except OSError:
return False

View File

@@ -0,0 +1,4 @@
# allocate enough inotify instances for large machines
# each tls instance needs 1 inotify instance, and there can be
# multiple tls instances per shard.
fs.inotify.max_user_instances = 1200

View File

@@ -5,6 +5,7 @@ Section: database
Priority: optional
X-Python3-Version: >= 3.4
Standards-Version: 3.9.5
Rules-Requires-Root: no
Package: {{product}}-conf
Architecture: any

View File

@@ -11,6 +11,7 @@ else
sysctl -p/usr/lib/sysctl.d/99-scylla-sched.conf || :
sysctl -p/usr/lib/sysctl.d/99-scylla-aio.conf || :
sysctl -p/usr/lib/sysctl.d/99-scylla-vm.conf || :
sysctl -p/usr/lib/sysctl.d/99-scylla-inotify.conf || :
fi
#DEBHELPER#

View File

@@ -5,6 +5,7 @@ Section: python
Priority: optional
X-Python3-Version: >= 3.4
Standards-Version: 3.9.5
Rules-Requires-Root: no
Package: {{product}}-python3
Architecture: amd64

View File

@@ -37,6 +37,7 @@ override_dh_strip:
# The binaries (ethtool...patchelf) don't pass dh_strip after going through patchelf. Since they are
# already stripped, nothing is lost if we exclude them, so that's what we do.
dh_strip -Xlibprotobuf.so.15 -Xld.so -Xethtool -Xgawk -Xgzip -Xhwloc-calc -Xhwloc-distrib -Xifconfig -Xlscpu -Xnetstat -Xpatchelf --dbg-package={{product}}-server-dbg
find $(CURDIR)/debian/{{product}}-server-dbg/usr/lib/debug/.build-id/ -name "*.debug" -exec objcopy --decompress-debug-sections {} \;
override_dh_makeshlibs:

View File

@@ -18,6 +18,8 @@ def parse():
parser.add_argument('--api-address', default=None, dest='apiAddress')
parser.add_argument('--alternator-address', default=None, dest='alternatorAddress', help="Alternator API address to listen to. Defaults to listen address.")
parser.add_argument('--alternator-port', default=None, dest='alternatorPort', help="Alternator API port to listen to. Disabled by default.")
parser.add_argument('--alternator-https-port', default=None, dest='alternatorHttpsPort', help="Alternator API TLS port to listen to. Disabled by default.")
parser.add_argument('--alternator-write-isolation', default=None, dest='alternatorWriteIsolation', help="Alternator default write isolation policy.")
parser.add_argument('--disable-version-check', default=False, action='store_true', dest='disable_housekeeping', help="Disable version check")
parser.add_argument('--authenticator', default=None, dest='authenticator', help="Set authenticator class")
parser.add_argument('--authorizer', default=None, dest='authorizer', help="Set authorizer class")

View File

@@ -16,6 +16,8 @@ class ScyllaSetup:
self._broadcastRpcAddress = arguments.broadcastRpcAddress
self._apiAddress = arguments.apiAddress
self._alternatorPort = arguments.alternatorPort
self._alternatorHttpsPort = arguments.alternatorHttpsPort
self._alternatorWriteIsolation = arguments.alternatorWriteIsolation
self._smp = arguments.smp
self._memory = arguments.memory
self._overprovisioned = arguments.overprovisioned
@@ -116,6 +118,13 @@ class ScyllaSetup:
args += ["--alternator-address %s" % self._alternatorAddress]
args += ["--alternator-port %s" % self._alternatorPort]
if self._alternatorHttpsPort is not None:
args += ["--alternator-address %s" % self._alternatorAddress]
args += ["--alternator-https-port %s" % self._alternatorHttpsPort]
if self._alternatorWriteIsolation is not None:
args += ["--alternator-write-isolation %s" % self._alternatorWriteIsolation]
if self._authenticator is not None:
args += ["--authenticator %s" % self._authenticator]

View File

@@ -130,10 +130,9 @@ rm -rf $RPM_BUILD_ROOT
%attr(0755,scylla,scylla) %dir %{_sharedstatedir}/scylla-housekeeping
%ghost /etc/systemd/system/scylla-helper.slice.d/
%ghost /etc/systemd/system/scylla-helper.slice.d/memory.conf
%ghost /etc/systemd/system/scylla-server.service.d/
%ghost /etc/systemd/system/scylla-server.service.d/capabilities.conf
%ghost /etc/systemd/system/scylla-server.service.d/mounts.conf
%ghost /etc/systemd/system/scylla-server.service.d/dependencies.conf
/etc/systemd/system/scylla-server.service.d/dependencies.conf
%ghost /etc/systemd/system/var-lib-systemd-coredump.mount
%package conf
@@ -189,6 +188,8 @@ Summary: Scylla configuration package for the Linux kernel
License: AGPLv3
URL: http://www.scylladb.com/
Requires: kmod
# tuned overwrites our sysctl settings
Obsoletes: tuned
%description kernel-conf
This package contains Linux kernel configuration changes for the Scylla database. Install this package
@@ -200,6 +201,7 @@ if Scylla is the main application on your server and you wish to optimize its la
/usr/lib/systemd/systemd-sysctl 99-scylla-sched.conf >/dev/null 2>&1 || :
/usr/lib/systemd/systemd-sysctl 99-scylla-aio.conf >/dev/null 2>&1 || :
/usr/lib/systemd/systemd-sysctl 99-scylla-vm.conf >/dev/null 2>&1 || :
/usr/lib/systemd/systemd-sysctl 99-scylla-inotify.conf >/dev/null 2>&1 || :
%files kernel-conf
%defattr(-,root,root)

View File

@@ -25,6 +25,14 @@ By default, Scylla listens on this port on all network interfaces.
To listen only on a specific interface, pass also an "`alternator-address`"
option.
As we explain below in the "Write isolation policies", Alternator has
four different choices for the implementation of writes, each with
different advantages. You should consider which of the options makes
more sense for your intended use case, and use the "`--alternator-write-isolation`"
option to choose one. There is currently no default for this option: Trying
to run Scylla with Alternator enabled without passing this option will
result in an error asking you to set it.
DynamoDB clients usually specify a single "endpoint" address, e.g.,
`dynamodb.us-east-1.amazonaws.com`, and a DNS server hosted on that address
distributes the connections to many different backend nodes. Alternator
@@ -108,12 +116,15 @@ implemented, with the following limitations:
Writes are done in LOCAL_QURUM and reads in LOCAL_ONE (eventual consistency)
or LOCAL_QUORUM (strong consistency).
### Global Tables
* Currently, *all* Alternator tables are created as "Global Tables", i.e., can
be accessed from all of Scylla's DCs.
* We do not yet support the DynamoDB API calls to make some of the tables
global and others local to a particular DC: CreateGlobalTable,
UpdateGlobalTable, DescribeGlobalTable, ListGlobalTables,
UpdateGlobalTableSettings, DescribeGlobalTableSettings, and UpdateTable.
* Currently, *all* Alternator tables are created as "global" tables and can
be accessed from all the DCs existing at the time of the table's creation.
If a DC is added after a table is created, the table won't be visible from
the new DC and changing that requires a CQL "ALTER TABLE" statement to
modify the table's replication strategy.
* We do not yet support the DynamoDB API calls that control which table is
visible from what DC: CreateGlobalTable, UpdateGlobalTable,
DescribeGlobalTable, ListGlobalTables, UpdateGlobalTableSettings,
DescribeGlobalTableSettings, and UpdateTable.
### Backup and Restore
* On-demand backup: the DynamoDB APIs are not yet supported: CreateBackup,
DescribeBackup, DeleteBackup, ListBackups, RestoreTableFromBackup.
@@ -153,23 +164,28 @@ implemented, with the following limitations:
### Write isolation policies
DynamoDB API update requests may involve a read before the write - e.g., a
_conditional_ update, or an update based on the old value of an attribute.
_conditional_ update or an update based on the old value of an attribute.
The read and the write should be treated as a single transaction - protected
(_isolated_) from other parallel writes to the same item.
By default, Alternator does this isolation by using Scylla's LWT (lightweight
transactions) for every write operation. However, LWT significantly slows
writes down, so Alternator supports three additional _write isolation
policies_, which can be chosen on a per-table basis and may make sense for
certain workloads as explained below.
Alternator could do this isolation by using Scylla's LWT (lightweight
transactions) for every write operation, but this significantly slows
down writes, and not necessary for workloads which don't use read-modify-write
(RMW) updates.
The write isolation policy of a table is configured by tagging the table (at
CreateTable time, or any time later with TagResource) with the key
So Alternator supports four _write isolation policies_, which can be chosen
on a per-table basis and may make sense for certain workloads as explained
below.
A default write isolation policy **must** be chosen using the
`--alternator-write-isolation` configuration option. Additionally, the write
isolation policy for a specific table can be overriden by tagging the table
(at CreateTable time, or any time later with TagResource) with the key
`system:write_isolation`, and one of the following values:
* `a`, `always`, or `always_use_lwt` - This is the default choice.
It performs every write operation - even those that do not need a read
before the write - as a lightweight transaction.
* `a`, `always`, or `always_use_lwt` - This mode performs every write
operation - even those that do not need a read before the write - as a
lightweight transaction.
This is the slowest choice, but also the only choice guaranteed to work
correctly for every workload.

View File

@@ -10,10 +10,16 @@ This section will guide you through the steps for setting up the cluster:
nightly image by running: `docker pull scylladb/scylla-nightly:latest`
2. Follow the steps in the [Scylla official download web page](https://www.scylladb.com/download/open-source/#docker)
add to every "docker run" command: `-p 8000:8000` before the image name
and `--alternator-port=8000` at the end. The "alternator-port" option
specifies on which port Scylla will listen for the (unencrypted) DynamoDB API.
and `--alternator-port=8000 --alternator-write-isolation=always` at the end.
The "alternator-port" option specifies on which port Scylla will listen for
the (unencrypted) DynamoDB API, and the "alternator-write-isolation" chooses
whether or not Alternator will use LWT for every write.
For example,
`docker run --name scylla -d -p 8000:8000 scylladb/scylla-nightly:latest --alternator-port=8000
`docker run --name scylla -d -p 8000:8000 scylladb/scylla-nightly:latest --alternator-port=8000 --alternator-write-isolation=always
The `--alternator-https-port=...` option can also be used to enable
Alternator on an encrypted (HTTPS) port. Note that in this case, the files
`/etc/scylla/scylla.crt` and `/etc/scylla/scylla.key` must be inserted into
the image, containing the SSL certificate and key to use.
## Testing Scylla's DynamoDB API support:
### Running AWS Tic Tac Toe demo app to test the cluster:

View File

@@ -92,7 +92,7 @@ Shard-colocation is an optimization.
Having different generations operating at different points in time is necessary to maintain colocation in presence of topology changes. When a new node joins the cluster it modifies the token ring by refining existing vnodes into smaller vnodes. But before it does it, it will introduce a new CDC generation whose token ranges refine those new (smaller) vnodes (which means they also refine the old vnodes; that way writes will be colocated on both old and new replicas).
The joining node learns about the current vnodes, chooses tokens which will split them into smaller vnodes and creates a new `cdc::topology_description` which refines those smaller vnodes. This is done in the `cdc::generate_topology_description` function. It then inserts the generation description into an internal distributed table `cdc_topology_description` in the `system_distributed` keyspace. The table is defined as follows (from db/system_distributed_keyspace.cc):
The joining node learns about the current vnodes, chooses tokens which will split them into smaller vnodes and creates a new `cdc::topology_description` which refines those smaller vnodes. This is done in the `cdc::generate_topology_description` function. It then inserts the generation description into an internal distributed table `cdc_generations` in the `system_distributed` keyspace. The table is defined as follows (from db/system_distributed_keyspace.cc):
```
return schema_builder(system_distributed_keyspace::NAME, system_distributed_keyspace::CDC_TOPOLOGY_DESCRIPTION, {id})
/* The timestamp of this CDC generation. */
@@ -131,11 +131,11 @@ Next, the node starts gossiping the timestamp of the new generation together wit
}).get();
```
When other nodes learn about the generation, they'll extract it from the `cdc_topology_description` table and save it using `cdc::metadata::insert(db_clock::time_point, topology_description&&)`.
When other nodes learn about the generation, they'll extract it from the `cdc_generations` table and save it using `cdc::metadata::insert(db_clock::time_point, topology_description&&)`.
Notice that nodes learn about the generation together with the new node's tokens. When they learn about its tokens they'll immediately start sending writes to the new node (in the case of bootstrapping, it will become a pending replica). But the old generation will still be operating for a minute or two. Thus colocation will be lost for a while. This problem will be fixed when the two-phase-commit approach is implemented.
We're not able to prevent a node learning about a new generation too late due to a network partition: if gossip doesn't reach the node in time, some writes might be sent to the wrong (old) generation.
However, it could happen that a node learns about the generation from gossip in time, but then won't be able to extract it from `cdc_topology_description`. In that case we can still maintain consistency: the node will remember that there is a new generation even though it doesn't yet know what it is (just the timestamp) using the `cdc::metadata::prepare(db_clock::time_point)` method, and then _reject_ writes for CDC-enabled tables that are supposed to use this new generation. The node will keep trying to read the generation's data in background until it succeeds or sees that it's not necessary anymore (e.g. because the generation was already superseded by a new generation).
However, it could happen that a node learns about the generation from gossip in time, but then won't be able to extract it from `cdc_generations`. In that case we can still maintain consistency: the node will remember that there is a new generation even though it doesn't yet know what it is (just the timestamp) using the `cdc::metadata::prepare(db_clock::time_point)` method, and then _reject_ writes for CDC-enabled tables that are supposed to use this new generation. The node will keep trying to read the generation's data in background until it succeeds or sees that it's not necessary anymore (e.g. because the generation was already superseded by a new generation).
Thus we give up availability for safety. This likely won't happen if the administrator ensures that the cluster is not partitioned before bootstrapping a new node. This problem will also be mitigated with a future patch.
Due to the need of maintaining colocation we don't allow the client to send writes with arbitrary timestamps.
@@ -144,7 +144,7 @@ Reason: we cannot allow writes before `T`, because they belong to the old genera
### Streams description table
The `cdc_description` table in the `system_distributed` keyspace allows CDC clients to learn about available sets of streams and the time intervals they are operating at. It's definition is as follows (db/system_distributed_keyspace.cc):
The `cdc_streams` table in the `system_distributed` keyspace allows CDC clients to learn about available sets of streams and the time intervals they are operating at. It's definition is as follows (db/system_distributed_keyspace.cc):
```
return schema_builder(system_distributed_keyspace::NAME, system_distributed_keyspace::CDC_DESC, {id})
/* The timestamp of this CDC generation. */
@@ -161,9 +161,9 @@ where
thread_local data_type cdc_stream_tuple_type = tuple_type_impl::get_instance({long_type, long_type});
thread_local data_type cdc_streams_set_type = set_type_impl::get_instance(cdc_stream_tuple_type, false);
```
This table simply contains each generation's timestamp (as partition key) and the set of stream IDs used by this generation. It is meant to be user-facing, in contrast to `cdc_topology_description` which is used internally.
This table simply contains each generation's timestamp (as partition key) and the set of stream IDs used by this generation. It is meant to be user-facing, in contrast to `cdc_generations` which is used internally.
When nodes learn about a CDC generation through gossip, they race to update the description table by inserting a proper row (see `cdc::update_streams_description`). This operation is idempotent so it doesn't matter if multiple nodes do it at the same time.
#### TODO: expired generations
The `expired` column in `cdc_description` and `cdc_topology_description` means that this generation was superseded by some new generation and will soon be removed (its table entry will be gone). This functionality is yet to be implemented.
The `expired` column in `cdc_streams` and `cdc_generations` means that this generation was superseded by some new generation and will soon be removed (its table entry will be gone). This functionality is yet to be implemented.

View File

@@ -163,6 +163,20 @@ $ docker run --name some-scylla -d scylladb/scylla --alternator-port 8000
**Since: 3.2**
### `--alternator-https-port PORT`
The `--alternator-https-port` option is similar to `--alternator-port`, just enables an encrypted (HTTPS) port. Either the `--alternator-https-port` or `--alternator-http-port`, or both, can be used to enable Alternator.
Note that the `--alternator-https-port` option also requires that files `/etc/scylla/scylla.crt` and `/etc/scylla/scylla.key` be inserted into the image. These files contain an SSL certificate and key, respectively.
**Since: 4.2**
### `--alternator-write-isolation policy`
The `--alternator-write-isolation` command line option chooses between four allowed write isolation policies described in docs/alternator/alternator.md. This option must be specified if Alternator is enabled - it does not have a default.
**Since: 4.1**
### `--broadcast-address ADDR`
The `--broadcast-address` command line option configures the IP address the Scylla instance tells other Scylla nodes in the cluster to connect to.

View File

@@ -488,6 +488,9 @@ public:
size_t buffer_size() const {
return _impl->buffer_size();
}
const circular_buffer<mutation_fragment>& buffer() const {
return _impl->buffer();
}
// Detach the internal buffer of the reader.
// Roughly equivalent to depleting it by calling pop_mutation_fragment()
// until is_buffer_empty() returns true.

View File

@@ -140,6 +140,7 @@ extern const std::string_view NONFROZEN_UDTS;
extern const std::string_view HINTED_HANDOFF_SEPARATE_CONNECTION;
extern const std::string_view LWT;
extern const std::string_view PER_TABLE_PARTITIONERS;
extern const std::string_view DIGEST_FOR_NULL_VALUES;
}

View File

@@ -55,6 +55,7 @@ constexpr std::string_view features::NONFROZEN_UDTS = "NONFROZEN_UDTS";
constexpr std::string_view features::HINTED_HANDOFF_SEPARATE_CONNECTION = "HINTED_HANDOFF_SEPARATE_CONNECTION";
constexpr std::string_view features::LWT = "LWT";
constexpr std::string_view features::PER_TABLE_PARTITIONERS = "PER_TABLE_PARTITIONERS";
constexpr std::string_view features::DIGEST_FOR_NULL_VALUES = "DIGEST_FOR_NULL_VALUES";
static logging::logger logger("features");
@@ -88,8 +89,9 @@ feature_service::feature_service(feature_config cfg) : _config(cfg)
, _nonfrozen_udts(*this, features::NONFROZEN_UDTS)
, _hinted_handoff_separate_connection(*this, features::HINTED_HANDOFF_SEPARATE_CONNECTION)
, _lwt_feature(*this, features::LWT)
, _per_table_partitioners_feature(*this, features::PER_TABLE_PARTITIONERS) {
}
, _per_table_partitioners_feature(*this, features::PER_TABLE_PARTITIONERS)
, _digest_for_null_values_feature(*this, features::DIGEST_FOR_NULL_VALUES)
{}
feature_config feature_config_from_db_config(db::config& cfg) {
feature_config fcfg;
@@ -163,6 +165,7 @@ std::set<std::string_view> feature_service::known_feature_set() {
gms::features::UNBOUNDED_RANGE_TOMBSTONES,
gms::features::HINTED_HANDOFF_SEPARATE_CONNECTION,
gms::features::PER_TABLE_PARTITIONERS,
gms::features::DIGEST_FOR_NULL_VALUES,
};
if (_config.enable_sstables_mc_format) {
@@ -254,6 +257,7 @@ void feature_service::enable(const std::set<std::string_view>& list) {
std::ref(_hinted_handoff_separate_connection),
std::ref(_lwt_feature),
std::ref(_per_table_partitioners_feature),
std::ref(_digest_for_null_values_feature),
})
{
if (list.count(f.name())) {

View File

@@ -96,6 +96,7 @@ private:
gms::feature _hinted_handoff_separate_connection;
gms::feature _lwt_feature;
gms::feature _per_table_partitioners_feature;
gms::feature _digest_for_null_values_feature;
public:
bool cluster_supports_range_tombstones() const {
@@ -166,6 +167,10 @@ public:
return _per_table_partitioners_feature;
}
const feature& cluster_supports_digest_for_null_values() const {
return _digest_for_null_values_feature;
}
bool cluster_supports_row_level_repair() const {
return bool(_row_level_repair_feature);
}

View File

@@ -428,6 +428,7 @@ future<> gossiper::handle_shutdown_msg(inet_address from) {
return make_ready_future<>();
}
return seastar::async([this, from] {
auto permit = this->lock_endpoint(from).get0();
this->mark_as_shutdown(from);
});
}

View File

@@ -175,6 +175,7 @@ public:
versioned_value::STATUS_LEFT,
versioned_value::HIBERNATE,
versioned_value::STATUS_BOOTSTRAPPING,
versioned_value::STATUS_UNKNOWN,
};
static constexpr std::chrono::milliseconds INTERVAL{1000};
static constexpr std::chrono::hours A_VERY_LONG_TIME{24 * 3};

View File

@@ -49,6 +49,7 @@ enum class stream_reason : uint8_t {
removenode,
rebuild,
repair,
replace,
};
enum class stream_mutation_fragments_cmd : uint8_t {

View File

@@ -132,6 +132,7 @@ relocate_python3() {
cp "$script" "$relocateddir"
cat > "$install"<<EOF
#!/usr/bin/env bash
export LC_ALL=en_US.UTF-8
x="\$(readlink -f "\$0")"
b="\$(basename "\$x")"
d="\$(dirname "\$x")"
@@ -143,7 +144,7 @@ DEBIAN_SSL_CERT_FILE="/etc/ssl/certs/ca-certificates.crt"
if [ -f "\${DEBIAN_SSL_CERT_FILE}" ]; then
c=\${DEBIAN_SSL_CERT_FILE}
fi
PYTHONPATH="\${d}:\${d}/libexec:\$PYTHONPATH" PATH="\${d}/$pythonpath:\${PATH}" SSL_CERT_FILE="\${c}" exec -a "\$0" "\${d}/libexec/\${b}" "\$@"
PYTHONPATH="\${d}:\${d}/libexec:\$PYTHONPATH" PATH="\${d}/../bin:\${d}/$pythonpath:\${PATH}" SSL_CERT_FILE="\${c}" exec -a "\$0" "\${d}/libexec/\${b}" "\$@"
EOF
chmod +x "$install"
}
@@ -378,5 +379,9 @@ elif ! $packaging; then
chown -R scylla:scylla $rdata
chown -R scylla:scylla $rhkdata
for file in dist/common/sysctl.d/*.conf; do
bn=$(basename "$file")
sysctl -p "$rusr"/lib/sysctl.d/"$bn"
done
$rprefix/scripts/scylla_post_install.sh
fi

203
licenses/abseil-license.txt Normal file
View File

@@ -0,0 +1,203 @@
Apache License
Version 2.0, January 2004
https://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
https://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@@ -168,15 +168,33 @@ insert_token_range_to_sorted_container_while_unwrapping(
dht::token_range_vector
abstract_replication_strategy::get_ranges(inet_address ep) const {
return get_ranges(ep, _token_metadata);
return do_get_ranges(ep, _token_metadata, false);
}
dht::token_range_vector
abstract_replication_strategy::get_ranges_in_thread(inet_address ep) const {
return do_get_ranges(ep, _token_metadata, true);
}
dht::token_range_vector
abstract_replication_strategy::get_ranges(inet_address ep, token_metadata& tm) const {
return do_get_ranges(ep, tm, false);
}
dht::token_range_vector
abstract_replication_strategy::get_ranges_in_thread(inet_address ep, token_metadata& tm) const {
return do_get_ranges(ep, tm, true);
}
dht::token_range_vector
abstract_replication_strategy::do_get_ranges(inet_address ep, token_metadata& tm, bool can_yield) const {
dht::token_range_vector ret;
auto prev_tok = tm.sorted_tokens().back();
for (auto tok : tm.sorted_tokens()) {
for (inet_address a : calculate_natural_endpoints(tok, tm)) {
if (can_yield) {
seastar::thread::maybe_yield();
}
if (a == ep) {
insert_token_range_to_sorted_container_while_unwrapping(prev_tok, tok, ret);
break;

View File

@@ -113,10 +113,15 @@ public:
// It the analogue of Origin's getAddressRanges().get(endpoint).
// This function is not efficient, and not meant for the fast path.
dht::token_range_vector get_ranges(inet_address ep) const;
dht::token_range_vector get_ranges_in_thread(inet_address ep) const;
// Use the token_metadata provided by the caller instead of _token_metadata
dht::token_range_vector get_ranges(inet_address ep, token_metadata& tm) const;
dht::token_range_vector get_ranges_in_thread(inet_address ep, token_metadata& tm) const;
private:
dht::token_range_vector do_get_ranges(inet_address ep, token_metadata& tm, bool can_yield) const;
public:
// get_primary_ranges() returns the list of "primary ranges" for the given
// endpoint. "Primary ranges" are the ranges that the node is responsible
// for storing replica primarily, which means this is the first node

View File

@@ -1975,9 +1975,7 @@ void topology::add_endpoint(const inet_address& ep)
if (current->second.dc == dc && current->second.rack == rack) {
return;
}
_dc_racks[current->second.dc][current->second.rack].erase(ep);
_dc_endpoints[current->second.dc].erase(ep);
remove_endpoint(ep);
}
_dc_endpoints[dc].insert(ep);
@@ -2002,7 +2000,14 @@ void topology::remove_endpoint(inet_address ep)
}
_dc_endpoints[cur_dc_rack->second.dc].erase(ep);
_dc_racks[cur_dc_rack->second.dc][cur_dc_rack->second.rack].erase(ep);
auto& racks = _dc_racks[cur_dc_rack->second.dc];
auto& eps = racks[cur_dc_rack->second.rack];
eps.erase(ep);
if (eps.empty()) {
racks.erase(cur_dc_rack->second.rack);
}
_current_locations.erase(cur_dc_rack);
}

10
lua.cc
View File

@@ -264,14 +264,12 @@ static auto visit_lua_raw_value(lua_State* l, int index, Func&& f) {
template <typename Func>
static auto visit_decimal(const big_decimal &v, Func&& f) {
boost::multiprecision::cpp_int ten(10);
const auto& dividend = v.unscaled_value();
auto divisor = boost::multiprecision::pow(ten, v.scale());
boost::multiprecision::cpp_rational r = v.as_rational();
const boost::multiprecision::cpp_int& dividend = numerator(r);
const boost::multiprecision::cpp_int& divisor = denominator(r);
if (dividend % divisor == 0) {
return f(utils::multiprecision_int(boost::multiprecision::cpp_int(dividend/divisor)));
return f(utils::multiprecision_int(dividend/divisor));
}
boost::multiprecision::cpp_rational r = dividend;
r /= divisor;
return f(r.convert_to<double>());
}

24
main.cc
View File

@@ -78,6 +78,7 @@
#include "cdc/log.hh"
#include "cdc/cdc_extension.hh"
#include "alternator/tags_extension.hh"
#include "alternator/rmw_operation.hh"
namespace fs = std::filesystem;
@@ -736,7 +737,7 @@ int main(int ac, char** av) {
dbcfg.memtable_scheduling_group = make_sched_group("memtable", 1000);
dbcfg.memtable_to_cache_scheduling_group = make_sched_group("memtable_to_cache", 200);
dbcfg.available_memory = memory::stats().total_memory();
db.start(std::ref(*cfg), dbcfg, std::ref(mm_notifier), std::ref(feature_service), std::ref(token_metadata), std::ref(stop_signal.as_sharded_abort_source())).get();
db.start(std::ref(*cfg), dbcfg, std::ref(mm_notifier), std::ref(feature_service), std::ref(token_metadata)).get();
start_large_data_handler(db).get();
auto stop_database_and_sstables = defer_verbose_shutdown("database", [&db] {
// #293 - do not stop anything - not even db (for real)
@@ -820,6 +821,7 @@ int main(int ac, char** av) {
storage_proxy_smp_service_group_config.max_nonlocal_requests = 5000;
spcfg.read_smp_service_group = create_smp_service_group(storage_proxy_smp_service_group_config).get0();
spcfg.write_smp_service_group = create_smp_service_group(storage_proxy_smp_service_group_config).get0();
spcfg.hints_write_smp_service_group = create_smp_service_group(storage_proxy_smp_service_group_config).get0();
spcfg.write_ack_smp_service_group = create_smp_service_group(storage_proxy_smp_service_group_config).get0();
static db::view::node_update_backlog node_backlog(smp::count, 10ms);
scheduling_group_key_config storage_proxy_stats_cfg =
@@ -948,12 +950,16 @@ int main(int ac, char** av) {
mm.init_messaging_service();
}).get();
supervisor::notify("initializing storage proxy RPC verbs");
proxy.invoke_on_all([] (service::storage_proxy& p) {
p.init_messaging_service();
}).get();
proxy.invoke_on_all(&service::storage_proxy::init_messaging_service).get();
auto stop_proxy_handlers = defer_verbose_shutdown("storage proxy RPC verbs", [&proxy] {
proxy.invoke_on_all(&service::storage_proxy::uninit_messaging_service).get();
});
supervisor::notify("starting streaming service");
streaming::stream_session::init_streaming_service(db, sys_dist_ks, view_update_generator).get();
auto stop_streaming_service = defer_verbose_shutdown("streaming service", [] {
streaming::stream_session::uninit_streaming_service().get();
});
api::set_server_stream_manager(ctx).get();
supervisor::notify("starting hinted handoff manager");
@@ -986,6 +992,9 @@ int main(int ac, char** av) {
rs.stop().get();
});
repair_init_messaging_service_handler(rs, sys_dist_ks, view_update_generator).get();
auto stop_repair_messages = defer_verbose_shutdown("repair message handlers", [] {
repair_uninit_messaging_service_handler().get();
});
supervisor::notify("starting storage service", true);
auto& ss = service::get_local_storage_service();
ss.init_messaging_service_part().get();
@@ -1081,6 +1090,7 @@ int main(int ac, char** av) {
}
if (cfg->alternator_port() || cfg->alternator_https_port()) {
alternator::rmw_operation::set_default_write_isolation(cfg->alternator_write_isolation());
static sharded<alternator::executor> alternator_executor;
static sharded<alternator::server> alternator_server;
@@ -1186,6 +1196,12 @@ int main(int ac, char** av) {
}
});
auto stop_compaction_manager = defer_verbose_shutdown("compaction manager", [&db] {
db.invoke_on_all([](auto& db) {
return db.get_compaction_manager().stop();
}).get();
});
auto stop_redis_service = defer_verbose_shutdown("redis service", [&cfg] {
if (cfg->redis_port() || cfg->redis_ssl_port()) {
redis.stop().get();

View File

@@ -731,6 +731,10 @@ void messaging_service::register_stream_mutation_fragments(std::function<future<
register_handler(this, messaging_verb::STREAM_MUTATION_FRAGMENTS, std::move(func));
}
future<> messaging_service::unregister_stream_mutation_fragments() {
return unregister_handler(messaging_verb::STREAM_MUTATION_FRAGMENTS);
}
template<class SinkType, class SourceType>
future<rpc::sink<SinkType>, rpc::source<SourceType>>
do_make_sink_source(messaging_verb verb, uint32_t repair_meta_id, shared_ptr<messaging_service::rpc_protocol_client_wrapper> rpc_client, std::unique_ptr<messaging_service::rpc_protocol_wrapper>& rpc) {
@@ -762,6 +766,9 @@ rpc::sink<repair_row_on_wire_with_cmd> messaging_service::make_sink_for_repair_g
void messaging_service::register_repair_get_row_diff_with_rpc_stream(std::function<future<rpc::sink<repair_row_on_wire_with_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_hash_with_cmd> source)>&& func) {
register_handler(this, messaging_verb::REPAIR_GET_ROW_DIFF_WITH_RPC_STREAM, std::move(func));
}
future<> messaging_service::unregister_repair_get_row_diff_with_rpc_stream() {
return unregister_handler(messaging_verb::REPAIR_GET_ROW_DIFF_WITH_RPC_STREAM);
}
// Wrapper for REPAIR_PUT_ROW_DIFF_WITH_RPC_STREAM
future<rpc::sink<repair_row_on_wire_with_cmd>, rpc::source<repair_stream_cmd>>
@@ -781,6 +788,9 @@ rpc::sink<repair_stream_cmd> messaging_service::make_sink_for_repair_put_row_dif
void messaging_service::register_repair_put_row_diff_with_rpc_stream(std::function<future<rpc::sink<repair_stream_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_row_on_wire_with_cmd> source)>&& func) {
register_handler(this, messaging_verb::REPAIR_PUT_ROW_DIFF_WITH_RPC_STREAM, std::move(func));
}
future<> messaging_service::unregister_repair_put_row_diff_with_rpc_stream() {
return unregister_handler(messaging_verb::REPAIR_PUT_ROW_DIFF_WITH_RPC_STREAM);
}
// Wrapper for REPAIR_GET_FULL_ROW_HASHES_WITH_RPC_STREAM
future<rpc::sink<repair_stream_cmd>, rpc::source<repair_hash_with_cmd>>
@@ -800,6 +810,9 @@ rpc::sink<repair_hash_with_cmd> messaging_service::make_sink_for_repair_get_full
void messaging_service::register_repair_get_full_row_hashes_with_rpc_stream(std::function<future<rpc::sink<repair_hash_with_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_stream_cmd> source)>&& func) {
register_handler(this, messaging_verb::REPAIR_GET_FULL_ROW_HASHES_WITH_RPC_STREAM, std::move(func));
}
future<> messaging_service::unregister_repair_get_full_row_hashes_with_rpc_stream() {
return unregister_handler(messaging_verb::REPAIR_GET_FULL_ROW_HASHES_WITH_RPC_STREAM);
}
// Send a message for verb
template <typename MsgIn, typename... MsgOut>
@@ -883,6 +896,9 @@ future<streaming::prepare_message> messaging_service::send_prepare_message(msg_a
return send_message<streaming::prepare_message>(this, messaging_verb::PREPARE_MESSAGE, id,
std::move(msg), plan_id, std::move(description), reason);
}
future<> messaging_service::unregister_prepare_message() {
return unregister_handler(messaging_verb::PREPARE_MESSAGE);
}
// PREPARE_DONE_MESSAGE
void messaging_service::register_prepare_done_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id)>&& func) {
@@ -892,6 +908,9 @@ future<> messaging_service::send_prepare_done_message(msg_addr id, UUID plan_id,
return send_message<void>(this, messaging_verb::PREPARE_DONE_MESSAGE, id,
plan_id, dst_cpu_id);
}
future<> messaging_service::unregister_prepare_done_message() {
return unregister_handler(messaging_verb::PREPARE_DONE_MESSAGE);
}
// STREAM_MUTATION
void messaging_service::register_stream_mutation(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, rpc::optional<bool> fragmented, rpc::optional<streaming::stream_reason> reason)>&& func) {
@@ -916,6 +935,9 @@ future<> messaging_service::send_stream_mutation_done(msg_addr id, UUID plan_id,
return send_message<void>(this, messaging_verb::STREAM_MUTATION_DONE, id,
plan_id, std::move(ranges), cf_id, dst_cpu_id);
}
future<> messaging_service::unregister_stream_mutation_done() {
return unregister_handler(messaging_verb::STREAM_MUTATION_DONE);
}
// COMPLETE_MESSAGE
void messaging_service::register_complete_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id, rpc::optional<bool> failed)>&& func) {
@@ -925,6 +947,9 @@ future<> messaging_service::send_complete_message(msg_addr id, UUID plan_id, uns
return send_message<void>(this, messaging_verb::COMPLETE_MESSAGE, id,
plan_id, dst_cpu_id, failed);
}
future<> messaging_service::unregister_complete_message() {
return unregister_handler(messaging_verb::COMPLETE_MESSAGE);
}
void messaging_service::register_gossip_echo(std::function<future<> ()>&& func) {
register_handler(this, messaging_verb::GOSSIP_ECHO, std::move(func));
@@ -1139,14 +1164,14 @@ future<partition_checksum> messaging_service::send_repair_checksum_range(
}
// Wrapper for REPAIR_GET_FULL_ROW_HASHES
void messaging_service::register_repair_get_full_row_hashes(std::function<future<std::unordered_set<repair_hash>> (const rpc::client_info& cinfo, uint32_t repair_meta_id)>&& func) {
void messaging_service::register_repair_get_full_row_hashes(std::function<future<repair_hash_set> (const rpc::client_info& cinfo, uint32_t repair_meta_id)>&& func) {
register_handler(this, messaging_verb::REPAIR_GET_FULL_ROW_HASHES, std::move(func));
}
future<> messaging_service::unregister_repair_get_full_row_hashes() {
return unregister_handler(messaging_verb::REPAIR_GET_FULL_ROW_HASHES);
}
future<std::unordered_set<repair_hash>> messaging_service::send_repair_get_full_row_hashes(msg_addr id, uint32_t repair_meta_id) {
return send_message<future<std::unordered_set<repair_hash>>>(this, messaging_verb::REPAIR_GET_FULL_ROW_HASHES, std::move(id), repair_meta_id);
future<repair_hash_set> messaging_service::send_repair_get_full_row_hashes(msg_addr id, uint32_t repair_meta_id) {
return send_message<future<repair_hash_set>>(this, messaging_verb::REPAIR_GET_FULL_ROW_HASHES, std::move(id), repair_meta_id);
}
// Wrapper for REPAIR_GET_COMBINED_ROW_HASH
@@ -1171,13 +1196,13 @@ future<get_sync_boundary_response> messaging_service::send_repair_get_sync_bound
}
// Wrapper for REPAIR_GET_ROW_DIFF
void messaging_service::register_repair_get_row_diff(std::function<future<repair_rows_on_wire> (const rpc::client_info& cinfo, uint32_t repair_meta_id, std::unordered_set<repair_hash> set_diff, bool needs_all_rows)>&& func) {
void messaging_service::register_repair_get_row_diff(std::function<future<repair_rows_on_wire> (const rpc::client_info& cinfo, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows)>&& func) {
register_handler(this, messaging_verb::REPAIR_GET_ROW_DIFF, std::move(func));
}
future<> messaging_service::unregister_repair_get_row_diff() {
return unregister_handler(messaging_verb::REPAIR_GET_ROW_DIFF);
}
future<repair_rows_on_wire> messaging_service::send_repair_get_row_diff(msg_addr id, uint32_t repair_meta_id, std::unordered_set<repair_hash> set_diff, bool needs_all_rows) {
future<repair_rows_on_wire> messaging_service::send_repair_get_row_diff(msg_addr id, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows) {
return send_message<future<repair_rows_on_wire>>(this, messaging_verb::REPAIR_GET_ROW_DIFF, std::move(id), repair_meta_id, std::move(set_diff), needs_all_rows);
}

View File

@@ -276,10 +276,12 @@ public:
streaming::prepare_message msg, UUID plan_id, sstring description, rpc::optional<streaming::stream_reason> reason)>&& func);
future<streaming::prepare_message> send_prepare_message(msg_addr id, streaming::prepare_message msg, UUID plan_id,
sstring description, streaming::stream_reason);
future<> unregister_prepare_message();
// Wrapper for PREPARE_DONE_MESSAGE verb
void register_prepare_done_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id)>&& func);
future<> send_prepare_done_message(msg_addr id, UUID plan_id, unsigned dst_cpu_id);
future<> unregister_prepare_done_message();
// Wrapper for STREAM_MUTATION verb
void register_stream_mutation(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, rpc::optional<bool>, rpc::optional<streaming::stream_reason>)>&& func);
@@ -288,6 +290,7 @@ public:
// Wrapper for STREAM_MUTATION_FRAGMENTS
// The receiver of STREAM_MUTATION_FRAGMENTS sends status code to the sender to notify any error on the receiver side. The status code is of type int32_t. 0 means successful, -1 means error, other status code value are reserved for future use.
void register_stream_mutation_fragments(std::function<future<rpc::sink<int32_t>> (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::optional<streaming::stream_reason> reason_opt, rpc::source<frozen_mutation_fragment, rpc::optional<streaming::stream_mutation_fragments_cmd>> source)>&& func);
future<> unregister_stream_mutation_fragments();
rpc::sink<int32_t> make_sink_for_stream_mutation_fragments(rpc::source<frozen_mutation_fragment, rpc::optional<streaming::stream_mutation_fragments_cmd>>& source);
future<rpc::sink<frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd>, rpc::source<int32_t>> make_sink_and_source_for_stream_mutation_fragments(utils::UUID schema_id, utils::UUID plan_id, utils::UUID cf_id, uint64_t estimated_partitions, streaming::stream_reason reason, msg_addr id);
@@ -295,22 +298,27 @@ public:
future<rpc::sink<repair_hash_with_cmd>, rpc::source<repair_row_on_wire_with_cmd>> make_sink_and_source_for_repair_get_row_diff_with_rpc_stream(uint32_t repair_meta_id, msg_addr id);
rpc::sink<repair_row_on_wire_with_cmd> make_sink_for_repair_get_row_diff_with_rpc_stream(rpc::source<repair_hash_with_cmd>& source);
void register_repair_get_row_diff_with_rpc_stream(std::function<future<rpc::sink<repair_row_on_wire_with_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_hash_with_cmd> source)>&& func);
future<> unregister_repair_get_row_diff_with_rpc_stream();
// Wrapper for REPAIR_PUT_ROW_DIFF_WITH_RPC_STREAM
future<rpc::sink<repair_row_on_wire_with_cmd>, rpc::source<repair_stream_cmd>> make_sink_and_source_for_repair_put_row_diff_with_rpc_stream(uint32_t repair_meta_id, msg_addr id);
rpc::sink<repair_stream_cmd> make_sink_for_repair_put_row_diff_with_rpc_stream(rpc::source<repair_row_on_wire_with_cmd>& source);
void register_repair_put_row_diff_with_rpc_stream(std::function<future<rpc::sink<repair_stream_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_row_on_wire_with_cmd> source)>&& func);
future<> unregister_repair_put_row_diff_with_rpc_stream();
// Wrapper for REPAIR_GET_FULL_ROW_HASHES_WITH_RPC_STREAM
future<rpc::sink<repair_stream_cmd>, rpc::source<repair_hash_with_cmd>> make_sink_and_source_for_repair_get_full_row_hashes_with_rpc_stream(uint32_t repair_meta_id, msg_addr id);
rpc::sink<repair_hash_with_cmd> make_sink_for_repair_get_full_row_hashes_with_rpc_stream(rpc::source<repair_stream_cmd>& source);
void register_repair_get_full_row_hashes_with_rpc_stream(std::function<future<rpc::sink<repair_hash_with_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_stream_cmd> source)>&& func);
future<> unregister_repair_get_full_row_hashes_with_rpc_stream();
void register_stream_mutation_done(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id)>&& func);
future<> send_stream_mutation_done(msg_addr id, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id);
future<> unregister_stream_mutation_done();
void register_complete_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id, rpc::optional<bool> failed)>&& func);
future<> send_complete_message(msg_addr id, UUID plan_id, unsigned dst_cpu_id, bool failed = false);
future<> unregister_complete_message();
// Wrapper for REPAIR_CHECKSUM_RANGE verb
void register_repair_checksum_range(std::function<future<partition_checksum> (sstring keyspace, sstring cf, dht::token_range range, rpc::optional<repair_checksum> hash_version)>&& func);
@@ -318,9 +326,9 @@ public:
future<partition_checksum> send_repair_checksum_range(msg_addr id, sstring keyspace, sstring cf, dht::token_range range, repair_checksum hash_version);
// Wrapper for REPAIR_GET_FULL_ROW_HASHES
void register_repair_get_full_row_hashes(std::function<future<std::unordered_set<repair_hash>> (const rpc::client_info& cinfo, uint32_t repair_meta_id)>&& func);
void register_repair_get_full_row_hashes(std::function<future<repair_hash_set> (const rpc::client_info& cinfo, uint32_t repair_meta_id)>&& func);
future<> unregister_repair_get_full_row_hashes();
future<std::unordered_set<repair_hash>> send_repair_get_full_row_hashes(msg_addr id, uint32_t repair_meta_id);
future<repair_hash_set> send_repair_get_full_row_hashes(msg_addr id, uint32_t repair_meta_id);
// Wrapper for REPAIR_GET_COMBINED_ROW_HASH
void register_repair_get_combined_row_hash(std::function<future<get_combined_row_hash_response> (const rpc::client_info& cinfo, uint32_t repair_meta_id, std::optional<repair_sync_boundary> common_sync_boundary)>&& func);
@@ -333,9 +341,9 @@ public:
future<get_sync_boundary_response> send_repair_get_sync_boundary(msg_addr id, uint32_t repair_meta_id, std::optional<repair_sync_boundary> skipped_sync_boundary);
// Wrapper for REPAIR_GET_ROW_DIFF
void register_repair_get_row_diff(std::function<future<repair_rows_on_wire> (const rpc::client_info& cinfo, uint32_t repair_meta_id, std::unordered_set<repair_hash> set_diff, bool needs_all_rows)>&& func);
void register_repair_get_row_diff(std::function<future<repair_rows_on_wire> (const rpc::client_info& cinfo, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows)>&& func);
future<> unregister_repair_get_row_diff();
future<repair_rows_on_wire> send_repair_get_row_diff(msg_addr id, uint32_t repair_meta_id, std::unordered_set<repair_hash> set_diff, bool needs_all_rows);
future<repair_rows_on_wire> send_repair_get_row_diff(msg_addr id, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows);
// Wrapper for REPAIR_PUT_ROW_DIFF
void register_repair_put_row_diff(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, repair_rows_on_wire row_diff)>&& func);

View File

@@ -195,6 +195,7 @@ class read_context : public reader_lifecycle_policy {
// One for each shard. Index is shard id.
std::vector<reader_meta> _readers;
std::vector<reader_concurrency_semaphore*> _semaphores;
gate _dismantling_gate;
@@ -211,7 +212,8 @@ public:
, _schema(std::move(s))
, _cmd(cmd)
, _ranges(ranges)
, _trace_state(std::move(trace_state)) {
, _trace_state(std::move(trace_state))
, _semaphores(smp::count, nullptr) {
_readers.resize(smp::count);
}
@@ -236,7 +238,12 @@ public:
virtual void destroy_reader(shard_id shard, future<stopped_reader> reader_fut) noexcept override;
virtual reader_concurrency_semaphore& semaphore() override {
return _readers[this_shard_id()].rparts->semaphore;
const auto shard = this_shard_id();
if (!_semaphores[shard]) {
auto& table = _db.local().find_column_family(_schema);
_semaphores[shard] = &table.read_concurrency_semaphore();
}
return *_semaphores[shard];
}
future<> lookup_readers();

View File

@@ -734,56 +734,78 @@ void write_counter_cell(RowWriter& w, const query::partition_slice& slice, ::ato
});
}
// Used to return the timestamp of the latest update to the row
struct max_timestamp {
api::timestamp_type max = api::missing_timestamp;
void update(api::timestamp_type ts) {
max = std::max(max, ts);
}
};
template<>
struct appending_hash<row> {
template<typename Hasher>
void operator()(Hasher& h, const row& cells, const schema& s, column_kind kind, const query::column_id_vector& columns, max_timestamp& max_ts) const {
for (auto id : columns) {
const cell_and_hash* cell_and_hash = cells.find_cell_and_hash(id);
if (!cell_and_hash) {
return;
}
auto&& def = s.column_at(kind, id);
if (def.is_atomic()) {
max_ts.update(cell_and_hash->cell.as_atomic_cell(def).timestamp());
if constexpr (query::using_hash_of_hash_v<Hasher>) {
if (cell_and_hash->hash) {
feed_hash(h, *cell_and_hash->hash);
} else {
query::default_hasher cellh;
feed_hash(cellh, cell_and_hash->cell.as_atomic_cell(def), def);
feed_hash(h, cellh.finalize_uint64());
}
template<typename Hasher>
void appending_hash<row>::operator()(Hasher& h, const row& cells, const schema& s, column_kind kind, const query::column_id_vector& columns, max_timestamp& max_ts) const {
for (auto id : columns) {
const cell_and_hash* cell_and_hash = cells.find_cell_and_hash(id);
if (!cell_and_hash) {
feed_hash(h, appending_hash<row>::null_hash_value);
continue;
}
auto&& def = s.column_at(kind, id);
if (def.is_atomic()) {
max_ts.update(cell_and_hash->cell.as_atomic_cell(def).timestamp());
if constexpr (query::using_hash_of_hash_v<Hasher>) {
if (cell_and_hash->hash) {
feed_hash(h, *cell_and_hash->hash);
} else {
feed_hash(h, cell_and_hash->cell.as_atomic_cell(def), def);
query::default_hasher cellh;
feed_hash(cellh, cell_and_hash->cell.as_atomic_cell(def), def);
feed_hash(h, cellh.finalize_uint64());
}
} else {
auto cm = cell_and_hash->cell.as_collection_mutation();
max_ts.update(cm.last_update(*def.type));
if constexpr (query::using_hash_of_hash_v<Hasher>) {
if (cell_and_hash->hash) {
feed_hash(h, *cell_and_hash->hash);
} else {
query::default_hasher cellh;
feed_hash(cellh, cm, def);
feed_hash(h, cellh.finalize_uint64());
}
feed_hash(h, cell_and_hash->cell.as_atomic_cell(def), def);
}
} else {
auto cm = cell_and_hash->cell.as_collection_mutation();
max_ts.update(cm.last_update(*def.type));
if constexpr (query::using_hash_of_hash_v<Hasher>) {
if (cell_and_hash->hash) {
feed_hash(h, *cell_and_hash->hash);
} else {
feed_hash(h, cm, def);
query::default_hasher cellh;
feed_hash(cellh, cm, def);
feed_hash(h, cellh.finalize_uint64());
}
} else {
feed_hash(h, cm, def);
}
}
}
};
}
// Instantiation for mutation_test.cc
template void appending_hash<row>::operator()<xx_hasher>(xx_hasher& h, const row& cells, const schema& s, column_kind kind, const query::column_id_vector& columns, max_timestamp& max_ts) const;
template<>
void appending_hash<row>::operator()<legacy_xx_hasher_without_null_digest>(legacy_xx_hasher_without_null_digest& h, const row& cells, const schema& s, column_kind kind, const query::column_id_vector& columns, max_timestamp& max_ts) const {
for (auto id : columns) {
const cell_and_hash* cell_and_hash = cells.find_cell_and_hash(id);
if (!cell_and_hash) {
return;
}
auto&& def = s.column_at(kind, id);
if (def.is_atomic()) {
max_ts.update(cell_and_hash->cell.as_atomic_cell(def).timestamp());
if (cell_and_hash->hash) {
feed_hash(h, *cell_and_hash->hash);
} else {
query::default_hasher cellh;
feed_hash(cellh, cell_and_hash->cell.as_atomic_cell(def), def);
feed_hash(h, cellh.finalize_uint64());
}
} else {
auto cm = cell_and_hash->cell.as_collection_mutation();
max_ts.update(cm.last_update(*def.type));
if (cell_and_hash->hash) {
feed_hash(h, *cell_and_hash->hash);
} else {
query::default_hasher cellh;
feed_hash(cellh, cm, def);
feed_hash(h, cellh.finalize_uint64());
}
}
}
}
cell_hash_opt row::cell_hash_for(column_id id) const {
if (_type == storage_type::vector) {
@@ -1721,7 +1743,7 @@ void row::apply_monotonically(const schema& s, column_kind kind, row&& other) {
// we erase the live cells according to the shadowable_tombstone rules.
static bool dead_marker_shadows_row(const schema& s, column_kind kind, const row_marker& marker) {
return s.is_view()
&& !s.view_info()->base_non_pk_columns_in_view_pk().empty()
&& s.view_info()->has_base_non_pk_columns_in_view_pk()
&& !marker.is_live()
&& kind == column_kind::regular_column; // not applicable to static rows
}
@@ -2505,7 +2527,8 @@ mutation_partition::fully_discontinuous(const schema& s, const position_range& r
future<mutation_opt> counter_write_query(schema_ptr s, const mutation_source& source,
const dht::decorated_key& dk,
const query::partition_slice& slice,
tracing::trace_state_ptr trace_ptr)
tracing::trace_state_ptr trace_ptr,
db::timeout_clock::time_point timeout)
{
struct range_and_reader {
dht::partition_range range;
@@ -2530,7 +2553,7 @@ future<mutation_opt> counter_write_query(schema_ptr s, const mutation_source& so
auto cwqrb = counter_write_query_result_builder(*s);
auto cfq = make_stable_flattened_mutations_consumer<compact_for_query<emit_only_live_rows::yes, counter_write_query_result_builder>>(
*s, gc_clock::now(), slice, query::max_rows, query::max_rows, std::move(cwqrb));
auto f = r_a_r->reader.consume(std::move(cfq), db::no_timeout);
auto f = r_a_r->reader.consume(std::move(cfq), timeout);
return f.finally([r_a_r = std::move(r_a_r)] { });
}
@@ -2605,7 +2628,7 @@ void mutation_cleaner_impl::start_worker() {
stop_iteration mutation_cleaner_impl::merge_some(partition_snapshot& snp) noexcept {
auto&& region = snp.region();
return with_allocator(region.allocator(), [&] {
return with_linearized_managed_bytes([&] {
{
// Allocating sections require the region to be reclaimable
// which means that they cannot be nested.
// It is, however, possible, that if the snapshot is taken
@@ -2617,13 +2640,15 @@ stop_iteration mutation_cleaner_impl::merge_some(partition_snapshot& snp) noexce
}
try {
return _worker_state->alloc_section(region, [&] {
return with_linearized_managed_bytes([&] {
return snp.merge_partition_versions(_app_stats);
});
});
} catch (...) {
// Merging failed, give up as there is no guarantee of forward progress.
return stop_iteration::yes;
}
});
}
});
}

View File

@@ -650,6 +650,22 @@ public:
};
};
// Used to return the timestamp of the latest update to the row
struct max_timestamp {
api::timestamp_type max = api::missing_timestamp;
void update(api::timestamp_type ts) {
max = std::max(max, ts);
}
};
template<>
struct appending_hash<row> {
static constexpr int null_hash_value = 0xbeefcafe;
template<typename Hasher>
void operator()(Hasher& h, const row& cells, const schema& s, column_kind kind, const query::column_id_vector& columns, max_timestamp& max_ts) const;
};
class row_marker;
int compare_row_marker_for_merge(const row_marker& left, const row_marker& right) noexcept;

View File

@@ -113,9 +113,6 @@ class reconcilable_result_builder {
const schema& _schema;
const query::partition_slice& _slice;
utils::chunked_vector<partition> _result;
uint32_t _live_rows{};
bool _return_static_content_on_partition_with_no_rows{};
bool _static_row_is_alive{};
uint32_t _total_live_rows = 0;
@@ -123,6 +120,10 @@ class reconcilable_result_builder {
stop_iteration _stop;
bool _short_read_allowed;
std::optional<streamed_mutation_freezer> _mutation_consumer;
uint32_t _live_rows{};
// make this the last member so it is destroyed first. #7240
utils::chunked_vector<partition> _result;
public:
reconcilable_result_builder(const schema& s, const query::partition_slice& slice,
query::result_memory_accounter&& accounter)
@@ -206,5 +207,6 @@ public:
future<mutation_opt> counter_write_query(schema_ptr, const mutation_source&,
const dht::decorated_key& dk,
const query::partition_slice& slice,
tracing::trace_state_ptr trace_ptr);
tracing::trace_state_ptr trace_ptr,
db::timeout_clock::time_point timeout);

File diff suppressed because it is too large Load Diff

View File

@@ -372,6 +372,64 @@ flat_mutation_reader make_foreign_reader(schema_ptr schema,
foreign_ptr<std::unique_ptr<flat_mutation_reader>> reader,
streamed_mutation::forwarding fwd_sm = streamed_mutation::forwarding::no);
/// Make an auto-paused evictable reader.
///
/// The reader is paused after each use, that is after each call to any of its
/// members that cause actual reading to be done (`fill_buffer()` and
/// `fast_forward_to()`). When paused, the reader is made evictable, that it is
/// it is registered with reader concurrency semaphore as an inactive read.
/// The reader is resumed automatically on the next use. If it was evicted, it
/// will be recreated at the position it left off reading. This is all
/// transparent to its user.
/// Parameters passed by reference have to be kept alive while the reader is
/// alive.
flat_mutation_reader make_auto_paused_evictable_reader(
mutation_source ms,
schema_ptr schema,
reader_concurrency_semaphore& semaphore,
const dht::partition_range& pr,
const query::partition_slice& ps,
const io_priority_class& pc,
tracing::trace_state_ptr trace_state,
mutation_reader::forwarding fwd_mr);
class evictable_reader;
class evictable_reader_handle {
friend std::pair<flat_mutation_reader, evictable_reader_handle> make_manually_paused_evictable_reader(mutation_source, schema_ptr, reader_concurrency_semaphore&,
const dht::partition_range&, const query::partition_slice&, const io_priority_class&, tracing::trace_state_ptr, mutation_reader::forwarding);
private:
evictable_reader* _r;
private:
explicit evictable_reader_handle(evictable_reader& r);
public:
void pause();
};
/// Make a manually-paused evictable reader.
///
/// The reader can be paused via the evictable reader handle when desired. The
/// intended usage is subsequent reads done in bursts, after which the reader is
/// not used for some time. When paused, the reader is made evictable, that is,
/// it is registered with reader concurrency semaphore as an inactive read.
/// The reader is resumed automatically on the next use. If it was evicted, it
/// will be recreated at the position it left off reading. This is all
/// transparent to its user.
/// Parameters passed by reference have to be kept alive while the reader is
/// alive.
std::pair<flat_mutation_reader, evictable_reader_handle> make_manually_paused_evictable_reader(
mutation_source ms,
schema_ptr schema,
reader_concurrency_semaphore& semaphore,
const dht::partition_range& pr,
const query::partition_slice& ps,
const io_priority_class& pc,
tracing::trace_state_ptr trace_state,
mutation_reader::forwarding fwd_mr);
/// Reader lifecycle policy for the mulitshard combining reader.
///
/// This policy is expected to make sure any additional resource the readers

View File

@@ -38,8 +38,14 @@ future<> feed_writer(flat_mutation_reader&& rd, Writer&& wr) {
auto f2 = rd.is_buffer_empty() ? rd.fill_buffer(db::no_timeout) : make_ready_future<>();
return when_all_succeed(std::move(f1), std::move(f2));
});
}).finally([&wr] {
return wr.consume_end_of_stream();
}).then_wrapped([&wr] (future<> f) {
if (f.failed()) {
auto ex = f.get_exception();
wr.abort(ex);
return make_exception_future<>(ex);
} else {
return wr.consume_end_of_stream();
}
});
});
}

View File

@@ -57,6 +57,9 @@ class shard_based_splitting_mutation_writer {
}
return std::move(_consume_fut);
}
void abort(std::exception_ptr ep) {
_handle.abort(ep);
}
};
private:
@@ -108,6 +111,13 @@ public:
return shard->consume_end_of_stream();
});
}
void abort(std::exception_ptr ep) {
for (auto&& shard : _shards) {
if (shard) {
shard->abort(ep);
}
}
}
};
future<> segregate_by_shard(flat_mutation_reader producer, reader_consumer consumer) {

View File

@@ -144,6 +144,9 @@ class timestamp_based_splitting_mutation_writer {
}
return std::move(_consume_fut);
}
void abort(std::exception_ptr ep) {
_handle.abort(ep);
}
};
private:
@@ -186,6 +189,11 @@ public:
return bucket.second.consume_end_of_stream();
});
}
void abort(std::exception_ptr ep) {
for (auto&& b : _buckets) {
b.second.abort(ep);
}
}
};
future<> timestamp_based_splitting_mutation_writer::write_to_bucket(bucket_id bucket, mutation_fragment&& mf) {

View File

@@ -650,12 +650,12 @@ partition_snapshot_ptr partition_entry::read(logalloc::region& r,
return partition_snapshot_ptr(std::move(snp));
}
std::vector<range_tombstone>
partition_snapshot::range_tombstone_result
partition_snapshot::range_tombstones(position_in_partition_view start, position_in_partition_view end)
{
partition_version* v = &*version();
if (!v->next()) {
return boost::copy_range<std::vector<range_tombstone>>(
return boost::copy_range<range_tombstone_result>(
v->partition().row_tombstones().slice(*_schema, start, end));
}
range_tombstone_list list(*_schema);
@@ -665,10 +665,10 @@ partition_snapshot::range_tombstones(position_in_partition_view start, position_
}
v = v->next();
}
return boost::copy_range<std::vector<range_tombstone>>(list.slice(*_schema, start, end));
return boost::copy_range<range_tombstone_result>(list.slice(*_schema, start, end));
}
std::vector<range_tombstone>
partition_snapshot::range_tombstone_result
partition_snapshot::range_tombstones()
{
return range_tombstones(

View File

@@ -26,6 +26,7 @@
#include "utils/anchorless_list.hh"
#include "utils/logalloc.hh"
#include "utils/coroutine.hh"
#include "utils/chunked_vector.hh"
#include <boost/intrusive/parent_from_member.hpp>
#include <boost/intrusive/slist.hpp>
@@ -400,10 +401,13 @@ public:
::static_row static_row(bool digest_requested) const;
bool static_row_continuous() const;
mutation_partition squashed() const;
using range_tombstone_result = utils::chunked_vector<range_tombstone>;
// Returns range tombstones overlapping with [start, end)
std::vector<range_tombstone> range_tombstones(position_in_partition_view start, position_in_partition_view end);
range_tombstone_result range_tombstones(position_in_partition_view start, position_in_partition_view end);
// Returns all range tombstones
std::vector<range_tombstone> range_tombstones();
range_tombstone_result range_tombstones();
};
class partition_snapshot_ptr {

View File

@@ -163,6 +163,11 @@ public:
return {partition_region::clustered, bound_weight::before_all_prefixed, &ck};
}
// Returns a view to before_key(pos._ck) if pos.is_clustering_row() else returns pos as-is.
static position_in_partition_view before_key(position_in_partition_view pos) {
return {partition_region::clustered, pos._bound_weight == bound_weight::equal ? bound_weight::before_all_prefixed : pos._bound_weight, pos._ck};
}
partition_region region() const { return _type; }
bound_weight get_bound_weight() const { return _bound_weight; }
bool is_partition_start() const { return _type == partition_region::partition_start; }

View File

@@ -27,6 +27,7 @@
reader_permit::impl::impl(reader_concurrency_semaphore& semaphore, reader_resources base_cost) : semaphore(semaphore), base_cost(base_cost) {
semaphore.consume(base_cost);
}
reader_permit::impl::~impl() {
@@ -88,7 +89,6 @@ void reader_concurrency_semaphore::signal(const resources& r) noexcept {
_resources += r;
while (!_wait_list.empty() && has_available_units(_wait_list.front().res)) {
auto& x = _wait_list.front();
_resources -= x.res;
try {
x.pr.set_value(reader_permit(*this, x.res));
} catch (...) {
@@ -160,7 +160,6 @@ future<reader_permit> reader_concurrency_semaphore::wait_admission(size_t memory
--_inactive_read_stats.population;
}
if (may_proceed(r)) {
_resources -= r;
return make_ready_future<reader_permit>(reader_permit(*this, r));
}
promise<reader_permit> pr;
@@ -170,7 +169,6 @@ future<reader_permit> reader_concurrency_semaphore::wait_admission(size_t memory
}
reader_permit reader_concurrency_semaphore::consume_resources(resources r) {
_resources -= r;
return reader_permit(*this, r);
}

View File

@@ -128,6 +128,10 @@ private:
return has_available_units(r) && _wait_list.empty();
}
void consume(resources r) {
_resources -= r;
}
void consume_memory(size_t memory) {
_resources.memory -= memory;
}

View File

@@ -63,7 +63,7 @@ shared_ptr<abstract_command> exists::prepare(service::storage_proxy& proxy, requ
}
future<redis_message> exists::execute(service::storage_proxy& proxy, redis::redis_options& options, service_permit permit) {
return seastar::do_for_each(_keys, [&proxy, &options, &permit, this] (bytes key) {
return seastar::do_for_each(_keys, [&proxy, &options, permit, this] (bytes& key) {
return redis::read_strings(proxy, options, key, permit).then([this] (lw_shared_ptr<strings_result> result) {
if (result->has_result()) {
_count++;

View File

@@ -12,7 +12,11 @@
# At the end of the build we check that the build-id is indeed in the
# first page. At install time we check that patchelf doesn't modify
# the program headers.
# gdb has a SO_NAME_MAX_PATH_SIZE of 512, so limit the path size to
# that. The 512 includes the null at the end, hence the 511 bellow.
ORIGINAL_DYNAMIC_LINKER=$(gcc -### /dev/null -o t 2>&1 | perl -n -e '/-dynamic-linker ([^ ]*) / && print $1')
DYNAMIC_LINKER=$(printf "%2000s$ORIGINAL_DYNAMIC_LINKER" | sed 's| |/|g')
DYNAMIC_LINKER=$(printf "%511s$ORIGINAL_DYNAMIC_LINKER" | sed 's| |/|g')
echo $DYNAMIC_LINKER

View File

@@ -1945,7 +1945,7 @@ future<> rebuild_with_repair(seastar::sharded<database>& db, locator::token_meta
future<> replace_with_repair(seastar::sharded<database>& db, locator::token_metadata tm, std::unordered_set<dht::token> replacing_tokens) {
auto op = sstring("replace_with_repair");
auto source_dc = get_local_dc();
auto reason = streaming::stream_reason::bootstrap;
auto reason = streaming::stream_reason::replace;
tm.update_normal_tokens(replacing_tokens, utils::fb_utilities::get_broadcast_address());
return do_rebuild_replace_with_repair(db, std::move(tm), std::move(op), std::move(source_dc), reason);
}

View File

@@ -23,6 +23,7 @@
#include <unordered_map>
#include <exception>
#include <absl/container/btree_set.h>
#include <seastar/core/sstring.hh>
#include <seastar/core/sharded.hh>
@@ -334,6 +335,8 @@ public:
}
};
using repair_hash_set = absl::btree_set<repair_hash>;
// Return value of the REPAIR_GET_SYNC_BOUNDARY RPC verb
struct get_sync_boundary_response {
std::optional<repair_sync_boundary> boundary;

View File

@@ -47,6 +47,7 @@
#include "gms/gossiper.hh"
#include "repair/row_level.hh"
#include "mutation_source_metadata.hh"
#include "utils/stall_free.hh"
extern logging::logger rlogger;
@@ -372,6 +373,7 @@ private:
std::optional<utils::phased_barrier::operation> _local_read_op;
// Local reader or multishard reader to read the range
flat_mutation_reader _reader;
std::optional<evictable_reader_handle> _reader_handle;
// Current partition read from disk
lw_shared_ptr<const decorated_key_with_hash> _current_dk;
@@ -390,32 +392,49 @@ public:
, _sharder(remote_sharder, range, remote_shard)
, _seed(seed)
, _local_read_op(local_reader ? std::optional(cf.read_in_progress()) : std::nullopt)
, _reader(make_reader(db, cf, local_reader)) {
}
private:
flat_mutation_reader
make_reader(seastar::sharded<database>& db,
column_family& cf,
is_local_reader local_reader) {
, _reader(nullptr) {
if (local_reader) {
return cf.make_streaming_reader(_schema, _range);
auto ms = mutation_source([&cf] (
schema_ptr s,
reader_permit,
const dht::partition_range& pr,
const query::partition_slice& ps,
const io_priority_class& pc,
tracing::trace_state_ptr,
streamed_mutation::forwarding,
mutation_reader::forwarding fwd_mr) {
return cf.make_streaming_reader(std::move(s), pr, ps, fwd_mr);
});
std::tie(_reader, _reader_handle) = make_manually_paused_evictable_reader(
std::move(ms),
_schema,
cf.streaming_read_concurrency_semaphore(),
_range,
_schema->full_slice(),
service::get_local_streaming_read_priority(),
{},
mutation_reader::forwarding::no);
} else {
_reader = make_multishard_streaming_reader(db, _schema, [this] {
auto shard_range = _sharder.next();
if (shard_range) {
return std::optional<dht::partition_range>(dht::to_partition_range(*shard_range));
}
return std::optional<dht::partition_range>();
});
}
return make_multishard_streaming_reader(db, _schema, [this] {
auto shard_range = _sharder.next();
if (shard_range) {
return std::optional<dht::partition_range>(dht::to_partition_range(*shard_range));
}
return std::optional<dht::partition_range>();
});
}
public:
future<mutation_fragment_opt>
read_mutation_fragment() {
return _reader(db::no_timeout);
}
void on_end_of_stream() {
_reader = make_empty_flat_reader(_schema);
_reader_handle.reset();
}
lw_shared_ptr<const decorated_key_with_hash>& get_current_dk() {
return _current_dk;
}
@@ -434,9 +453,14 @@ public:
}
}
void pause() {
if (_reader_handle) {
_reader_handle->pause();
}
}
};
class repair_writer {
class repair_writer : public enable_lw_shared_from_this<repair_writer> {
schema_ptr _schema;
uint64_t _estimated_partitions;
size_t _nr_peer_nodes;
@@ -450,6 +474,7 @@ class repair_writer {
// written.
std::vector<bool> _partition_opened;
streaming::stream_reason _reason;
named_semaphore _sem{1, named_semaphore_exception_factory{"repair_writer"}};
public:
repair_writer(
schema_ptr schema,
@@ -494,9 +519,10 @@ public:
return _mq[node_idx]->pop_eventually();
};
table& t = db.local().find_column_family(_schema->id());
auto writer = shared_from_this();
_writer_done[node_idx] = mutation_writer::distribute_reader_and_consume_on_shards(_schema,
make_generating_reader(_schema, std::move(get_next_mutation_fragment)),
[&db, reason = this->_reason, estimated_partitions = this->_estimated_partitions] (flat_mutation_reader reader) {
[&db, reason = this->_reason, estimated_partitions = this->_estimated_partitions, writer] (flat_mutation_reader reader) {
auto& t = db.local().find_column_family(reader.schema());
return db::view::check_needs_view_update_path(_sys_dist_ks->local(), t, reason).then([t = t.shared_from_this(), estimated_partitions, reader = std::move(reader)] (bool use_view_update_path) mutable {
//FIXME: for better estimations this should be transmitted from remote
@@ -508,7 +534,7 @@ public:
sstables::shared_sstable sst = use_view_update_path ? t->make_streaming_staging_sstable() : t->make_streaming_sstable_for_write();
schema_ptr s = reader.schema();
auto& pc = service::get_local_streaming_write_priority();
return sst->write_components(std::move(reader), std::max(1ul, adjusted_estimated_partitions), s,
return sst->write_components(std::move(reader), adjusted_estimated_partitions, s,
t->get_sstables_manager().configure_writer(),
encoding_stats{}, pc).then([sst] {
return sst->open_data();
@@ -524,13 +550,13 @@ public:
return consumer(std::move(reader));
});
},
t.stream_in_progress()).then([this, node_idx] (uint64_t partitions) {
t.stream_in_progress()).then([node_idx, writer] (uint64_t partitions) {
rlogger.debug("repair_writer: keyspace={}, table={}, managed to write partitions={} to sstable",
_schema->ks_name(), _schema->cf_name(), partitions);
}).handle_exception([this, node_idx] (std::exception_ptr ep) {
writer->_schema->ks_name(), writer->_schema->cf_name(), partitions);
}).handle_exception([node_idx, writer] (std::exception_ptr ep) {
rlogger.warn("repair_writer: keyspace={}, table={}, multishard_writer failed: {}",
_schema->ks_name(), _schema->cf_name(), ep);
_mq[node_idx]->abort(ep);
writer->_schema->ks_name(), writer->_schema->cf_name(), ep);
writer->_mq[node_idx]->abort(ep);
return make_exception_future<>(std::move(ep));
});
}
@@ -561,11 +587,18 @@ public:
future<> write_end_of_stream(unsigned node_idx) {
if (_mq[node_idx]) {
return with_semaphore(_sem, 1, [this, node_idx] {
// Partition_end is never sent on wire, so we have to write one ourselves.
return write_partition_end(node_idx).then([this, node_idx] () mutable {
// Empty mutation_fragment_opt means no more data, so the writer can seal the sstables.
return _mq[node_idx]->push_eventually(mutation_fragment_opt());
}).handle_exception([this, node_idx] (std::exception_ptr ep) {
_mq[node_idx]->abort(ep);
rlogger.warn("repair_writer: keyspace={}, table={}, write_end_of_stream failed: {}",
_schema->ks_name(), _schema->cf_name(), ep);
return make_exception_future<>(std::move(ep));
});
});
} else {
return make_ready_future<>();
}
@@ -588,6 +621,10 @@ public:
return make_exception_future<>(std::move(ep));
});
}
named_semaphore& sem() {
return _sem;
}
};
class repair_meta {
@@ -623,7 +660,7 @@ private:
size_t _nr_peer_nodes= 1;
repair_stats _stats;
repair_reader _repair_reader;
repair_writer _repair_writer;
lw_shared_ptr<repair_writer> _repair_writer;
// Contains rows read from disk
std::list<repair_row> _row_buf;
// Contains rows we are working on to sync between peers
@@ -635,7 +672,7 @@ private:
// Tracks current sync boundary
std::optional<repair_sync_boundary> _current_sync_boundary;
// Contains the hashes of rows in the _working_row_buffor for all peer nodes
std::vector<std::unordered_set<repair_hash>> _peer_row_hash_sets;
std::vector<repair_hash_set> _peer_row_hash_sets;
// Gate used to make sure pending operation of meta data is done
seastar::gate _gate;
sink_source_for_get_full_row_hashes _sink_source_for_get_full_row_hashes;
@@ -704,7 +741,7 @@ public:
_seed,
repair_reader::is_local_reader(_repair_master || _same_sharding_config)
)
, _repair_writer(_schema, _estimated_partitions, _nr_peer_nodes, _reason)
, _repair_writer(make_lw_shared<repair_writer>(_schema, _estimated_partitions, _nr_peer_nodes, _reason))
, _sink_source_for_get_full_row_hashes(_repair_meta_id, _nr_peer_nodes,
[] (uint32_t repair_meta_id, netw::messaging_service::msg_addr addr) {
return netw::get_local_messaging_service().make_sink_and_source_for_repair_get_full_row_hashes_with_rpc_stream(repair_meta_id, addr);
@@ -723,11 +760,12 @@ public:
public:
future<> stop() {
auto gate_future = _gate.close();
auto writer_future = _repair_writer.wait_for_writer_done();
auto f1 = _sink_source_for_get_full_row_hashes.close();
auto f2 = _sink_source_for_get_row_diff.close();
auto f3 = _sink_source_for_put_row_diff.close();
return when_all_succeed(std::move(gate_future), std::move(writer_future), std::move(f1), std::move(f2), std::move(f3));
return when_all_succeed(std::move(gate_future), std::move(f1), std::move(f2), std::move(f3)).finally([this] {
return _repair_writer->wait_for_writer_done();
});
}
static std::unordered_map<node_repair_meta_id, lw_shared_ptr<repair_meta>>& repair_meta_map() {
@@ -855,9 +893,9 @@ public:
}
// Must run inside a seastar thread
static std::unordered_set<repair_hash>
get_set_diff(const std::unordered_set<repair_hash>& x, const std::unordered_set<repair_hash>& y) {
std::unordered_set<repair_hash> set_diff;
static repair_hash_set
get_set_diff(const repair_hash_set& x, const repair_hash_set& y) {
repair_hash_set set_diff;
// Note std::set_difference needs x and y are sorted.
std::copy_if(x.begin(), x.end(), std::inserter(set_diff, set_diff.end()),
[&y] (auto& item) { thread::maybe_yield(); return y.find(item) == y.end(); });
@@ -875,14 +913,14 @@ public:
}
std::unordered_set<repair_hash>& peer_row_hash_sets(unsigned node_idx) {
repair_hash_set& peer_row_hash_sets(unsigned node_idx) {
return _peer_row_hash_sets[node_idx];
}
// Get a list of row hashes in _working_row_buf
future<std::unordered_set<repair_hash>>
future<repair_hash_set>
working_row_hashes() {
return do_with(std::unordered_set<repair_hash>(), [this] (std::unordered_set<repair_hash>& hashes) {
return do_with(repair_hash_set(), [this] (repair_hash_set& hashes) {
return do_for_each(_working_row_buf, [&hashes] (repair_row& r) {
hashes.emplace(r.hash());
}).then([&hashes] {
@@ -1007,11 +1045,7 @@ private:
return repair_hash(h.finalize_uint64());
}
stop_iteration handle_mutation_fragment(mutation_fragment_opt mfopt, size_t& cur_size, size_t& new_rows_size, std::list<repair_row>& cur_rows) {
if (!mfopt) {
return stop_iteration::yes;
}
mutation_fragment& mf = *mfopt;
stop_iteration handle_mutation_fragment(mutation_fragment& mf, size_t& cur_size, size_t& new_rows_size, std::list<repair_row>& cur_rows) {
if (mf.is_partition_start()) {
auto& start = mf.as_partition_start();
_repair_reader.set_current_dk(start.key());
@@ -1046,32 +1080,49 @@ private:
}
_gate.check();
return _repair_reader.read_mutation_fragment().then([this, &cur_size, &new_rows_size, &cur_rows] (mutation_fragment_opt mfopt) mutable {
return handle_mutation_fragment(std::move(mfopt), cur_size, new_rows_size, cur_rows);
if (!mfopt) {
_repair_reader.on_end_of_stream();
return stop_iteration::yes;
}
return handle_mutation_fragment(*mfopt, cur_size, new_rows_size, cur_rows);
});
}).then([&cur_rows, &new_rows_size] () mutable {
}).then_wrapped([this, &cur_rows, &new_rows_size] (future<> fut) mutable {
if (fut.failed()) {
_repair_reader.on_end_of_stream();
return make_exception_future<std::list<repair_row>, size_t>(fut.get_exception());
}
_repair_reader.pause();
return make_ready_future<std::list<repair_row>, size_t>(std::move(cur_rows), new_rows_size);
});
});
}
future<> clear_row_buf() {
return utils::clear_gently(_row_buf);
}
future<> clear_working_row_buf() {
return utils::clear_gently(_working_row_buf).then([this] {
_working_row_buf_combined_hash.clear();
});
}
// Read rows from disk until _max_row_buf_size of rows are filled into _row_buf.
// Calculate the combined checksum of the rows
// Calculate the total size of the rows in _row_buf
future<get_sync_boundary_response>
get_sync_boundary(std::optional<repair_sync_boundary> skipped_sync_boundary) {
auto f = make_ready_future<>();
if (skipped_sync_boundary) {
_current_sync_boundary = skipped_sync_boundary;
_row_buf.clear();
_working_row_buf.clear();
_working_row_buf_combined_hash.clear();
} else {
_working_row_buf.clear();
_working_row_buf_combined_hash.clear();
f = clear_row_buf();
}
// Here is the place we update _last_sync_boundary
rlogger.trace("SET _last_sync_boundary from {} to {}", _last_sync_boundary, _current_sync_boundary);
_last_sync_boundary = _current_sync_boundary;
return row_buf_size().then([this, sb = std::move(skipped_sync_boundary)] (size_t cur_size) {
return f.then([this, sb = std::move(skipped_sync_boundary)] () mutable {
return clear_working_row_buf().then([this, sb = sb] () mutable {
return row_buf_size().then([this, sb = std::move(sb)] (size_t cur_size) {
return read_rows_from_disk(cur_size).then([this, sb = std::move(sb)] (std::list<repair_row> new_rows, size_t new_rows_size) mutable {
size_t new_rows_nr = new_rows.size();
_row_buf.splice(_row_buf.end(), new_rows);
@@ -1088,6 +1139,8 @@ private:
});
});
});
});
});
}
future<> move_row_buf_to_working_row_buf() {
@@ -1163,9 +1216,9 @@ private:
}
future<std::list<repair_row>>
copy_rows_from_working_row_buf_within_set_diff(std::unordered_set<repair_hash> set_diff) {
copy_rows_from_working_row_buf_within_set_diff(repair_hash_set set_diff) {
return do_with(std::list<repair_row>(), std::move(set_diff),
[this] (std::list<repair_row>& rows, std::unordered_set<repair_hash>& set_diff) {
[this] (std::list<repair_row>& rows, repair_hash_set& set_diff) {
return do_for_each(_working_row_buf, [this, &set_diff, &rows] (const repair_row& r) {
if (set_diff.count(r.hash()) > 0) {
rows.push_back(r);
@@ -1180,7 +1233,7 @@ private:
// Give a set of row hashes, return the corresponding rows
// If needs_all_rows is set, return all the rows in _working_row_buf, ignore the set_diff
future<std::list<repair_row>>
get_row_diff(std::unordered_set<repair_hash> set_diff, needs_all_rows_t needs_all_rows = needs_all_rows_t::no) {
get_row_diff(repair_hash_set set_diff, needs_all_rows_t needs_all_rows = needs_all_rows_t::no) {
if (needs_all_rows) {
if (!_repair_master || _nr_peer_nodes == 1) {
return make_ready_future<std::list<repair_row>>(std::move(_working_row_buf));
@@ -1191,6 +1244,32 @@ private:
}
}
future<> do_apply_rows(std::list<repair_row>&& row_diff, unsigned node_idx, update_working_row_buf update_buf) {
return do_with(std::move(row_diff), [this, node_idx, update_buf] (std::list<repair_row>& row_diff) {
return with_semaphore(_repair_writer->sem(), 1, [this, node_idx, update_buf, &row_diff] {
_repair_writer->create_writer(_db, node_idx);
return repeat([this, node_idx, update_buf, &row_diff] () mutable {
if (row_diff.empty()) {
return make_ready_future<stop_iteration>(stop_iteration::yes);
}
repair_row& r = row_diff.front();
if (update_buf) {
_working_row_buf_combined_hash.add(r.hash());
}
// The repair_row here is supposed to have
// mutation_fragment attached because we have stored it in
// to_repair_rows_list above where the repair_row is created.
mutation_fragment mf = std::move(r.get_mutation_fragment());
auto dk_with_hash = r.get_dk_with_hash();
return _repair_writer->do_write(node_idx, std::move(dk_with_hash), std::move(mf)).then([&row_diff] {
row_diff.pop_front();
return make_ready_future<stop_iteration>(stop_iteration::no);
});
});
});
});
}
// Give a list of rows, apply the rows to disk and update the _working_row_buf and _peer_row_hash_sets if requested
// Must run inside a seastar thread
void apply_rows_on_master_in_thread(repair_rows_on_wire rows, gms::inet_address from, update_working_row_buf update_buf,
@@ -1204,30 +1283,17 @@ private:
stats().rx_row_nr += row_diff.size();
stats().rx_row_nr_peer[from] += row_diff.size();
if (update_buf) {
std::list<repair_row> tmp;
tmp.swap(_working_row_buf);
// Both row_diff and _working_row_buf and are ordered, merging
// two sored list to make sure the combination of row_diff
// and _working_row_buf are ordered.
std::merge(tmp.begin(), tmp.end(), row_diff.begin(), row_diff.end(), std::back_inserter(_working_row_buf),
[this] (const repair_row& x, const repair_row& y) { thread::maybe_yield(); return _cmp(x.boundary(), y.boundary()) < 0; });
utils::merge_to_gently(_working_row_buf, row_diff,
[this] (const repair_row& x, const repair_row& y) { return _cmp(x.boundary(), y.boundary()) < 0; });
}
if (update_hash_set) {
_peer_row_hash_sets[node_idx] = boost::copy_range<std::unordered_set<repair_hash>>(row_diff |
_peer_row_hash_sets[node_idx] = boost::copy_range<repair_hash_set>(row_diff |
boost::adaptors::transformed([] (repair_row& r) { thread::maybe_yield(); return r.hash(); }));
}
_repair_writer.create_writer(_db, node_idx);
for (auto& r : row_diff) {
if (update_buf) {
_working_row_buf_combined_hash.add(r.hash());
}
// The repair_row here is supposed to have
// mutation_fragment attached because we have stored it in
// to_repair_rows_list above where the repair_row is created.
mutation_fragment mf = std::move(r.get_mutation_fragment());
auto dk_with_hash = r.get_dk_with_hash();
_repair_writer.do_write(node_idx, std::move(dk_with_hash), std::move(mf)).get();
}
do_apply_rows(std::move(row_diff), node_idx, update_buf).get();
}
future<>
@@ -1235,19 +1301,9 @@ private:
if (rows.empty()) {
return make_ready_future<>();
}
return to_repair_rows_list(rows).then([this] (std::list<repair_row> row_diff) {
return do_with(std::move(row_diff), [this] (std::list<repair_row>& row_diff) {
unsigned node_idx = 0;
_repair_writer.create_writer(_db, node_idx);
return do_for_each(row_diff, [this, node_idx] (repair_row& r) {
// The repair_row here is supposed to have
// mutation_fragment attached because we have stored it in
// to_repair_rows_list above where the repair_row is created.
mutation_fragment mf = std::move(r.get_mutation_fragment());
auto dk_with_hash = r.get_dk_with_hash();
return _repair_writer.do_write(node_idx, std::move(dk_with_hash), std::move(mf));
});
});
return to_repair_rows_list(std::move(rows)).then([this] (std::list<repair_row> row_diff) {
unsigned node_idx = 0;
return do_apply_rows(std::move(row_diff), node_idx, update_working_row_buf::no);
});
}
@@ -1326,13 +1382,13 @@ private:
public:
// RPC API
// Return the hashes of the rows in _working_row_buf
future<std::unordered_set<repair_hash>>
future<repair_hash_set>
get_full_row_hashes(gms::inet_address remote_node) {
if (remote_node == _myip) {
return get_full_row_hashes_handler();
}
return netw::get_local_messaging_service().send_repair_get_full_row_hashes(msg_addr(remote_node),
_repair_meta_id).then([this, remote_node] (std::unordered_set<repair_hash> hashes) {
_repair_meta_id).then([this, remote_node] (repair_hash_set hashes) {
rlogger.debug("Got full hashes from peer={}, nr_hashes={}", remote_node, hashes.size());
_metrics.rx_hashes_nr += hashes.size();
stats().rx_hashes_nr += hashes.size();
@@ -1343,7 +1399,7 @@ public:
private:
future<> get_full_row_hashes_source_op(
lw_shared_ptr<std::unordered_set<repair_hash>> current_hashes,
lw_shared_ptr<repair_hash_set> current_hashes,
gms::inet_address remote_node,
unsigned node_idx,
rpc::source<repair_hash_with_cmd>& source) {
@@ -1381,12 +1437,12 @@ private:
}
public:
future<std::unordered_set<repair_hash>>
future<repair_hash_set>
get_full_row_hashes_with_rpc_stream(gms::inet_address remote_node, unsigned node_idx) {
if (remote_node == _myip) {
return get_full_row_hashes_handler();
}
auto current_hashes = make_lw_shared<std::unordered_set<repair_hash>>();
auto current_hashes = make_lw_shared<repair_hash_set>();
return _sink_source_for_get_full_row_hashes.get_sink_source(remote_node, node_idx).then(
[this, current_hashes, remote_node, node_idx]
(rpc::sink<repair_stream_cmd>& sink, rpc::source<repair_hash_with_cmd>& source) mutable {
@@ -1401,7 +1457,7 @@ public:
}
// RPC handler
future<std::unordered_set<repair_hash>>
future<repair_hash_set>
get_full_row_hashes_handler() {
return with_gate(_gate, [this] {
return working_row_hashes();
@@ -1541,7 +1597,7 @@ public:
// RPC API
// Return rows in the _working_row_buf with hash within the given sef_diff
// Must run inside a seastar thread
void get_row_diff(std::unordered_set<repair_hash> set_diff, needs_all_rows_t needs_all_rows, gms::inet_address remote_node, unsigned node_idx) {
void get_row_diff(repair_hash_set set_diff, needs_all_rows_t needs_all_rows, gms::inet_address remote_node, unsigned node_idx) {
if (needs_all_rows || !set_diff.empty()) {
if (remote_node == _myip) {
return;
@@ -1610,11 +1666,11 @@ private:
}
future<> get_row_diff_sink_op(
std::unordered_set<repair_hash> set_diff,
repair_hash_set set_diff,
needs_all_rows_t needs_all_rows,
rpc::sink<repair_hash_with_cmd>& sink,
gms::inet_address remote_node) {
return do_with(std::move(set_diff), [needs_all_rows, remote_node, &sink] (std::unordered_set<repair_hash>& set_diff) mutable {
return do_with(std::move(set_diff), [needs_all_rows, remote_node, &sink] (repair_hash_set& set_diff) mutable {
if (inject_rpc_stream_error) {
return make_exception_future<>(std::runtime_error("get_row_diff: Inject sender error in sink loop"));
}
@@ -1641,7 +1697,7 @@ private:
public:
// Must run inside a seastar thread
void get_row_diff_with_rpc_stream(
std::unordered_set<repair_hash> set_diff,
repair_hash_set set_diff,
needs_all_rows_t needs_all_rows,
update_peer_row_hash_sets update_hash_set,
gms::inet_address remote_node,
@@ -1667,7 +1723,7 @@ public:
}
// RPC handler
future<repair_rows_on_wire> get_row_diff_handler(std::unordered_set<repair_hash> set_diff, needs_all_rows_t needs_all_rows) {
future<repair_rows_on_wire> get_row_diff_handler(repair_hash_set set_diff, needs_all_rows_t needs_all_rows) {
return with_gate(_gate, [this, set_diff = std::move(set_diff), needs_all_rows] () mutable {
return get_row_diff(std::move(set_diff), needs_all_rows).then([this] (std::list<repair_row> row_diff) {
return to_repair_rows_on_wire(std::move(row_diff));
@@ -1677,15 +1733,16 @@ public:
// RPC API
// Send rows in the _working_row_buf with hash within the given sef_diff
future<> put_row_diff(std::unordered_set<repair_hash> set_diff, needs_all_rows_t needs_all_rows, gms::inet_address remote_node) {
future<> put_row_diff(repair_hash_set set_diff, needs_all_rows_t needs_all_rows, gms::inet_address remote_node) {
if (!set_diff.empty()) {
if (remote_node == _myip) {
return make_ready_future<>();
}
auto sz = set_diff.size();
size_t sz = set_diff.size();
return get_row_diff(std::move(set_diff), needs_all_rows).then([this, remote_node, sz] (std::list<repair_row> row_diff) {
if (row_diff.size() != sz) {
throw std::runtime_error("row_diff.size() != set_diff.size()");
rlogger.warn("Hash conflict detected, keyspace={}, table={}, range={}, row_diff.size={}, set_diff.size={}. It is recommended to compact the table and rerun repair for the range.",
_schema->ks_name(), _schema->cf_name(), _range, row_diff.size(), sz);
}
return do_with(std::move(row_diff), [this, remote_node] (std::list<repair_row>& row_diff) {
return get_repair_rows_size(row_diff).then([this, remote_node, &row_diff] (size_t row_bytes) mutable {
@@ -1752,17 +1809,18 @@ private:
public:
future<> put_row_diff_with_rpc_stream(
std::unordered_set<repair_hash> set_diff,
repair_hash_set set_diff,
needs_all_rows_t needs_all_rows,
gms::inet_address remote_node, unsigned node_idx) {
if (!set_diff.empty()) {
if (remote_node == _myip) {
return make_ready_future<>();
}
auto sz = set_diff.size();
size_t sz = set_diff.size();
return get_row_diff(std::move(set_diff), needs_all_rows).then([this, remote_node, node_idx, sz] (std::list<repair_row> row_diff) {
if (row_diff.size() != sz) {
throw std::runtime_error("row_diff.size() != set_diff.size()");
rlogger.warn("Hash conflict detected, keyspace={}, table={}, range={}, row_diff.size={}, set_diff.size={}. It is recommended to compact the table and rerun repair for the range.",
_schema->ks_name(), _schema->cf_name(), _range, row_diff.size(), sz);
}
return do_with(std::move(row_diff), [this, remote_node, node_idx] (std::list<repair_row>& row_diff) {
return get_repair_rows_size(row_diff).then([this, remote_node, node_idx, &row_diff] (size_t row_bytes) mutable {
@@ -1801,7 +1859,7 @@ static future<stop_iteration> repair_get_row_diff_with_rpc_stream_process_op(
rpc::sink<repair_row_on_wire_with_cmd> sink,
rpc::source<repair_hash_with_cmd> source,
bool &error,
std::unordered_set<repair_hash>& current_set_diff,
repair_hash_set& current_set_diff,
std::optional<std::tuple<repair_hash_with_cmd>> hash_cmd_opt) {
repair_hash_with_cmd hash_cmd = std::get<0>(hash_cmd_opt.value());
rlogger.trace("Got repair_hash_with_cmd from peer={}, hash={}, cmd={}", from, hash_cmd.hash, int(hash_cmd.cmd));
@@ -1814,7 +1872,7 @@ static future<stop_iteration> repair_get_row_diff_with_rpc_stream_process_op(
}
bool needs_all_rows = hash_cmd.cmd == repair_stream_cmd::needs_all_rows;
_metrics.rx_hashes_nr += current_set_diff.size();
auto fp = make_foreign(std::make_unique<std::unordered_set<repair_hash>>(std::move(current_set_diff)));
auto fp = make_foreign(std::make_unique<repair_hash_set>(std::move(current_set_diff)));
return smp::submit_to(src_cpu_id % smp::count, [from, repair_meta_id, needs_all_rows, fp = std::move(fp)] {
auto rm = repair_meta::get_repair_meta(from, repair_meta_id);
if (fp.get_owner_shard() == this_shard_id()) {
@@ -1892,12 +1950,12 @@ static future<stop_iteration> repair_get_full_row_hashes_with_rpc_stream_process
if (status == repair_stream_cmd::get_full_row_hashes) {
return smp::submit_to(src_cpu_id % smp::count, [from, repair_meta_id] {
auto rm = repair_meta::get_repair_meta(from, repair_meta_id);
return rm->get_full_row_hashes_handler().then([] (std::unordered_set<repair_hash> hashes) {
return rm->get_full_row_hashes_handler().then([] (repair_hash_set hashes) {
_metrics.tx_hashes_nr += hashes.size();
return hashes;
});
}).then([sink] (std::unordered_set<repair_hash> hashes) mutable {
return do_with(std::move(hashes), [sink] (std::unordered_set<repair_hash>& hashes) mutable {
}).then([sink] (repair_hash_set hashes) mutable {
return do_with(std::move(hashes), [sink] (repair_hash_set& hashes) mutable {
return do_for_each(hashes, [sink] (const repair_hash& hash) mutable {
return sink(repair_hash_with_cmd{repair_stream_cmd::hash_data, hash});
}).then([sink] () mutable {
@@ -1920,7 +1978,7 @@ static future<> repair_get_row_diff_with_rpc_stream_handler(
uint32_t repair_meta_id,
rpc::sink<repair_row_on_wire_with_cmd> sink,
rpc::source<repair_hash_with_cmd> source) {
return do_with(false, std::unordered_set<repair_hash>(), [from, src_cpu_id, repair_meta_id, sink, source] (bool& error, std::unordered_set<repair_hash>& current_set_diff) mutable {
return do_with(false, repair_hash_set(), [from, src_cpu_id, repair_meta_id, sink, source] (bool& error, repair_hash_set& current_set_diff) mutable {
return repeat([from, src_cpu_id, repair_meta_id, sink, source, &error, &current_set_diff] () mutable {
return source().then([from, src_cpu_id, repair_meta_id, sink, source, &error, &current_set_diff] (std::optional<std::tuple<repair_hash_with_cmd>> hash_cmd_opt) mutable {
if (hash_cmd_opt) {
@@ -1936,22 +1994,17 @@ static future<> repair_get_row_diff_with_rpc_stream_handler(
current_set_diff,
std::move(hash_cmd_opt)).handle_exception([sink, &error] (std::exception_ptr ep) mutable {
error = true;
return sink(repair_row_on_wire_with_cmd{repair_stream_cmd::error, repair_row_on_wire()}).then([sink] () mutable {
return sink.close();
}).then([sink] {
return sink(repair_row_on_wire_with_cmd{repair_stream_cmd::error, repair_row_on_wire()}).then([] {
return make_ready_future<stop_iteration>(stop_iteration::no);
});
});
} else {
if (error) {
return make_ready_future<stop_iteration>(stop_iteration::yes);
}
return sink.close().then([sink] {
return make_ready_future<stop_iteration>(stop_iteration::yes);
});
return make_ready_future<stop_iteration>(stop_iteration::yes);
}
});
});
}).finally([sink] () mutable {
return sink.close().finally([sink] { });
});
}
@@ -1977,22 +2030,17 @@ static future<> repair_put_row_diff_with_rpc_stream_handler(
current_rows,
std::move(row_opt)).handle_exception([sink, &error] (std::exception_ptr ep) mutable {
error = true;
return sink(repair_stream_cmd::error).then([sink] () mutable {
return sink.close();
}).then([sink] {
return sink(repair_stream_cmd::error).then([] {
return make_ready_future<stop_iteration>(stop_iteration::no);
});
});
} else {
if (error) {
return make_ready_future<stop_iteration>(stop_iteration::yes);
}
return sink.close().then([sink] {
return make_ready_future<stop_iteration>(stop_iteration::yes);
});
return make_ready_future<stop_iteration>(stop_iteration::yes);
}
});
});
}).finally([sink] () mutable {
return sink.close().finally([sink] { });
});
}
@@ -2017,22 +2065,17 @@ static future<> repair_get_full_row_hashes_with_rpc_stream_handler(
error,
std::move(status_opt)).handle_exception([sink, &error] (std::exception_ptr ep) mutable {
error = true;
return sink(repair_hash_with_cmd{repair_stream_cmd::error, repair_hash()}).then([sink] () mutable {
return sink.close();
}).then([sink] {
return sink(repair_hash_with_cmd{repair_stream_cmd::error, repair_hash()}).then([] () {
return make_ready_future<stop_iteration>(stop_iteration::no);
});
});
} else {
if (error) {
return make_ready_future<stop_iteration>(stop_iteration::yes);
}
return sink.close().then([sink] {
return make_ready_future<stop_iteration>(stop_iteration::yes);
});
return make_ready_future<stop_iteration>(stop_iteration::yes);
}
});
});
}).finally([sink] () mutable {
return sink.close().finally([sink] { });
});
}
@@ -2078,7 +2121,7 @@ future<> repair_init_messaging_service_handler(repair_service& rs, distributed<d
auto from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
return smp::submit_to(src_cpu_id % smp::count, [from, repair_meta_id] {
auto rm = repair_meta::get_repair_meta(from, repair_meta_id);
return rm->get_full_row_hashes_handler().then([] (std::unordered_set<repair_hash> hashes) {
return rm->get_full_row_hashes_handler().then([] (repair_hash_set hashes) {
_metrics.tx_hashes_nr += hashes.size();
return hashes;
});
@@ -2106,11 +2149,11 @@ future<> repair_init_messaging_service_handler(repair_service& rs, distributed<d
});
});
ms.register_repair_get_row_diff([] (const rpc::client_info& cinfo, uint32_t repair_meta_id,
std::unordered_set<repair_hash> set_diff, bool needs_all_rows) {
repair_hash_set set_diff, bool needs_all_rows) {
auto src_cpu_id = cinfo.retrieve_auxiliary<uint32_t>("src_cpu_id");
auto from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
_metrics.rx_hashes_nr += set_diff.size();
auto fp = make_foreign(std::make_unique<std::unordered_set<repair_hash>>(std::move(set_diff)));
auto fp = make_foreign(std::make_unique<repair_hash_set>(std::move(set_diff)));
return smp::submit_to(src_cpu_id % smp::count, [from, repair_meta_id, fp = std::move(fp), needs_all_rows] () mutable {
auto rm = repair_meta::get_repair_meta(from, repair_meta_id);
if (fp.get_owner_shard() == this_shard_id()) {
@@ -2178,6 +2221,25 @@ future<> repair_init_messaging_service_handler(repair_service& rs, distributed<d
});
}
future<> repair_uninit_messaging_service_handler() {
return netw::get_messaging_service().invoke_on_all([] (auto& ms) {
return when_all_succeed(
ms.unregister_repair_get_row_diff_with_rpc_stream(),
ms.unregister_repair_put_row_diff_with_rpc_stream(),
ms.unregister_repair_get_full_row_hashes_with_rpc_stream(),
ms.unregister_repair_get_full_row_hashes(),
ms.unregister_repair_get_combined_row_hash(),
ms.unregister_repair_get_sync_boundary(),
ms.unregister_repair_get_row_diff(),
ms.unregister_repair_put_row_diff(),
ms.unregister_repair_row_level_start(),
ms.unregister_repair_row_level_stop(),
ms.unregister_repair_get_estimated_partitions(),
ms.unregister_repair_set_estimated_partitions(),
ms.unregister_repair_get_diff_algorithms()).discard_result();
});
}
class row_level_repair {
repair_info& _ri;
sstring _cf_name;
@@ -2407,7 +2469,7 @@ private:
// sequentially because the rows from repair follower 1 to
// repair master might reduce the amount of missing data
// between repair master and repair follower 2.
std::unordered_set<repair_hash> set_diff = repair_meta::get_set_diff(master.peer_row_hash_sets(node_idx), master.working_row_hashes().get0());
repair_hash_set set_diff = repair_meta::get_set_diff(master.peer_row_hash_sets(node_idx), master.working_row_hashes().get0());
// Request missing sets from peer node
rlogger.debug("Before get_row_diff to node {}, local={}, peer={}, set_diff={}",
node, master.working_row_hashes().get0().size(), master.peer_row_hash_sets(node_idx).size(), set_diff.size());
@@ -2430,9 +2492,9 @@ private:
// So we can figure out which rows peer node are missing and send the missing rows to them
check_in_shutdown();
_ri.check_in_abort();
std::unordered_set<repair_hash> local_row_hash_sets = master.working_row_hashes().get0();
repair_hash_set local_row_hash_sets = master.working_row_hashes().get0();
auto sz = _all_live_peer_nodes.size();
std::vector<std::unordered_set<repair_hash>> set_diffs(sz);
std::vector<repair_hash_set> set_diffs(sz);
for (size_t idx : boost::irange(size_t(0), sz)) {
set_diffs[idx] = repair_meta::get_set_diff(local_row_hash_sets, master.peer_row_hash_sets(idx));
}

View File

@@ -45,6 +45,7 @@ private:
};
future<> repair_init_messaging_service_handler(repair_service& rs, distributed<db::system_distributed_keyspace>& sys_dist_ks, distributed<db::view::view_update_generator>& view_update_generator);
future<> repair_uninit_messaging_service_handler();
class repair_info;

View File

@@ -528,8 +528,12 @@ public:
return _reader.move_to_next_partition(timeout).then([this] (auto&& mfopt) mutable {
{
if (!mfopt) {
this->handle_end_of_stream();
return make_ready_future<flat_mutation_reader_opt, mutation_fragment_opt>(std::nullopt, std::nullopt);
return _cache._read_section(_cache._tracker.region(), [&] {
return with_linearized_managed_bytes([&] {
this->handle_end_of_stream();
return make_ready_future<flat_mutation_reader_opt, mutation_fragment_opt>(std::nullopt, std::nullopt);
});
});
}
_cache.on_partition_miss();
const partition_start& ps = mfopt->as_partition_start();
@@ -952,13 +956,15 @@ future<> row_cache::do_update(external_updater eu, memtable& m, Updater updater)
// expensive and we need to amortize it somehow.
do {
STAP_PROBE(scylla, row_cache_update_partition_start);
with_linearized_managed_bytes([&] {
{
if (!update) {
_update_section(_tracker.region(), [&] {
with_linearized_managed_bytes([&] {
memtable_entry& mem_e = *m.partitions.begin();
size_entry = mem_e.size_in_allocator_without_rows(_tracker.allocator());
auto cache_i = _partitions.lower_bound(mem_e.key(), cmp);
update = updater(_update_section, cache_i, mem_e, is_present, real_dirty_acc);
});
});
}
// We use cooperative deferring instead of futures so that
@@ -970,14 +976,16 @@ future<> row_cache::do_update(external_updater eu, memtable& m, Updater updater)
update = {};
real_dirty_acc.unpin_memory(size_entry);
_update_section(_tracker.region(), [&] {
with_linearized_managed_bytes([&] {
auto i = m.partitions.begin();
memtable_entry& mem_e = *i;
m.partitions.erase(i);
mem_e.partition().evict(_tracker.memtable_cleaner());
current_allocator().destroy(&mem_e);
});
});
++partition_count;
});
}
STAP_PROBE(scylla, row_cache_update_partition_end);
} while (!m.partitions.empty() && !need_preempt());
with_allocator(standard_allocator(), [&] {
@@ -1124,8 +1132,8 @@ future<> row_cache::invalidate(external_updater eu, dht::partition_range_vector&
seastar::thread::maybe_yield();
while (true) {
auto done = with_linearized_managed_bytes([&] {
return _update_section(_tracker.region(), [&] {
auto done = _update_section(_tracker.region(), [&] {
return with_linearized_managed_bytes([&] {
auto cmp = cache_entry::compare(_schema);
auto it = _partitions.lower_bound(*_prev_snapshot_pos, cmp);
auto end = _partitions.lower_bound(dht::ring_position_view::for_range_end(range), cmp);

View File

@@ -43,6 +43,8 @@
constexpr int32_t schema::NAME_LENGTH;
extern logging::logger dblog;
sstring to_sstring(column_kind k) {
switch (k) {
case column_kind::partition_key: return "PARTITION_KEY";
@@ -592,11 +594,15 @@ schema::get_column_definition(const bytes& name) const {
const column_definition&
schema::column_at(column_kind kind, column_id id) const {
return _raw._columns.at(column_offset(kind) + id);
return column_at(static_cast<ordinal_column_id>(column_offset(kind) + id));
}
const column_definition&
schema::column_at(ordinal_column_id ordinal_id) const {
if (size_t(ordinal_id) >= _raw._columns.size()) {
on_internal_error(dblog, format("{}.{}@{}: column id {:d} >= {:d}",
ks_name(), cf_name(), version(), size_t(ordinal_id), _raw._columns.size()));
}
return _raw._columns.at(static_cast<column_count_type>(ordinal_id));
}

View File

@@ -79,7 +79,8 @@ executables = ['build/{}/scylla'.format(args.mode),
'/usr/sbin/ethtool',
'/usr/bin/netstat',
'/usr/bin/hwloc-distrib',
'/usr/bin/hwloc-calc']
'/usr/bin/hwloc-calc',
'/usr/bin/lsblk']
output = args.dest

View File

@@ -597,7 +597,7 @@ def current_shard():
def find_db(shard=None):
if not shard:
if shard is None:
shard = current_shard()
return gdb.parse_and_eval('::debug::db')['_instances']['_M_impl']['_M_start'][shard]['service']['_p']

Some files were not shown because too many files have changed in this diff Show More