Compare commits

...

527 Commits

Author SHA1 Message Date
Hagit Segev
b0f656302c release: prepare for 4.1.11 2021-01-05 10:13:34 +02:00
Benny Halevy
e05e7b2a98 compaction: compaction_writer: destroy shared_sstable after the sstable_writer
sstable_writer may depend on the sstable throughout its whole lifecycle.
If the sstable is freed before the sstable_writer we might hit use-after-free
as in the following case:
```
std::_Deque_iterator<sstables::compression::segmented_offsets::bucket, sstables::compression::segmented_offsets::bucket&, sstables::compression::segmented_offsets::bucket*>::operator+=(long) at /usr/include/c++/10/bits/stl_deque.h:240
 (inlined by) std::operator+(std::_Deque_iterator<sstables::compression::segmented_offsets::bucket, sstables::compression::segmented_offsets::bucket&, sstables::compression::segmented_offsets::bucket*> const&, long) at /usr/include/c++/10/bits/stl_deque.h:378
 (inlined by) std::_Deque_iterator<sstables::compression::segmented_offsets::bucket, sstables::compression::segmented_offsets::bucket&, sstables::compression::segmented_offsets::bucket*>::operator[](long) const at /usr/include/c++/10/bits/stl_deque.h:252
 (inlined by) std::deque<sstables::compression::segmented_offsets::bucket, std::allocator<sstables::compression::segmented_offsets::bucket> >::operator[](unsigned long) at /usr/include/c++/10/bits/stl_deque.h:1327
 (inlined by) sstables::compression::segmented_offsets::push_back(unsigned long, sstables::compression::segmented_offsets::state&) at ./sstables/compress.cc:214
sstables::compression::segmented_offsets::writer::push_back(unsigned long) at ./sstables/compress.hh:123
 (inlined by) compressed_file_data_sink_impl<crc32_utils, (compressed_checksum_mode)1>::put(seastar::temporary_buffer<char>) at ./sstables/compress.cc:519
seastar::output_stream<char>::put(seastar::temporary_buffer<char>) at table.cc:?
 (inlined by) seastar::output_stream<char>::put(seastar::temporary_buffer<char>) at ././seastar/include/seastar/core/iostream-impl.hh:432
seastar::output_stream<char>::flush() at table.cc:?
seastar::output_stream<char>::close() at table.cc:?
sstables::file_writer::close() at sstables.cc:?
sstables::mc::writer::~writer() at writer.cc:?
 (inlined by) sstables::mc::writer::~writer() at ./sstables/mx/writer.cc:790
sstables::mc::writer::~writer() at writer.cc:?
flat_mutation_reader::impl::consumer_adapter<stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> > >::~consumer_adapter() at compaction.cc:?
 (inlined by) std::_Optional_payload_base<sstables::compaction_writer>::_M_destroy() at /usr/include/c++/10/optional:260
 (inlined by) std::_Optional_payload_base<sstables::compaction_writer>::_M_reset() at /usr/include/c++/10/optional:280
 (inlined by) std::_Optional_payload<sstables::compaction_writer, false, false, false>::~_Optional_payload() at /usr/include/c++/10/optional:401
 (inlined by) std::_Optional_base<sstables::compaction_writer, false, false>::~_Optional_base() at /usr/include/c++/10/optional:474
 (inlined by) std::optional<sstables::compaction_writer>::~optional() at /usr/include/c++/10/optional:659
 (inlined by) sstables::compacting_sstable_writer::~compacting_sstable_writer() at ./sstables/compaction.cc:229
 (inlined by) compact_mutation<(emit_only_live_rows)0, (compact_for_sstables)1, sstables::compacting_sstable_writer, noop_compacted_fragments_consumer>::~compact_mutation() at ././mutation_compactor.hh:468
 (inlined by) compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer>::~compact_for_compaction() at ././mutation_compactor.hh:538
 (inlined by) std::default_delete<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> >::operator()(compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer>*) const at /usr/include/c++/10/bits/unique_ptr.h:85
 (inlined by) std::unique_ptr<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer>, std::default_delete<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> > >::~unique_ptr() at /usr/include/c++/10/bits/unique_ptr.h:361
 (inlined by) stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> >::~stable_flattened_mutations_consumer() at ././mutation_reader.hh:342
 (inlined by) flat_mutation_reader::impl::consumer_adapter<stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> > >::~consumer_adapter() at ././flat_mutation_reader.hh:201
auto flat_mutation_reader::impl::consume_in_thread<stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> >, flat_mutation_reader::no_filter>(stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> >, flat_mutation_reader::no_filter, std::chrono::time_point<seastar::lowres_clock, std::chrono::duration<long, std::ratio<1l, 1000l> > >) at ././flat_mutation_reader.hh:272
 (inlined by) auto flat_mutation_reader::consume_in_thread<stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> >, flat_mutation_reader::no_filter>(stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> >, flat_mutation_reader::no_filter, std::chrono::time_point<seastar::lowres_clock, std::chrono::duration<long, std::ratio<1l, 1000l> > >) at ././flat_mutation_reader.hh:383
 (inlined by) auto flat_mutation_reader::consume_in_thread<stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> > >(stable_flattened_mutations_consumer<compact_for_compaction<sstables::compacting_sstable_writer, noop_compacted_fragments_consumer> >, std::chrono::time_point<seastar::lowres_clock, std::chrono::duration<long, std::ratio<1l, 1000l> > >) at ././flat_mutation_reader.hh:389
 (inlined by) seastar::future<void> sstables::compaction::setup<noop_compacted_fragments_consumer>(noop_compacted_fragments_consumer)::{lambda(flat_mutation_reader)#1}::operator()(flat_mutation_reader)::{lambda()#1}::operator()() at ./sstables/compaction.cc:612
```

What happens here is that:

    compressed_file_data_sink_impl(output_stream<char> out, sstables::compression* cm, sstables::local_compression lc)
            : _out(std::move(out))
            , _compression_metadata(cm)
            , _offsets(_compression_metadata->offsets.get_writer())
            , _compression(lc)
            , _full_checksum(ChecksumType::init_checksum())

_compression_metadata points to a buffer held by the sstable object.
and _compression_metadata->offsets.get_writer returns a writer that keeps
a reference to the segmented_offsets in the sstables::compression
that is used in the ~writer -> close path.

Fixes #7821

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20201227145726.33319-1-bhalevy@scylladb.com>
(cherry picked from commit 8a745a0ee0)
2021-01-04 15:12:33 +02:00
Avi Kivity
ae0f3ef543 Revert "Merge 'Move temporaries to value view' from Piotr S"
This reverts commit b34a1d9576. It causes
regressions in processing of bind variables.

Fixes #7761.
2020-12-24 12:42:42 +02:00
Gleb Natapov
2a6a072857 mutation_writer: pass exceptions through feed_writer
feed_writer() eats exception and transforms it into an end of stream
instead. Downstream validators hate when this happens.

Fixes #7482
Message-Id: <20201216090038.GB3244976@scylladb.com>

(cherry picked from commit 61520a33d6)
2020-12-16 17:20:32 +02:00
Aleksandr Bykov
da1a5b6542 dist: scylla_util: fix aws_instance.ebs_disks method
aws_instance.ebs_disks() method should return ebs disk
instead of ephemeral

Signed-off-by: Aleksandr Bykov <alex.bykov@scylladb.com>

Closes #7780

(cherry picked from commit e74dc311e7)
2020-12-16 11:59:12 +02:00
Avi Kivity
b85aa0e8a6 Update seastar submodule
* seastar 9d8d82a095...6fb1399ba1 (1):
  > sharded: Do not hang on never set freed promise

Fixes #6606.
2020-12-15 16:52:38 +02:00
Calle Wilund
8ccdd5c50f token_metadata: Prune empty racks on endpoint change
Fixes #6459

When moving or removing endpoints, we should ensure
that the set of available racks reflect the nodes
known, i.e. match what would be the result of a
reboot + create sets initially.
Message-Id: <20200519153300.15391-1-calle@scylladb.com>

(cherry picked from commit 7ce4a8b458)
2020-12-15 16:31:46 +02:00
Takuya ASADA
f7ffea4638 node_exporter_install: stop service before force installing
Stop node-exporter.service before re-install it, to avoid 'Text file busy' error.

Fixes #6782

(cherry picked from commit ef05ea8e91)
2020-12-15 16:28:36 +02:00
Avi Kivity
fb40e375bf dist: rpm: uninstall tuned when installing scylla-kernel-conf
tuned 2.11.0-9 and later writes to kernel.sched_wakeup_granularity_ns
and other sysctl tunables that we so laboriously tuned, dropping
performance by a factor of 5 (due to increased latency). Fix by
obsoleting tuned during install (in effect, we are a better tuned,
at least for us).

Not needed for .deb, since debian/ubuntu do not install tuned by
default.

Fixes #7696

Closes #7776

(cherry picked from commit 615b8e8184)
2020-12-12 14:32:59 +02:00
Eliran Sinvani
9ea2a61d63 consistency level: fix wrong quorum calculation when RF = 0
We used to calculate the number of endpoints for quorum and local_quorum
unconditionally as ((rf / 2) + 1). This formula doesn't take into
account the corner case where RF = 0, in this situation quorum should
also be 0.
This commit adds the missing corner case.

Tests: Unit Tests (dev)
Fixes #6905

Closes #7296

(cherry picked from commit 925cdc9ae1)
2020-11-29 16:45:26 +02:00
Avi Kivity
6898fcd40f Update seastar submodule for precalculated TLS DH parameters
* seastar d4df4fa6de...9d8d82a095 (1):
  > TLS: Use "known" (precalculated) DH parameters if available

Fixes #6191.
2020-11-29 14:36:40 +02:00
Asias He
4df08e331b repair: Make repair_writer a shared pointer
The future of the fiber that writes data into sstables inside
the repair_writer is stored in _writer_done like below:

class repair_writer {
   _writer_done[node_idx] =
      mutation_writer::distribute_reader_and_consume_on_shards().then([this] {
         ...
      }).handle_exception([this] {
         ...
      });
}

The fiber accesses the repair_writer object in the error handling path. We
wait for the _writer_done to finish before we destroy repair_meta
object which contains the repair_writer object to avoid the fiber
accessing already freed repair_writer object.

To be safer, we can make repair_writer a shared pointer and take a
reference in the distribute_reader_and_consume_on_shards code path.

Fixes #7406

Closes #7430

(cherry picked from commit 289a08072a)
2020-11-29 13:30:06 +02:00
Pavel Emelyanov
7b1fb86a28 query_pager: Fix continuation handling for noop visitor
Before updating the _last_[cp]key (for subsequent .fetch_page())
the pager checks 'if the pager is not exhausted OR the result
has data'.

The check seems broken: if the pager is not exhausted, but the
result is empty the call for keys will unconditionally try to
reference the last element from empty vector. The not exhausted
condition for empty result can happen if the short_read is set,
which, in turn, unconditionally happens upon meeting partition
end when visiting the partition with result builder.

The correct check should be 'if the pager is not exhausted AND
the result has data': the _last_[pc]key-s should be taken for
continuation (not exhausted), but can only be taken if the result is
not empty (has data).

fixes: #7263
tests: unit(dev), but tests don't trigger this corner case

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20200921124329.21209-1-xemul@scylladb.com>
(cherry picked from commit 550fc734d9)
2020-11-29 12:01:43 +02:00
Takuya ASADA
f7be22ccb2 install.sh: set PATH for relocatable CLI tools in python thunk
We currently set PATH for relocatable CLI tools in scylla_util.run() and
scylla_util.out(), but it doesn't work for perftune.py, since it's not part of
Scylla, does not use scylla_util module.
We can set PATH in python thunk instead, it can set PATH for all python scripts.

Fixes #7350

(cherry picked from commit 5867af4edd)
2020-11-29 11:54:53 +02:00
Bentsi Magidovich
26b5a34f96 scylla_util.py: fix exception handling in curl
Retry mechanism didn't work when URLError happened. For example:

  urllib.error.URLError: <urlopen error [Errno 101] Network is unreachable>

Let's catch URLError instead of HTTP since URLError is a base exception
for all exceptions in the urllib module.

Fixes: #7569

Closes #7567

(cherry picked from commit 956b97b2a8)
2020-11-29 11:48:42 +02:00
Takuya ASADA
10a65ba2fb dist/redhat: packaging dependencies.conf as normal file, not ghost
When we introduced dependencies.conf, we mistakenly added it on rpm as %ghost,
but it should be normal file, should be installed normally on package installation.

Fixes #7703

Closes #7704

(cherry picked from commit ba4d54efa3)
2020-11-29 11:40:27 +02:00
Takuya ASADA
be60e3ca52 install.sh: apply sysctl.d files on non-packaging installation
We don't apply sysctl.d files on non-packaging installation, apply them
just like rpm/deb taking care of that.

Fixes #7702

Closes #7705

(cherry picked from commit 5f81f97773)
2020-11-29 11:35:51 +02:00
Avi Kivity
5485c902fe dist: sysctl: configure more inotify instances
Since f3bcd4d205 ("Merge 'Support SSL Certificate Hot
Reloading' from Calle"), we reload certificates as they are
modified on disk. This uses inotify, which is limited by a
sysctl fs.inotify.max_user_instances, with a default of 128.

This is enough for 64 shards only, if both rpc and cql are
encrypted; above that startup fails.

Increase to 1200, which is enough for 6 instances * 200 shards.

Fixes #7700.

Closes #7701

(cherry picked from commit 390e07d591)
2020-11-29 11:04:57 +02:00
Hagit Segev
01c822301f release: prepare for 4.1.10 2020-11-19 18:07:49 +02:00
Raphael S. Carvalho
415b271a39 compaction: Make sure a partition is filtered out only by producer
If interposer consumer is enabled, partition filtering will be done by the
consumer instead, but that's not possible because only the producer is able
to skip to the next partition if the current one is filtered out, so scylla
crashes when that happens with a bad function call in queue_reader.
This is a regression which started here: 55a8b6e3c9

To fix this problem, let's make sure that partition filtering will only
happen on the producer side.

Fixes #7590.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20201111221513.312283-1-raphaelsc@scylladb.com>
(cherry picked from commit 13fa2bec4c)
2020-11-19 14:08:47 +02:00
Piotr Dulikowski
b7274ab44a hints: don't read hint files when it's not allowed to send
When there are hint files to be sent and the target endpoint is DOWN,
end_point_hints_manager works in the following loop:

- It reads the first hint file in the queue,
- For each hint in the file it decides that it won't be sent because the
  target endpoint is DOWN,
- After realizing that there are some unsent hints, it decides to retry
  this operation after sleeping 1 second.

This causes the first segment to be wholly read over and over again,
with 1 second pauses, until the target endpoint becomes UP or leaves the
cluster. This causes unnecessary I/O load in the streaming scheduling
group.

This patch adds a check which prevents end_point_hints_manager from
reading the first hint file at all when it is not allowed to send hints.

First observed in #6964

Tests:
- unit(dev)
- hinted handoff dtests

Closes #7407

(cherry picked from commit 77a0f1a153)
2020-11-16 14:30:26 +02:00
Botond Dénes
b144b93cd8 mutation_reader: queue_reader: don't set EOS flag on abort
If the consumer happens to check the EOS flag before it hits the
exception injected by the abort (by calling fill_buffer()), they can
think the stream ended normally and expect it to be valid. However this
is not guaranteed when the reader is aborted. To avoid consumers falsely
thinking the stream ended normally, don't set the EOS flag on abort at
all.

Additionally make sure the producer is aborted too on abort. In theory
this is not needed as they are the one initiating the abort, but better
to be safe than sorry.

Fixes: #7411
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20201102100732.35132-1-bdenes@scylladb.com>
(cherry picked from commit f5323b29d9)
2020-11-15 11:08:07 +02:00
Botond Dénes
7325996510 types: validate(): linearize values lazily
Instead of eagerly linearizing all values as they are passed to
validate(), defer linearization to those validators that actually need
linearized values. Linearizing large values puts pressure on the memory
allocator with large contiguous allocation requests. This is something
we are trying to actively avoid, especially if it is not really needed.
Turns out the types, whose validators really want linearized values are
a minority, as most validators just look at the size of the value, and
some like bytes don't need validation at all, while usually having large
values.

This is achieved by templating the validator struct on the view and
using the FragmentedRange concept to treat all passed in views
(`bytes_view` and `fragmented_temporary_buffer_view`) uniformly.
This patch makes no attempt at converting existing validators to work
with fragmented buffers, only trivial cases are converted. The major
offenders still left are ascii/utf8 and collections.

Fixes: #7318

Tests: unit(dev)
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20201007054524.909420-1-bdenes@scylladb.com>
(cherry picked from commit db56ae695c)

[avi: squashed ed6775c585 ("types: adjust
      validation_visitor construction for clang") as gcc 9 in scylla 4.1
      suffers from the same problem as clang 11]
2020-11-11 12:31:36 +02:00
Piotr Sarna
fb14fae79b Merge 'Backport PR #7469 to 4.2' from Eliran Sinvani
This is a backport of PR #7469 that did not apply cleanly to 4.2 with a trivial conflict, another commit that touched one of the files but in a completely different region.

Closes #7480

* github.com:scylladb/scylla:
  materialized views: add a base table reference if missing
  view info: support partial match between base and view for only reading from view.
  view info: guard against null dereference of the base info

(cherry picked from commit c74ba1bc36)
2020-11-09 15:22:11 +02:00
Avi Kivity
bb49a5ac06 Merge 'storage_proxy: add a separate smp_group for hints' from Eliran
Hints writes are handled by storage_proxy in the exact same way
regular writes are, which in turn means that the same smp service
group is used for both. The problem is that it can lead to a priority
inversion where writes of the lower priority kind occupy a lot of
the semaphore units making the higher priority writes wait for an
empty slot.
This series adds a separate smp group for hints as well as a field
to pass the correct smp group to mutate_locally functions, and
then uses this field to properly classify the writes.

Fixes #7177

* eliransin-hint_priority_inversion:
  Storage proxy: use hints smp group in mutate locally
  Storage proxy: add a dedicated smp group for hints

(cherry picked from commit c075539fea)

[avi: replace std::bind_front() which is not available with this
      compiler with a lambda that does the same]
2020-11-08 20:46:45 +02:00
Pavel Solodovnikov
947d3a13a3 storage_proxy: un-hardcode force sync flag for mutate_locally(mutation) overload
Corresponding overload of `storage_proxy::mutate_locally`
was hardcoded to pass `db::commitlog::force_sync::no` to the
`database::apply`. Unhardcode it and substitute `force_sync::no`
to all existing call sites (as it were before).

`force_sync::yes` will be used later for paxos learn writes
when trying to apply mutations upgraded from an obsolete
schema version (similar to the current case when applying
locally a `frozen_mutation` stored in accepted proposal).

Tests: unit(dev)

Signed-off-by: Pavel Solodovnikov <pa.solodovnikov@scylladb.com>
Message-Id: <20200716124915.464789-1-pa.solodovnikov@scylladb.com>
(cherry picked from commit 5ff5df1afd)

Prerequisite for #7177.
2020-11-08 19:47:11 +02:00
Amnon Heiman
b096d64aa7 scyllatop/livedata.py: Safe iteration over metrics
This patch changes the code that iterates over the metrics to use a copy
of the metrics names to make it safe to remove the metrics from the
metrics object.

Fixes #7488

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
(cherry picked from commit 52db99f25f)
2020-11-08 19:16:25 +02:00
Calle Wilund
ce8a0f3886 partition_version: Change range_tombstones() to return chunked_vector
Refs #7364

The number of tombstones can be large. As a stopgap measure to
just returning a source range (with keepalive), we can at least
alleviate the problem by using a chunked vector.

Closes #7433

(cherry picked from commit 4b65d67a1a)
2020-11-08 14:38:45 +02:00
Tomasz Grabiec
41344d8ee6 sstables: ka/la: Fix abort when next_partition() is called with certain reader state
Cleanup compaction is using consume_pausable_in_thread() to skip over
disowned partitions, which uses flat_mutation_reader::next_partition().

The implementation of next_partition() for the sstable reader has a
bug which may cause the following assertion failure:

  scylla: sstables/mp_row_consumer.hh:422: row_consumer::proceed sstables::mp_row_consumer_k_l::flush(): Assertion `!_ready' failed.

This happens when the sstable reader's buffer gets full when we reach
the partition end. The last fragment of the partition won't be pushed
into the buffer but will stay in the _ready variable. When
next_partition() is called in this state, _ready will not be cleared
and the fragment will be carried over to the next partition. This will
cause assertion failure when the reader attempts to emit the first
fragment of the next partition.

The fix is to clear _ready when entering a partition, just like we
clear _range_tombstones there.

Fixes #7553.
Message-Id: <1604534702-12777-1-git-send-email-tgrabiec@scylladb.com>

(cherry picked from commit fb9b5cae05)
2020-11-08 14:32:58 +02:00
Avi Kivity
db6303dba0 Merge "Fix TWCS compaction aggressiveness due to data segregation" from Raphael
"
After data segregation feature, anything that causes out-of-order writes,
like read repair, can result in small updates to past time windows.
This causes compaction to be very aggressive because whenever a past time
window is updated like that, that time window is recompacted into a
single SSTable.
Users expect that once a window is closed, it will no longer be written
to, but that has changed since the introduction of the data segregation
feature. We didn't anticipate the write amplification issues that the
feature would cause. To fix this problem, let's perform size-tiered
compaction on the windows that are no longer active and were updated
because data was segregated. The current behavior where the last active
window is merged into one file is kept. But thereafter, that same
window will only be compacted using STCS.

Fixes #6928.
"

* 'fix_twcs_agressiveness_after_data_segregation_v2' of github.com:raphaelsc/scylla:
  compaction/twcs: improve further debug messages
  compaction/twcs: Improve debug log which shows all windows
  test: Check that TWCS properly performs size-tiered compaction on past windows
  compaction/twcs: Make task estimation take into account the size-tiered behavior
  compaction/stcs: Export static function that estimates pending tasks
  compaction/stcs: Make get_buckets() static
  compact/twcs: Perform size-tiered compaction on past time windows
  compaction/twcs: Make strategy easier to extend by removing duplicated knowledge
  compaction/twcs: Make newest_bucket() non-static
  compaction/twcs: Move TWCS implementation into source file

(cherry picked from commit 6f986df458)
2020-11-05 20:32:42 +02:00
Glauber Costa
964cbb95a7 twcs: move implementations to its own file
LCS and STCS already have their own files, reducing the clutter in
compaction_strategy.cc. Do the same for TWCS. I am doing this in
preparation to add more functions.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20200611230906.409023-6-glauber@scylladb.com>
(cherry picked from commit b0a0c207c3)

Prerequisite for #6928.
2020-11-05 20:20:30 +02:00
Avi Kivity
b34a1d9576 Merge 'Move temporaries to value view' from Piotr S
"
Issue https://github.com/scylladb/scylla/issues/7019 describes a problem of an ever-growing map of temporary values stored in query_options. In order to mitigate this kind of problems, the storage for temporary values is moved from an external data structure to the value views itself. This way, the temporary lives only as long as it's accessible and is automatically destroyed once a request finishes. The downside is that each temporary is now allocated separately, while previously they were bundled in a single byte stream.

Tests: unit(dev)
Fixes https://github.com/scylladb/scylla/issues/7019
"

7055297649 ("cql3: remove query_options::linearize and _temporaries")
is reverted from this backport since linearize() is still used in
this branch.

* psarna-move_temporaries_to_value_view:
  cql3: remove query_options::linearize and _temporaries
  cql3: remove make_temporary helper function
  cql3: store temporaries in-place instead of in query_options
  cql3: add temporary_value to value view
  cql3: allow moving data out of raw_value
  cql3: split values.hh into a .cc file

(cherry picked from commit 2b308a973f)
2020-11-05 19:48:01 +02:00
Piotr Sarna
15ef930268 schema_tables: fix fixing old secondary index schemas
Old secondary index schemas did not have their idx_token column
marked as computed, and there already exists code which updates
them. Unfortunately, the fix itself contains an error and doesn't
fire if computed columns are not yet supported by the whole cluster,
which is a very common situation during upgrades.

Fixes #7515

Closes #7516

(cherry picked from commit b66c285f94)
2020-11-05 17:53:28 +02:00
Avi Kivity
fe57128fe0 Merge 'Fix ignoring cells after null in appending hash' from Piotr Sarna
"
This series fixes a bug in `appending_hash<row>` that caused it to ignore any cells after the first NULL. It also adds a cluster feature which starts using the new hashing only after the whole cluster is aware of it. The series comes with tests, which reproduce the issue.

Fixes #4567
Based on #4574
"

* psarna-fix_ignoring_cells_after_null_in_appending_hash:
  test: extend mutation_test for NULL values
  tests/mutation: add reproducer for #4567
  gms: add a cluster feature for fixed hashing
  digest: add null values to row digest
  mutation_partition: fix formatting
  appending_hash<row>: make publicly visible

(cherry picked from commit 0e03c979d2)
2020-11-04 20:45:06 +02:00
Yaron Kaikov
b80dab6d58 release: prepare for 4.1.9 2020-10-26 18:13:22 +02:00
Botond Dénes
04d52631b2 reader_permit: reader_resources: make true RAII class
Currently in all cases we first deduct the to-be-consumed resources,
then construct the `reader_resources` class to protect it (release it on
destruction). This is error prone as it relies on no exception being
thrown while constructing the `reader_resources`. Albeit the
`reader_resources` constructor is `noexcept` right now this might change
in the future and as the call sites relying on this are disconnected
from the declaration, the one modifying them might not notice.
To make this safe going forward, make the `reader_resources` a true RAII
class, consuming the units in its constructor and releasing them in its
destructor.

Refs: #7256

Tests: unit(dev)
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200922150625.1253798-1-bdenes@scylladb.com>
(cherry picked from commit a0107ba1c6)
Message-Id: <20200924081408.236353-1-bdenes@scylladb.com>
2020-10-19 15:04:53 +03:00
Takuya ASADA
dfc9f789cf install.sh: set LC_ALL=en_US.UTF-8 on python3 thunk
scylla-python3 causes segfault when non-default locale specified.
As workaround for this, we need to set LC_ALL=en_US.UTF-8 on python3 thunk.

Fixes #7408

Closes #7414

(cherry picked from commit ff129ee030)
2020-10-18 15:02:46 +03:00
Avi Kivity
c1236c02df Update seastar submodule
* seastar 88b6f0172c...d4df4fa6de (1):
  > append_challenged_posix_file_impl: allow destructing file with no queued work

Fixes #7285.
2020-10-12 15:13:17 +03:00
Gleb Natapov
0eb2f5c378 lwt: do not return unavailable exception from the 'learn' stage
Unavailable exception means that operation was not started and it can be
retried safely. If lwt fails in the learn stage though it most
certainly means that its effect will be observable already. The patch
returns timeout exception instead which means uncertainty.

Fixes #7258

Message-Id: <20201001130724.GA2283830@scylladb.com>
(cherry picked from commit 3e8dbb3c09)
2020-10-07 11:00:08 +02:00
Avi Kivity
0cc6d41ee6 Merge "materialized views: Fix undefined behavior on base table schema changes" from Tomasz
"
The view_info object, which is attached to the schema object of the
view, contains a data structure called
"base_non_pk_columns_in_view_pk". This data structure contains column
ids of the base table so is valid only for a particular version of the
base table schema. This data structure is used by materialized view
code to interpret mutations of the base table, those coming from base
table writes, or reads of the base table done as part of view updates
or view building.

The base table schema version of that data structure must match the
schema version of the mutation fragments, otherwise we hit undefined
behavior. This may include aborts, exceptions, segfaults, or data
corruption (e.g. writes landing in the wrong column in the view).

Before this patch, we could get schema version mismatch here after the
base table was altered. That's because the view schema did not change
when the base table was altered.

Another problem was that view building was using the current table's schema
to interpret the fragments and invoke view building. That's incorrect for two
reasons. First, fragments generated by a reader must be accessed only using
the reader's schema. Second, base_non_pk_columns_in_view_pk of the recorded
view ptrs may no longer match the current base table schema, which is used
to generate the view updates.

Part of the fix is to extract base_non_pk_columns_in_view_pk into a
third entity called base_dependent_view_info, which changes both on
base table schema changes and view schema changes.

It is managed by a shared pointer so that we can take immutable
snapshots of it, just like with schema_ptr. When starting the view
update, the base table schema_ptr and the corresponding
base_dependent_view_info have to match. So we must obtain them
atomically, and base_dependent_view_info cannot change during update.

Also, whenever the base table schema changes, we must update
base_dependent_view_infos of all attached views (atomically) so that
it matches the base table schema.

Fixes #7061.

Tests:

  - unit (dev)
  - [v1] manual (reproduced using scylla binary and cqlsh)
"

* tag 'mv-schema-mismatch-fix-v2' of github.com:tgrabiec/scylla:
  db: view: Refactor view_info::initialize_base_dependent_fields()
  tests: mv: Test dropping columns from base table
  db: view: Fix incorrect schema access during view building after base table schema changes
  schema: Call on_internal_error() when out of range id is passed to column_at()
  db: views: Fix undefined behavior on base table schema changes
  db: views: Introduce has_base_non_pk_columns_in_view_pk()

(cherry picked from commit 3daa49f098)
2020-10-06 16:49:08 +03:00
Juliusz Stasiewicz
1ecc447f42 tracing: Fix error on slow batches
`trace_keyspace_helper::make_slow_query_mutation_data` expected a
"query" key in its parameters, which does not appear in case of
e.g. batches of prepared statements. This is example of failing
`record.parameters`:
```
...{"query[0]" : "INSERT INTO ks.tbl (pk, i) values (?, ?);"},
{"query[1]" : "INSERT INTO ks.tbl (pk, i) values (?, ?);"}...
```

In such case Scylla recorded no trace and said:
```
ERROR 2020-09-28 10:09:36,696 [shard 3] trace_keyspace_helper - No
"query" parameter set for a session requesting a slow_query_log record
```

Fix here is to leave query empty if not found. The users can still
retrieve the query contents from existing info.

Fixes #5843

Closes #7293

(cherry picked from commit 0afa738a8f)
2020-10-04 18:04:42 +03:00
Tomasz Grabiec
7f3ffbc1c8 Merge "evictable_reader: validate buffer on reader recreation" from Botond
This series backports the evictable reader validation patchset (merged
as 97c99ea9f to master) to 4.1.

I only had to do changes to the tests.

Tests: unit(dev), some exception safety tests are failing with or
without my patchset

* https://github.com/denesb/scylla.git denesb/evictable-reader-validate-buffer/backport-4.1:
  mutation_reader_test: add unit test for evictable reader self-validation
  evictable_reader: validate buffer after recreation the underlying
  evictable_reader: update_next_position(): only use peek'd position on partition boundary
  mutation_reader_test: add unit test for evictable reader range tombstone trimming
  evictable_reader: trim range tombstones to the read clustering range
  position_in_partition_view: add position_in_partition_view before_key() overload
  flat_mutation_reader: add buffer() accessor
2020-10-02 11:50:29 +02:00
Botond Dénes
6a02d120ec mutation_reader_test: add unit test for evictable reader self-validation
Add both positive (where the validation should succeed) and negative
(where the validation should fail) tests, covering all validation cases.

(cherry picked from commit 076c27318b)
2020-10-02 09:45:20 +03:00
Botond Dénes
d820997452 evictable_reader: validate buffer after recreation the underlying
The reader recreation mechanism is a very delicate and error-prone one,
as proven by the countless bugs it had. Most of these bugs were related
to the recreated reader not continuing the read from the expected
position, inserting out-of-order fragments into the stream.
This patch adds a defense mechanism against such bugs by validating the
start position of the recreated reader. Several things are checked:
* The partition is the expected one -- the one we were in the middle of
  or the next if we stopped at partition boundaries.
* The partition is in the read range.
* The first fragment in the partition is the expected one -- has
  an equal or larger position than the next expected fragment.
* The fragment is in the clustering range as defined by the slice.

As these validations are only done on the slow-path of recreating an
evicted reader, no performance impact is expected.

(cherry picked from commit 0b0ae18a14)
2020-10-02 09:38:04 +03:00
Botond Dénes
e1e57d224b evictable_reader: update_next_position(): only use peek'd position on partition boundary
`evictable_reader::update_next_position()` is used to record the position the
reader will continue from, in the next buffer fill. This position is used to
create the partition slice when the underlying reader is evicted and has
to be recreated. There is an optimization in this method -- if the
underlying's buffer is not empty we peek at the first fragment in it and
use it as the next position. This is however problematic for buffer
validation on reader recreation (introduced in the next patch), because
using the next row's position as the next pos will allow for range
tombstones to be emitted with before_key(next_pos.key()), which will
trigger the validation. Instead of working around this, just drop this
optimization for mid-partition positions, it is inconsequential anyway.
We keep it for where it is important, when we detect that we are at a
partition boundary. In this case we can avoid reading the current
partition altogether when recreating the reader.

(cherry picked from commit 91020eef73)
2020-10-02 09:38:04 +03:00
Botond Dénes
763e063356 mutation_reader_test: add unit test for evictable reader range tombstone trimming
(cherry picked from commit d1b0573e1c)
2020-10-02 09:37:57 +03:00
Botond Dénes
a8f966aafa evictable_reader: trim range tombstones to the read clustering range
Currently mutation sources are allowed to emit range tombstones that are
outside the clustering read range if they are relevant to it. For example
a read of a clustering range [ck100, +inf), might start with:

    range_tombstone{start={ck1, -1}, end={ck200, 1}},
    clustering_row{ck100}

The range tombstone is relevant to the range and the first row of the
range so it is emitted as first, but its position (start) is outside the
read range. This is normally fine, but it poses a problem for evictable
reader. When the underlying reader is evicted and has to be recreated
from a certain clustering position, this results in out-of-order
mutation fragments being inserted into the middle of the stream. This is
not fine anymore as the monotonicity guarantee of the stream is
violated. The real solution would be to require all mutation sources to
trim range tombstones to their read range, but this is a lot of work.
Until that is done, as a workaround we do this trimming in the evictable
reader itself.

(cherry picked from commit 4f2e7a18e2)
2020-10-02 08:59:55 +03:00
Botond Dénes
1a3c8a0ec5 position_in_partition_view: add position_in_partition_view before_key() overload
(cherry picked from commit d7d93aef49)
2020-10-02 08:59:55 +03:00
Botond Dénes
268821223c flat_mutation_reader: add buffer() accessor
To allow outsiders to inspect the contents of the reader's buffer.

(cherry picked from commit ab59e7c725)
2020-10-02 08:59:55 +03:00
Tomasz Grabiec
6c43a0dc29 schema: Fix race in schema version recalculation leading to stale schema version in gossip
Migration manager installs several feature change listeners:

    if (this_shard_id() == 0) {
        _feature_listeners.push_back(_feat.cluster_supports_view_virtual_columns().when_enabled(update_schema));
        _feature_listeners.push_back(_feat.cluster_supports_digest_insensitive_to_expiry().when_enabled(update_schema));
        _feature_listeners.push_back(_feat.cluster_supports_cdc().when_enabled(update_schema));
        _feature_listeners.push_back(_feat.cluster_supports_per_table_partitioners().when_enabled(update_schema));
    }

They will call update_schema_version_and_announce() when features are enabled, which does this:

    return update_schema_version(proxy, features).then([] (utils::UUID uuid) {
        return announce_schema_version(uuid);
    });

So it first updates the schema version and then publishes it via
gossip in announce_schema_version(). It is possible that the
announce_schema_version() part of the first schema change will be
deferred and will execute after the other four calls to
update_schema_version_and_announce(). It will install the old schema
version in gossip instead of the more recent one.

The fix is to serialize schema digest calculation and publishing.

Fixes #7200

(cherry picked from commit 1a57d641d1)
2020-10-01 18:18:21 +02:00
Yaron Kaikov
8399aac6bc release: prepare for 4.1.8 2020-09-28 20:25:06 +03:00
Avi Kivity
b1a70d0ad4 Update seastar submodule
* seastar 15cd93729f...88b6f0172c (1):
  > lz4_fragmented_compressor: Fix buffer requirements

Fixes #6925.
2020-09-23 11:55:54 +03:00
Yaron Kaikov
2251a1c577 release: prepare for 4.1.7 2020-09-17 21:30:34 +03:00
Nadav Har'El
f8c7c485d2 alternator: fix corruption of PutItem operation in case of contention
This patch fixes a bug noted in issue #7218 - where PutItem operations
sometimes lose part of the item's data - some attributes were lost,
and the name of other attributes replaced by empty strings. The problem
happened when the write-isolation policy was LWT and there was contention
of writes to the same partition (not necessarily the same item).

To use CAS (a.k.a. LWT), Alternator builds an alternator::rmw_operation
object with an apply() function which takes the old contents of the item
(if needed) and a timestamp, and builds a mutation that the CAS should
apply. In the case of the PutItem operation, we wrongly assumed that apply()
will be called only once - so as an optimization the strings saved in the
put_item_operation were moved into the returned mutation. But this
optimization is wrong - when there is contention, apply() may be called
again when the change proposed by the previous one was not accepted by
the Paxos protocol.

The fix is to change the one place where put_item_operation *moved* strings
out of the saved operations into the mutations, to be a copy. But to prevent
this sort of bug from recurring in future code, this patch enlists the
compiler to help us verify that it can't happen: The apply() function is
marked "const" - it can use the information in the operation to build the
mutation, but it can never modify this information or move things out of it,
so it will be fine to call this function twice.

The single output field that apply() does write (_return_attributes) is
marked "mutable" to allow the const apply() to write to it anyway. Because
apply() might be called twice, it is important that if some apply()
implementation sometimes sets _return_attributes, then it must always
set it (even if to the default, empty, value) on every call to apply().

The const apply() means that the compiler verifies for us that I didn't
forget to fix additional wrong std::move()s. Additionally, a test I wrote
to easily reproduce issue #7218 (which I will submit as a dtest later)
passes after this fix.

Fixes #7218.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200916064906.333420-1-nyh@scylladb.com>
(cherry picked from commit 5e8bdf6877)
2020-09-16 21:26:59 +03:00
Benny Halevy
d60bed1953 test: cql_query_test: test_cache_bypass: use table stats
test is currently flaky since system reads can happen
in the background and disturb the global row cache stats.

Use the table's row_cache stats instead.

Fixes #6773

Test: cql_query_test.test_cache_bypass(dev, debug)

Credit-to: Botond Dénes <bdenes@scylladb.com>
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20200811140521.421813-1-bhalevy@scylladb.com>
(cherry picked from commit 6deba1d0b4)
2020-09-16 18:19:30 +03:00
Dejan Mircevski
259203a394 cql3: Fix NULL reference in get_column_defs_for_filtering
There was a typo in get_column_defs_for_filtering(): it checked the
wrong pointer before dereferencing.  Add a test exposing the NULL
dereference and fix the typo.

Tests: unit (dev)

Fixes #7198.

Signed-off-by: Dejan Mircevski <dejan@scylladb.com>
(cherry picked from commit 9d02f10c71)
2020-09-16 15:47:04 +03:00
Avi Kivity
5f284633d4 reconcilable_result_builder: don't aggravate out-of-memory condition during recovery
Consider an unpaged query that consumes all of available memory, despite
fea5067dfa which limits them (perhaps the
user raised the limit, or this is a system query). Eventually we will see a
bad_alloc which will abort the query and destroy this reconcilable_result_builder.

During destruction, we first destroy _memory_accounter, and then _result.
Destroying _memory_accounter resumes some continuations which can then
allocate memory synchronously when increasing the task queue to accommodate
them. We will then crash. Had we not crashed, we would immediately afterwards
release _result, freeing all the memory that we would ever need.

Fix by making _result the last member, so it is freed first.

Fixes #7240.

(cherry picked from commit 9421cfded4)
2020-09-16 15:40:58 +03:00
Asias He
66cc4be8f6 storage_service: Fix a TOKENS update race for replace operation
In commit 7d86a3b208 (storage_service:
Make replacing node take writes), application state of TOKENS of the
replacing node is added into gossip and propagated to the cluster after
the initial start of gossip service. This can cause a race below

1. The replacing node replaces the old dead node with the same ip address
2. The replacing node starts gossip without application state of the TOKENS
3. Other nodes in the cluster replace the application states of old dead node's
   version with the new replacing node's version
4. replacing node dies
5. replace operation is performed again, the TOKENS application state is
   not present and replace operation fails.

To fix, we can always add TOKENS application state when the
gossip service starts.

Fixes: #7166
Backports: 4.1 and 4.2
(cherry picked from commit 3ba6e3d264)
2020-09-10 13:13:58 +03:00
Avi Kivity
9ca6aa5535 Merge "Fix repair stalls in get_sync_boundary and apply_rows_on_master_in_thread" from Asias
"
This patch set fixes stalls in repair that are caused by std::list merge and clear operations during test_latency_read_with_nemesis test.

Fixes #6940
Fixes #6975
Fixes #6976
"

* 'fix_repair_list_stall_merge_clear_v2' of github.com:asias/scylla:
  repair: Fix stall in apply_rows_on_master_in_thread and apply_rows_on_follower
  repair: Use clear_gently in get_sync_boundary to avoid stall
  utils: Add clear_gently
  repair: Use merge_to_gently to merge two lists
  utils: Add merge_to_gently

(cherry picked from commit 4547949420)
2020-09-10 13:13:54 +03:00
Avi Kivity
6e63db8c72 repair: apply_rows_on_follower(): remove copy of repair_rows list
We copy a list, which was reported to generate a 15ms stall.

This is easily fixed by moving it instead, which is safe since this is
the last use of the variable.

Fixes #7115.

(cherry picked from commit 6ff12b7f79)
2020-09-10 11:53:29 +03:00
Avi Kivity
803da18727 Update seastar submodule
* seastar 18275cbc0e...15cd93729f (1):
  > core/reactor: complete_timers(): restore previous scheduling group

Fixes #7184.
2020-09-07 11:33:06 +03:00
Raphael S. Carvalho
165d89860e compaction: Prevent non-regular compaction from picking compacting SSTables
After 8014c7124, cleanup can potentially pick a compacting SSTable.
Upgrade and scrub can also pick a compacting SSTable.
The problem is that table::candidates_for_compaction() was badly named.
It misleads the user into thinking that the SSTables returned are perfect
candidates for compaction, but manager still needs to filter out the
compacting SSTables from the returned set. So it's being renamed.

When the same SSTable is compacted in parallel, the strategy invariant
can be broken like overlapping being introduced in LCS, and also
some deletion failures as more than one compaction process would try
to delete the same files.

Let's fix scrub, cleanup and upgrade by calling the manager function
which gets the correct candidates for compaction.

Fixes #6938.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20200811200135.25421-1-raphaelsc@scylladb.com>
(cherry picked from commit 11df96718a)
2020-09-06 18:40:56 +03:00
Takuya ASADA
4a5116a0ae aws: update enhanced networking supported instance list
Sync enhanced networking supported instance list to latest one.

Reference: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/enhanced-networking.html

Fixes #6991

(cherry picked from commit 7cccb018b8)
2020-09-06 18:21:28 +03:00
Yaron Kaikov
6d9ff622df release: prepare for 4.1.6 2020-08-30 21:34:46 +03:00
Nadav Har'El
65bc33c921 redis: fix another use-after-free crash in "exists" command
Never trust Occam's Razor - it turns out that the use-after-free bug in the
"exists" command was caused by two separate bugs. We fixed one in commit
9636a33993, but there is a second one fixed in
this patch.

The problem fixed here was that a "service_permit" object, which is designed to
be copied around from place to place (it contains a shared pointer, so is cheap
to copy), was saved by reference, and the reference was to a function argument
and was destroyed prematurely.

This time I tested *many times* that test_strings.py passes on both dev and
debug builds.

Note that test/run/redis still fails in a debug build, but due to a different
problem.

Fixes #6469

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Reviewed-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20200825183313.120331-1-nyh@scylladb.com>
(cherry picked from commit 868194cd17)
2020-08-27 12:25:03 +03:00
Nadav Har'El
5e90f06ca2 redis: fix use-after-free crash in "exists" command
A missing "&" caused the key stored in a long-living command to be copied
and the copy quickly freed - and then used after freed.
This caused the test test_strings.py::test_exists_multiple_existent_key for
this feature to frequently crash.

Fixes #6469

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200823190141.88816-1-nyh@scylladb.com>
(cherry picked from commit 9636a33993)
2020-08-27 12:25:03 +03:00
Asias He
2036de3245 compaction_manager: Avoid stall in perform_cleanup
The following stall was seen during a cleanup operation:

scylla: Reactor stalled for 16262 ms on shard 4.

| std::_MakeUniq<locator::tokens_iterator_impl>::__single_object std::make_unique<locator::tokens_iterator_impl, locator::tokens_iterator_impl&>(locator::tokens_iterator_impl&) at /usr/include/fmt/format.h:1158
|  (inlined by) locator::token_metadata::tokens_iterator::tokens_iterator(locator::token_metadata::tokens_iterator const&) at ./locator/token_metadata.cc:1602
| locator::simple_strategy::calculate_natural_endpoints(dht::token const&, locator::token_metadata&) const at simple_strategy.cc:?
|  (inlined by) locator::simple_strategy::calculate_natural_endpoints(dht::token const&, locator::token_metadata&) const at ./locator/simple_strategy.cc:56
| locator::abstract_replication_strategy::get_ranges(gms::inet_address, locator::token_metadata&) const at /usr/include/fmt/format.h:1158
| locator::abstract_replication_strategy::get_ranges(gms::inet_address) const at /usr/include/fmt/format.h:1158
| service::storage_service::get_ranges_for_endpoint(seastar::basic_sstring<char, unsigned int, 15u, true> const&, gms::inet_address const&) const at /usr/include/fmt/format.h:1158
| service::storage_service::get_local_ranges(seastar::basic_sstring<char, unsigned int, 15u, true> const&) const at /usr/include/fmt/format.h:1158
|  (inlined by) operator() at ./sstables/compaction_manager.cc:691
|  (inlined by) _M_invoke at /usr/include/c++/9/bits/std_function.h:286
| std::function<std::vector<seastar::lw_shared_ptr<sstables::sstable>, std::allocator<seastar::lw_shared_ptr<sstables::sstable> > > (table const&)>::operator()(table const&) const at /usr/include/fmt/format.h:1158
|  (inlined by) compaction_manager::rewrite_sstables(table*, sstables::compaction_options, std::function<std::vector<seastar::lw_shared_ptr<sstables::sstable>, std::allocator<seastar::lw_shared_ptr<sstables::sstable> > > (table const&)>) at ./sstables/compaction_manager.cc:604
| compaction_manager::perform_cleanup(table*) at /usr/include/fmt/format.h:1158

To fix, we futurize the function to get local ranges and sstables.

In addition, this patch removes the dependency to global storage_service object.

Fixes #6662

(cherry picked from commit 07e253542d)
2020-08-27 12:25:03 +03:00
Raphael S. Carvalho
0924e4d92f sstables: optimize procedure that checks if a sstable needs cleanup
needs_cleanup() returns true if a sstable needs cleanup.

Turns out it's very slow because it iterates through all the local
ranges for all sstables in the set, making its complexity:
	O(num_sstables * local_ranges)

We can optimize it by taking into account that abstract_replication_strategy
documents that get_ranges() will return a list of ranges that is sorted
and non-overlapping. Compaction for cleanup already takes advantage of that
when checking if a given partition can be actually purged.

So needs_cleanup() can be optimized into O(num_sstables * log(local_ranges)).

With num_sstables=1000, RF=3, then local_ranges=256(num_tokens)*3, it means
the max # of checks performed will go from 768000 to ~9584.

Fixes #6730.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20200629171355.45118-2-raphaelsc@scylladb.com>
(cherry picked from commit cf352e7c14)
2020-08-27 12:25:03 +03:00
Raphael S. Carvalho
b8313775c5 sstables: export needs_cleanup()
May be needed elsewhere, like in a unit test.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20200629171355.45118-1-raphaelsc@scylladb.com>
(cherry picked from commit a9eebdc778)
2020-08-27 12:25:02 +03:00
Asias He
ec0002a67f abstract_replication_strategy: Add get_ranges_in_thread
Add a version that runs inside a seastar thread. The benefit is that
get_ranges can yield to avoid stalls.

Refs #6662

(cherry picked from commit 94995acedb)
2020-08-27 12:24:55 +03:00
Asias He
ebdf5f9e55 gossip: Fix race between shutdown message handler and apply_state_locally
1. The node1 is shutdown
2. The node1 sends shutdown message to node2
3. The node2 receives gossip shutdown message but the handler yields
4. The node1 is restarted
5. The node1 sends new gossip endpoint_state to node2, node2 applies the state
   in apply_state_locally and calls gossiper::handle_major_state_change
   and then calls gossiper::mark_alive
6. The shutdown message handler in step 3 resumes and sets status of node1 to SHUTDOWN
7. The gossiper::mark_alive fiber in step 5 resumes and calls gossiper::real_mark_alive,
   node2 will skip to mark node1 as alive because the status of node1 is
   SHUTDOWN. As a result, node1 is alive but it is not marked as UP by node2.

To fix, we serialize the two operations.

Fixes #7032

(cherry picked from commit e6ceec1685)
2020-08-27 11:15:59 +03:00
Nadav Har'El
32c0e4f110 alternator test: configurable temporary directory
The test/alternator/run script creates a temporary directory for the Scylla
database in /tmp. The assumption was that this is the fastest disk (usually
even a ramdisk) on the test machine, and we didn't need anything else from
it.

But it turns out that on some systems, /tmp is actually a slow disk, so
this patch adds a way to configure the temporary directory - if the TMPDIR
environment variable exists, it is used instead of /tmp. As before this
patch, a temporary subdirectory is created in $TMPDIR, and this subdirectory
is automatically deleted when the test ends.

The test.py script already passes an appropriate TMPDIR (testlog/$mode),
which after this patch the Alternator test will use instead of /tmp.

Fixes #6750

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200713193023.788634-1-nyh@scylladb.com>
(cherry picked from commit 8e3be5e7d6)
2020-08-26 19:37:38 +03:00
Nadav Har'El
5f48444a98 alternator: fix order conditions on binary attributes
We implemented the order operators (LT, GT, LE, GE, BETWEEN) incorrectly
for binary attributes: DynamoDB requires that the bytes be treated as
unsigned for the purpose of order (so byte 128 is higher than 127), but
our implementation uses Scylla's "bytes" type which has signed bytes.

The solution is simple - we can continue to use the "bytes" type, but
we need to use its compare_unsigned() function, not its "<" operator.

This bug affected conditional operations ("Expected" and
"ConditionExpression") and also filters ("QueryFilter", "ScanFilter",
"FilterExpression"). The bug did *not* affect Query's key conditions
("KeyConditions", "KeyConditionExpression") because those already
used Scylla's key comparison functions - which correctly compare binary
blobs as unsigned bytes (in fact, this is why we have the
compare_unsigned() function).

The patch also adds tests that reproduce the bugs in conditional
operations, and show that the bug did not exist in key conditions.

Fixes #6573

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200603084257.394136-1-nyh@scylladb.com>
(cherry picked from commit f6b1f45d69)
Manually removed tests in test_key_conditions.py that did not exist in this branch
2020-08-26 18:47:28 +03:00
Avi Kivity
8930ea5407 Merge "Unregister RPC verbs on stop" from Pavel E
"
There are 5 services, that register their RPC handlers in messaging
service, but quite a few of them unregister them on stop.

Unregistering is somewhat critical, not just because it makes the
code look clean, but also because unregistration does wait for the
message processing to complete, thus avoiding use-after-free's in
the handlers.

In particular, several handlers call service::get_schema_for_write()
which, in turn, may end up in service::maybe_sync() calling for
the local migration manager instance. All those handlers' processing
must be waited for before stopping the migration manager.

The set brings the RPC handlers unregistration in sync with the
registration part.

tests: unit (dev)
       dtest (dev: simple_boot_shutdown, repair)
       start-stop by hands (dev)
fixes: #6904
"

* 'br-rpc-unregister-verbs' of https://github.com/xemul/scylla:
  main: Add missing calls to unregister RPC handlers
  messaging: Add missing per-service unregistering methods
  messaging: Add missing handlers unregistration helpers
  streaming: Do not use db->invoke_on_all in vain
  storage_proxy: Detach rpc unregistration from stop
  main: Shorten call to storage_proxy::init_messaging_service

(cherry picked from commit 01b838e291)
2020-08-26 14:42:17 +03:00
Raphael S. Carvalho
311cd6403c cql3/statements: verify that counter column cannot be added into non-counter table
A check, to validate that counter column cannot be added into non-counter table,
is missing for alter table statement. Validation is performed when building new
schema, but it's limited to checking that a schema will not contain both counter
and non-counter columns.

Due to lack of validation, the added counter column could be incorrectly
persisted to the schema, but this results in a crash when setting the new
schema to its table. On restart, it can be confirmed that the schema change
was indeed persisted when describing the table.
This problem is fixed by doing proper validation for the alter table statement,
which consists of making sure a new counter column cannot be added to a
non-counter table.

The test cdc_disallow_cdc_for_counters_test is adjusted because one of its tests
was built on the assumption that counter column can be added into a non-counter
table.

Fixes #7065.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20200824155709.34743-1-raphaelsc@scylladb.com>
(cherry picked from commit 1c29f0a43d)
2020-08-25 18:45:30 +03:00
Takuya ASADA
b71821435a dist/debian: disable debuginfo compression on .deb
Since older binutils on some distributions are not able to handle
compressed debuginfo generated on Fedora, we need to disable it.
However, since the Debian packager forces debuginfo compression as of debian/compat = 9,
we have to uncompress them after they are compressed automatically.

Fixes #6982

(cherry picked from commit 75c2362c95)
2020-08-23 19:02:57 +03:00
Botond Dénes
cd29e2643c scylla-gdb.py: find_db(): don't return current shard's database for shard=0
The `shard` parameter of `find_db()` is optional and is defaulted to
`None`. When missing, the current shard's database instance is returned.
The problem is that the if condition checking this uses `not shard`,
which also evaluates to `True` if `shard == 0`, resulting in returning
the current shard's database instance for shard 0. Change the condition
to `shard is None` to avoid this.

Fixes: #7016
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200812091546.1704016-1-bdenes@scylladb.com>
(cherry picked from commit 4cfab59eb1)
2020-08-23 18:56:26 +03:00
Avi Kivity
59aa1834a7 Merge "repair: row_level: prevent deadlocks when repairing homogenous nodes" from Botond
"
This series backports the series "repair: row_level: prevent deadlocks
when repairing homogenous nodes" (merged as a9c7a1a86) to branch-4.1.
"

Fixes #6272

* 'repair-row-level-evictable-local-reader/branch-4.1' of https://github.com/denesb/scylla:
  repair: row_level: destroy reader on EOS or error
  repair: row_level: use evictable_reader for local reads
  mutation_reader: expose evictable_reader
  mutation_reader: evictable_reader: add auto_pause flag
  mutation_reader: make evictable_reader a flat_mutation_reader
  mutation_reader: s/inactive_shard_read/inactive_evictable_reader/
  mutation_reader: move inactive_shard_reader code up
  mutation_reader: fix indentation
  mutation_reader: shard_reader: extract remote_reader as evictable_reader
  mutation_reader: reader_lifecycle_policy: make semaphore() available early
2020-08-23 18:06:12 +03:00
Botond Dénes
436b305286 view_update_generator: fix race between registering and processing sstables
fea83f6 introduced a race between processing (and hence removing)
sstables from `_sstables_with_tables` and registering new ones. This
manifested in sstables that were added concurrently with processing a
batch for the same sstables being dropped and the semaphore units
associated with them not returned. This resulted in repairs being
blocked indefinitely as the units of the semaphore were effectively
leaked.

This patch fixes this by moving the contents of `_sstables_with_tables`
to a local variable before starting the processing. A unit test
reproducing the problem is also added.

Fixes: #6892

Tests: unit(dev)
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200817160913.2296444-1-bdenes@scylladb.com>
(cherry picked from commit 22a6493716)
2020-08-23 18:04:29 +03:00
Botond Dénes
1d85051e8d repair: row_level: destroy reader on EOS or error
To avoid having to make it an optional with all the additional checks,
we just replace it with an empty reader instead, this also achieves
the desired effect of releasing the read permit and all the associated
resources early.

(cherry picked from commit fbbc86e18c)
2020-08-20 16:10:16 +03:00
Botond Dénes
3f52d8733b repair: row_level: use evictable_reader for local reads
Row level repair, when using a local reader, is prone to deadlocking on
the streaming reader concurrency semaphore. This has been observed to
happen with at least two participating nodes, running more concurrent
repairs than the maximum allowed amount of reads by the concurrency
semaphore. In this situation, it is possible that two repair instances,
competing for the last available permits on both nodes, get a permit on
one of the nodes and get queued on the other one respectively. As
neither will let go of the permit it already acquired, nor give up
waiting on the failed-to-acquire permit, a deadlock happens.

To prevent this, we make the local repair reader evictable. For this we
reuse the newly exposed evictable reader.
The repair reader is paused after the repair buffer is filled, which is
currently 32MB, so the cost of a possible reader recreation is amortized
over 32MB read.

The repair reader is said to be local, when it can use the shard-local
partitioner. This is the case if the participating nodes are homogenous
(their shard configuration is identical), that is the repair instance
has to read just from one shard. A non-local reader uses the multishard
reader, which already makes its shard readers evictable and hence is not
prone to the deadlock described here.

(cherry picked from commit 080f00b99a)
2020-08-20 16:10:16 +03:00
Botond Dénes
eece444547 mutation_reader: expose evictable_reader
Expose functions for the outside world to create evictable readers. We
expose two functions, which create an evictable reader with
`auto_pause::yes` and `auto_pause::no` respectively. The function
creating the latter also returns a handle in addition to the reader,
which can be used to pause the reader.

(cherry picked from commit 542d9c3711)
2020-08-20 16:10:16 +03:00
Botond Dénes
2ab51c4055 mutation_reader: evictable_reader: add auto_pause flag
Currently the evictable reader unconditionally pauses the underlying
reader after each use (`fill_buffer()` or `fast_forward_to()` call).
This is fine for current users (the multishard reader), but the future
user we are doing all this refactoring for -- repair -- will want to
control when the underlying reader is paused "manually". Both these
behaviours can easily be supported in a single implementation, so we
add an `auto_pause` flag to allow the creator of the evictable reader
to control this.

(cherry picked from commit 1cc31deff9)
2020-08-20 16:10:16 +03:00
Botond Dénes
4a1a1feb55 mutation_reader: make evictable_reader a flat_mutation_reader
The `evictable_reader` class is almost a proper flat mutation reader
already, it roughly offers the same interface. This patch makes this
formal: changing the class to inherit from `flat_mutation_reader::impl`,
and implement all virtual methods. This also entails a departure from
using the lifecycle policy to pause/resume and create readers, instead
using more general building blocks like the reader concurrency semaphore
and a mutation source.

(cherry picked from commit af9e1c23e1)
2020-08-20 16:10:16 +03:00
Botond Dénes
76995933e0 mutation_reader: s/inactive_shard_read/inactive_evictable_reader/
Rename `inactive_shard_read` to `inactive_evictable_reader` to reflect
the fact that the evictable reader is going to be of general use,
not specific to the multishard reader.

(cherry picked from commit 4485864ada)
2020-08-20 16:10:16 +03:00
Botond Dénes
f840263fdd mutation_reader: move inactive_shard_reader code up
It will be used by the `evictable_reader` code too in the next patches.

(cherry picked from commit b6ed054c08)
2020-08-20 16:10:16 +03:00
Botond Dénes
b4887ce4a5 mutation_reader: fix indentation
Deferred from the previous patch.

(cherry picked from commit e3ea1c9080)
2020-08-20 16:10:16 +03:00
Botond Dénes
849e12bf2e mutation_reader: shard_reader: extract remote_reader as evictable_reader
We want to make the evictable reader mechanism used in the multishard
reader pipeline available for general (re)use, as a standalone
flat mutation reader implementation. The first step is extracting
`shard_reader::remote_reader` the class implementing this logic into a
top-level class, also renamed to `evictable_reader`.

(cherry picked from commit f9d1916499)
2020-08-20 16:10:16 +03:00
Botond Dénes
f124f97f99 mutation_reader: reader_lifecycle_policy: make semaphore() available early
Currently all reader lifecycle policy implementations assume that
`semaphore()` will only be called after at least one call to
`make_reader()`. This assumption will soon not hold, so make sure
`semaphore()` can be called at any time, including before any calls are
made to `make_reader()`.

(cherry picked from commit 63309f925c)
2020-08-20 16:10:16 +03:00
Botond Dénes
4ee0b489cf table: get_sstables_by_partition_key(): don't make a copy of selected sstables
Currently we assign the reference to the vector of selected sstables to
`auto sst`. This makes a copy and we pass this local variable to
`do_for_each()`, which will result in a use-after-free if the latter
defers.
Fix by not making a copy and instead just keep the reference.

Fixes: #7060

Tests: unit(dev)
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200818091241.2341332-1-bdenes@scylladb.com>
(cherry picked from commit 78f94ba36a)
2020-08-19 00:02:01 +03:00
Yaron Kaikov
382dcb9d34 release: prepare for 4.1.5 2020-08-17 21:55:06 +03:00
Calle Wilund
07b7df9171 cdc::log: Missing "preimage" check in row deletion pre-image
Fixes #6561

Pre-image generation in row deletion case only checked if we had a pre-image
result set row. But that can be from post-image. Also check actual existence
of the pre-image CK.
Message-Id: <20200608132804.23541-1-calle@scylladb.com>

(cherry picked from commit 5105e9f5e1)
2020-08-12 13:52:45 +03:00
Nadav Har'El
7fa3a988e3 Update Seastar submodule
> http: add "Expect: 100-continue" handling

Fixes #6844
2020-08-11 13:16:16 +03:00
Asias He
7b23574224 repair: Switch to btree_set for repair_hash.
In one of the longevity tests, we observed 1.3s reactor stall which came from
repair_meta::get_full_row_hashes_source_op. It traced back to a call to
std::unordered_set::insert() which triggered big memory allocation and
reclaim.

I measured std::unordered_set, absl::flat_hash_set, absl::node_hash_set
and absl::btree_set. The absl::btree_set was the only one that seastar
oversized allocation checker did not warn in my tests where around 300K
repair hashes were inserted into the container.

- unordered_set:
hash_sets=295634, time=333029199 ns

- flat_hash_set:
hash_sets=295634, time=312484711 ns

- node_hash_set:
hash_sets=295634, time=346195835 ns

- btree_set:
hash_sets=295634, time=341379801 ns

The btree_set is a bit slower than unordered_set but it does not have
huge memory allocation. I did not measure any real difference in the total time
to finish repair of the same dataset with unordered_set and btree_set.

To fix, switch to absl btree_set container.

Fixes #6190

(cherry picked from commit 67f6da6466)
(cherry picked from commit a27188886a)
2020-08-11 12:34:26 +03:00
Rafael Ávila de Espíndola
ac207c892b build: Link with abseil
It is a pity we have to list so many libraries, but abseil doesn't
provide a .pc file.

Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
(cherry picked from commit 7d1f6725dd)

Ref #6190.
2020-08-11 12:34:26 +03:00
Rafael Ávila de Espíndola
a023b3bb7a Add abseil as a submodule
This adds the https://abseil.io library as a submodule. The patch
series that follows needs a hash table that supports heterogeneous
lookup, and abseil has a really good hash table that supports that
(https://abseil.io/blog/20180927-swisstables).

The library is still not available in Fedora, but it is fairly easy to
use it directly from a submodule.

Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
(cherry picked from commit 383a9c6da9)

Ref #6190
2020-08-11 12:34:26 +03:00
Rafael Ávila de Espíndola
0b9db42d9c configure: Don't overwrite seastar_cflags
The variable seastar_cflags was being used for flags passed to seastar
and for flags extracted from the seastar.pc file.

This introduces a new variable for the flags extracted from the
seastar.pc file.

Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
(cherry picked from commit 2ad09aefb6)

Ref #6190.
2020-08-11 12:34:26 +03:00
Calle Wilund
df8d4482c5 database: Do not assert on replay positions if truncate does not flush
Fixes #6995

In c2c6c71 the assert on replay positions in flushed sstables discarded by
truncate was broken, by the fact that we no longer flush all sstables
unless auto snapshot is enabled.

This means the low_mark assertion does not hold, because we maybe/probably
never got around to creating the sstables that would hold said mark.

Note that the (old) change to not create sstables and then just delete
them is in itself good. But in that case we should not try to verify
the rp mark.

(cherry picked from commit 9620755c7f)
2020-08-10 23:33:39 +03:00
Avi Kivity
442d7bf9ff Update seastar submodule
* seastar c9c1dc5fa7...1337f1158b (1):
  > memory: fix small aligned free memory corruption

Fixes #6831
2020-08-09 18:37:32 +03:00
Avi Kivity
bc6422d16d Merge 'hinted handoff: fix commitlog memory leak' from Piotr D
"
When commitlog is recreated in hints manager, only shutdown() method is
called, but not release(). Because of that, some internal commitlog
objects (`segment_manager` and `segment`s) may be left pointing to each
other through shared_ptr reference cycles, which may result in memory
leak when the parent commitlog object is destroyed.

This PR prevents memory leaks that may happen this way by calling
release() after shutdown() from the hints manager.

Fixes: #6409, Fixes #6776
"

* piodul-fix-commitlog-memory-leak-in-hinted-handoff:
  hinted handoff: disable warnings about segments left on disk
  hinted handoff: release memory on commitlog termination

(cherry picked from commit 4c221855a1)
2020-08-09 17:25:57 +03:00
Yaron Kaikov
76f4bc4c6f release: prepare for 4.1.4 2020-08-09 08:49:19 +03:00
Tomasz Grabiec
dc4efb0a1e thrift: Fix crash on unsorted column names in SlicePredicate
The column names in SlicePredicate can be passed in arbitrary order.
We converted them to clustering ranges in read_command preserving the
original order. As a result, the clustering ranges in read command may
appear out of order. This violates storage engine's assumptions and
lead to undefined behavior.

It was seen manifesting as a SIGSEGV or an abort in sstable reader
when executing a get_slice() thrift verb:

scylla: sstables/consumer.hh:476: seastar::future<> data_consumer::continuous_data_consumer<StateProcessor>::fast_forward_to(size_t, size_t) [with StateProcessor = sstables::data_consume_rows_context_m; size_t = long unsigned int]: Assertion `end >= _stream_position.position' failed.

Fixes #6486.

Tests:

   - added a new dtest to thrift_tests.py which reproduces the problem

Message-Id: <1596725657-15802-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit bfd129cffe)
2020-08-08 19:48:25 +03:00
Rafael Ávila de Espíndola
f699d23f0b alternator: Fix use after return
Avoid a copy of timeout so that we don't end up with a reference to a
stack allocated variable.

Fixes #6897

Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200721184939.111665-1-espindola@scylladb.com>
(cherry picked from commit e83e91e352)
2020-08-03 22:36:37 +03:00
Nadav Har'El
d5e5a6fe48 alternator: fix Expected's "NULL" operator with missing AttributeValueList
The "NULL" operator in Expected (old-style conditional operations) doesn't
have any parameters, so we insisted that the AttributeValueList be empty.
However, we forgot to allow it to also be missing - a possibility which
DynamoDB allows.

This patch adds a test to reproduce this case (the test passes on DynamoDB,
fails on Alternator before this patch, and succeeds after this patch), and
a fix.

Fixes #6816.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200709161254.618755-1-nyh@scylladb.com>
(cherry picked from commit f549d147ea)
2020-08-03 20:42:15 +03:00
Takuya ASADA
5a43c6ec81 scylla_util.py: always use relocatable CLI tools
On some CLI tools, command options may differ between the latest version
and older versions.
To maximize compatibility of setup scripts, we should always use
relocatable CLI tools instead of distribution version of the tool.

Related #6954

(cherry picked from commit a19a62e6f6)
2020-08-03 10:41:57 +03:00
Takuya ASADA
2aae8bb206 create-relocatable-package.py: add lsblk for relocatable CLI tools
We need the latest version of lsblk, which supports partition type UUID.

Fixes #6954

(cherry picked from commit 6ba2a6c42e)
2020-08-03 10:41:52 +03:00
Juliusz Stasiewicz
c206399379 aggregate_fcts: Use per-type comparators for dynamic types
For collections and UDTs the `MIN()` and `MAX()` functions are
generated on the fly. Until now they worked by comparing just the
byte representations of arguments.

This patch uses specific per-type comparators to provide semantically
sensible, dynamically created aggregates.

Fixes #6768

(cherry picked from commit 5b438e79be)
2020-08-03 10:26:15 +03:00
Calle Wilund
787b324916 cql3::lists: Fix setter_by_uuid not handling null value
Fixes #6828

When using the scylla list index from UUID extension,
null values were not handled properly causing throws
from underlying layer.

(cherry picked from commit 3b74b9585f)
2020-08-03 10:20:14 +03:00
Takuya ASADA
dfe90a69f5 scylla_post_install.sh: generate memory.conf for CentOS7
On CentOS7, systemd does not support percentage-based parameter.
To apply memory parameter on CentOS7, we need to override the parameter
in bytes, instead of percentage.

Fixes #6783

(cherry picked from commit 3a25e7285b)
2020-07-30 16:41:24 +03:00
Tomasz Grabiec
d03d6f41c2 commitlog: Fix use-after-free on mutation object during replay
The mutation object may be freed prematurely during commitlog replay
in the schema upgrading path. We will hit the problem if the memtable
is full and apply_in_memory() needs to defer.

This will typically manifest as a segfault.

Fixes #6953

Introduced in 79935df

Tests:
  - manual using scylla binary. Reproduced the problem then verified the fix makes it go away

Message-Id: <1596044010-27296-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit 3486eba1ce)
2020-07-30 16:36:55 +03:00
Avi Kivity
0e86f1bf66 dist: debian: do not require root during package build
Debian package builds provide a root environment for the installation
scripts, since that's what typical installation scripts expect. To
avoid providing actual root, a "fakeroot" system is used where syscalls
are intercepted and any effect that requires root (like chown) is emulated.

However, fakeroot sporadically fails for us, aborting the package build.
Since our install scripts don't really require root (when operating in
the --packaging mode), we can just tell dpkg-buildpackage that we don't
need fakeroot. This ought to fix the sporadic failures.

As a side effect, package builds are faster.

Fixes #6655.

(cherry picked from commit b608af870b)
2020-07-29 16:03:33 +03:00
Takuya ASADA
392a007b3a scylla_setup: skip boot partition
On GCE, /dev/sda14 is reported as an unused disk, but it is the BIOS boot
partition. It should not be used as a scylla data partition, and it cannot be
used for that anyway since it's too small.

It's better to exclude such partitions from the unused disk list.

Fixes #6636

(cherry picked from commit d7de9518fe)
2020-07-29 09:50:19 +03:00
Asias He
254b898cd8 repair: Fix race between create_writer and wait_for_writer_done
We saw scylla hit use after free in repair with the following procedure during tests:

- n1 and n2 in the cluster

- n2 ran decommission

- n2 sent data to n1 using repair

- n2 was killed forcibly

- n1 tried to remove repair_meta for n1

- n1 hit use after free on repair_meta object

This was what happened on n1:

1) data was received -> do_apply_rows was called -> yield before create_writer() was called

2) repair_meta::stop() was called -> wait_for_writer_done() / do_wait_for_writer_done was called
   with _writer_done[node_idx] not engaged

3) step 1 resumed, create_writer() was called and _repair_writer object was referenced

4) repair_meta::stop() finished, repair_meta object and its member _repair_writer was destroyed

5) The fiber created by create_writer() at step 3 hit use after free on _repair_writer object

To fix, we should call wait_for_writer_done() only after all pending
operations protected by repair_meta::_gate are done. This prevents
wait_for_writer_done() from finishing while the writer is still in the
process of being created.

Fixes: #6853
Fixes: #6868
Backports: 4.0, 4.1, 4.2
(cherry picked from commit e6f640441a)
2020-07-29 09:50:15 +03:00
Raphael S. Carvalho
6fb84ed7e0 sstable: index_reader: Make sure streams are all properly closed on failure
Turns out the fix f591c9c710 wasn't enough to make sure all input streams
are properly closed on failure.
It only closes the main input stream that belongs to context, but it misses
all the input streams that can be opened in the consumer for promote index
reading. Consumer stores a list of indexes, where each of them has its own
input stream. On failure, we need to make sure that every single one of
them is properly closed before destroying the indexes as that could cause
memory corruption due to read ahead.

Fixes #6924.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20200727182214.377140-1-raphaelsc@scylladb.com>
(cherry picked from commit 0d70efa58e)
2020-07-29 09:48:48 +03:00
Yaron Kaikov
9002592ee0 release: prepare for 4.1.3 2020-07-29 08:26:06 +03:00
Botond Dénes
5d6a7272e7 sstables: clamp estimated_partitions to [1, +inf) in writers
In some cases the estimated number of partitions can be 0, which, albeit a
legit estimation result, breaks a lot of low-level sstable writer code, so
some of these have assertions to ensure estimated partitions is > 0.
To avoid hitting this assert all users of the sstable writers do the
clamping, to ensure estimated partitions is at least 1. However leaving
this to the callers is error prone, as #6913 has shown. As this
clamping is standard practice, it is better to do it in the writers
themselves, avoiding this problem altogether. This is exactly what this
patch does. It also adds two unit tests, one that reproduces the crash
in #6913, and another one that ensures all sstable writers are fine with
estimated partitions being 0 now. Call sites previously doing the
clamping are changed to not do it, it is unnecessary now as the writer
does it itself.

Fixes #6913

Tests: unit(dev)
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200724120227.267184-1-bdenes@scylladb.com>
[avi: adjust sstable_datafile_test's use of compaction_descriptor and make_permit]
(cherry picked from commit fe127a2155)
2020-07-28 09:55:34 +03:00
Piotr Sarna
96625fa54b Merge 'view_update_generator: use partitioned sstable set'
from Botond.

Recently it was observed (#6603) that since 4e6400293ea, the staging
reader is reading from a lot of sstables (200+). This consumes a lot of
memory, and after this reaches a certain threshold -- the entire memory
amount of the streaming reader concurrency semaphore -- it can cause a
deadlock within the view update generation. To reduce this memory usage,
we exploit the fact that the staging sstables are usually disjoint, and
use the partitioned sstable set to create the staging reader. This
should ensure that only the minimum number of sstable readers will be
opened at any time.

Refs: #6603
Fixes: #6707

Tests: unit(dev)

* 'view-update-generator-use-partitioned-set/v1' of https://github.com/denesb/scylla:
  db/view: view_update_generator: use partitioned sstable set
  sstables: make_partitioned_sstable_set(): return an sstable_set

(cherry picked from commit e4b74356bb)
2020-07-21 15:41:46 +03:00
Raphael S. Carvalho
4f5f404619 table: Fix Staging SSTables being incorrectly added or removed from the backlog tracker
Staging SSTables can be incorrectly added or removed from the backlog tracker,
after an ALTER TABLE or TRUNCATE, because the add and removal don't take
into account if the SSTable requires view building, so a Staging SSTable can
be added to the tracker after an ALTER TABLE, or removed after a TRUNCATE,
even though not added previously, potentially causing the backlog to
become negative.

Fixes #6798.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20200716180737.944269-1-raphaelsc@scylladb.com>
(cherry picked from commit b67066cae2)
2020-07-21 12:57:28 +03:00
Asias He
cd4502ee64 repair: Relax size check of get_row_diff and set_diff
In case of a row hash conflict, a hash in set_diff will get more than one
row from get_row_diff.

For example,

Node1 (Repair master):
row1  -> hash1
row2  -> hash2
row3  -> hash3
row3' -> hash3

Node2 (Repair follower):
row1  -> hash1
row2  -> hash2

We will have set_diff = {hash3} between node1 and node2, while
get_row_diff({hash3}) will return two rows: row3 and row3'. And the
error below was observed:

   repair - Got error in row level repair: std::runtime_error
   (row_diff.size() != set_diff.size())

In this case, node1 should send both row3 and row3' to the peer node
instead of failing the whole repair, because node2 has neither row3 nor
row3'; otherwise node1 wouldn't send rows with hash3 to node2 in the first
place.

Refs: #6252
(cherry picked from commit a00ab8688f)
2020-07-15 14:49:09 +03:00
Hagit Segev
3e6c6d5f58 release: prepare for 4.1.2 2020-07-14 23:56:02 +03:00
Avi Kivity
564b4c32b0 Update seastar submodule
* seastar 78f626af6c...c9c1dc5fa7 (2):
  > futures: Add a test for a broken promise in a parallel_for_each
  > future: Call set_to_broken_promise earlier

Fixes #6749 (probably).
2020-07-13 20:17:54 +03:00
Dmitry Kropachev
dfafc4e1a9 dist/common/scripts/scylla-housekeeping: wrap urllib.request with try ... except
We could hit "cannot serialize '_io.BufferedReader' object" when the request gets a 404 error from the server.
	Now you will get a legitimate error message in that case.

	Fixes #6690

(cherry picked from commit de82b3efae)
2020-07-09 18:25:16 +03:00
Dejan Mircevski
db286c5ca4 cql/restrictions: Handle WHERE a>0 AND a<0
WHERE clauses with start point above the end point were handled
incorrectly.  When the slice bounds are transformed to interval
bounds, the resulting interval is interpreted as wrap-around (because
start > end), so it contains all values above 0 and all values below
0.  This is clearly incorrect, as the user's intent was to filter out
all possible values of a.

Fix it by explicitly short-circuiting to false when start > end.  Add
a test case.

Fixes #5799.

Tests: unit (dev)

Signed-off-by: Dejan Mircevski <dejan@scylladb.com>
(cherry picked from commit 921dbd0978)
2020-07-08 13:21:00 +03:00
Botond Dénes
519fcd4729 db/view: view_update_generator: re-balance wait/signal on the register semaphore
The view update generator has a semaphore to limit concurrency. This
semaphore is waited on in `register_staging_sstable()` and later the
unit is returned after the sstable is processed in the loop inside
`start()`.
This was broken by 4e64002, which changed the loop inside `start()` to
process sstables in per table batches, however didn't change the
`signal()` call to return the amount of units according to the number of
sstables processed. This can cause the semaphore units to dry up, as the
loop can process multiple sstables per table but return just a single
unit. This can also block callers of `register_staging_sstable()`
indefinitely as some waiters will never be released as under the right
circumstances the units on the semaphore can permanently go below 0.
In addition to this, 4e64002 introduced another bug: table entries from
the `_sstables_with_tables` are never removed, so they are processed
every turn. If the sstable list is empty, there won't be any update
generated but due to the unconditional `signal()` described above, this
can cause the units on the semaphore to grow to infinity, allowing
future staging sstables producers to register a huge amount of sstables,
causing memory problems due to the amount of sstable readers that have
to be opened (#6603, #6707).
Both outcomes are equally bad. This patch fixes both issues and modifies
the `test_view_update_generator` unit test to reproduce them and hence
to verify that this doesn't happen in the future.

Fixes: #6774
Refs: #6707
Refs: #6603

Tests: unit(dev)
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200706135108.116134-1-bdenes@scylladb.com>
(cherry picked from commit 5ebe2c28d1)
2020-07-08 12:00:12 +03:00
Juliusz Stasiewicz
9bcbcbbcf2 counters: Read the state under timeout
Counter update is a RMW operation. Until now the "Read" part was
not guarded by a timeout, which is changed in this patch.

Fixes #5069

(cherry picked from commit e04fd9f774)
2020-07-07 20:45:01 +03:00
Takuya ASADA
c622e5bfab scylla_setup: don't add same disk device twice
We shouldn't accept adding same disk twice for RAID prompt.

Fixes #6711

(cherry picked from commit 835e76fdfc)
2020-07-07 13:08:22 +03:00
Nadav Har'El
905643bbc2 docker: add option to start Alternator with HTTPS
We already have a docker image option to enable alternator on an unencrypted
port, "--alternator-port", but we forgot to also allow the similar option
for enabling alternator on an encrypted (HTTPS) port: "--alternator-https-port"
so this patch adds the missing option, and documents how to use it.

Note that using this option is not enough. When this option is used,
Alternator also requires two files, /etc/scylla/scylla.crt and
/etc/scylla/scylla.key, to be inserted into the image. These files should
contain the SSL certificate, and key, respectively. If these files are
missing, you will get an error in the log about the missing file.

Fixes #6583.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200621125219.12274-1-nyh@scylladb.com>
(cherry picked from commit e4eca5211a)
2020-07-06 08:22:22 +02:00
Juliusz Stasiewicz
d396a298d6 cdc: Fix segfault when stream ID key is too short
When a token is calculated for stream_id, we check that the key is
exactly 16 bytes long. If it's not - `minimum_token` is returned
and client receives empty result.

This used to be the expected behavior for empty keys; now it's
extended to keys of any incorrect length.

Fixes #6570

(cherry picked from commit 8628ede009)
2020-07-05 15:09:44 +03:00
Asias He
1d9bbbc957 boot_strapper: Ignore node to be replaced explicitly as stream source
After commit 7d86a3b208 (storage_service:
Make replacing node take writes), during replace operation, tokens in
_token_metadata for node being replaced are updated only after the replace
operation is finished. As a result, in range_streamer::add_ranges, the
node being replaced will be considered as a source to stream data from.

Before commit 7d86a3b208, the node being
replaced will not be considered as a source node because it is already
replaced by the replacing node before the replace operation is finished.
This is the reason why it works in the past.

To fix, filter out the node being replaced as a source node explicitly.

Tests: replace_first_boot_test and replace_stopped_node_test
Backports: 4.1
Fixes: #6728
(cherry picked from commit e338028b7e22b0a80be7f80c337c52f958bfe1d7)
2020-07-01 14:35:28 +03:00
Raphael S. Carvalho
4f1878803e compaction: Fix the 2x disk space requirement in SSTable upgrade
SSTable upgrade is requiring 2x the space of input SSTables because
we aren't releasing references of the SSTables that were already
upgraded. So if we're upgrading 1TB, it means that up to 2TB may be
required for the upgrade operation to succeed.

That can be fixed by moving all input SSTables when rewrite_sstables()
asks for the set of SSTables to be compacted, so allowing their space
to be released as soon as there is no longer any ref to them.

Spotted while auditing code.

Fixes #6682.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20200619205701.92891-1-raphaelsc@scylladb.com>
(cherry picked from commit 52180f91d4)
2020-07-01 12:36:52 +03:00
Avi Kivity
c5e2fad1c8 Merge "Fix handling of decimals with negative scales" from Rafael
"
Before this series scylla would effectively infinite loop when, for
example, casting a decimal with a negative scale to float.

Fixes #6720
"

* 'espindola/fix-decimal-issue' of https://github.com/espindola/scylla:
  big_decimal: Add a test for a corner case
  big_decimal: Correctly handle negative scales
  big_decimal: Add a as_rational member function
  big_decimal: Move constructors out of line

(cherry picked from commit 3e2eeec83a)
2020-06-29 12:05:39 +03:00
Hagit Segev
abd0fa52c0 release: prepare for 4.1.1 2020-06-25 08:06:32 +03:00
Piotr Sarna
dfa464c35b alternator: fix propagating tags
Updating tags was erroneously done locally, which means that
the schema change was not propagated to other nodes.
The new code announces new schema globally.

Fixes #6513
Branches: 4.0,4.1
Tests: unit(dev)
       dtest(alternator_tests.AlternatorTest.test_update_condition_expression_and_write_isolation)
Message-Id: <3a816c4ecc33c03af4f36e51b11f195c231e7ce1.1592935039.git.sarna@scylladb.com>

(cherry picked from commit f4e8cfe03b)
2020-06-24 13:56:09 +03:00
Avi Kivity
be29b35c4b Merge 'range_streamer: Handle table of RF 1 in get_range_fetch_map' from Asias
"
After "Make replacing node take writes" series, with repair based node
operations disabled, we saw the replace operation fail like:

```
[shard 0] init - Startup failed: std::runtime_error (unable to find
sufficient sources for streaming range (9203926935651910749, +inf) in
keyspace system_auth)
```
The reason is the system_auth keyspace has default RF of 1. It is
impossible to find a source node to stream from for the ranges owned by
the replaced node.

In the past, the replace operation with keyspace of RF 1 passes, because
the replacing node calls token_metadata.update_normal_tokens(tokens,
ip_of_replacing_node) before streaming. We saw:

```
[shard 0] range_streamer - Bootstrap : keyspace system_auth range
(-9021954492552185543, -9016289150131785593] exists on {127.0.0.6}
```

Node 127.0.0.6 is the replacing node 127.0.0.5. The source node check in
range_streamer::get_range_fetch_map will pass if the source is the node
itself. However, it will not stream from the node itself. As a result,
the system_auth keyspace will not get any data.

After the "Make replacing node take writes" series, the replacing node
calls token_metadata.update_normal_tokens(tokens, ip_of_replacing_node)
after the streaming finishes. We saw:

```
[shard 0] range_streamer - Bootstrap : keyspace system_auth range
(-9049647518073030406, -9048297455405660225] exists on {127.0.0.5}
```

Since 127.0.0.5 was dead, the source node check failed, and so did the
bootstrap operation.

To fix, we ignore the table of RF 1 when it is unable to find a source
node to stream.

Fixes #6351
"

* asias-fix_bootstrap_with_rf_one_in_range_streamer:
  range_streamer: Handle table of RF 1 in get_range_fetch_map
  streaming: Use separate streaming reason for replace operation

(cherry picked from commit 9afd599d7c)
2020-06-23 13:53:03 +03:00
Asias He
97b7024c0c streaming: Do not send end of stream in case of error
Current sender sends stream_mutation_fragments_cmd::end_of_stream to
receiver when an error is received from a peer node. To be safe, send
stream_mutation_fragments_cmd::error instead of
stream_mutation_fragments_cmd::end_of_stream to prevent end_of_stream to
be written into the sstable when a partition is not closed yet.

In addition, use mutation_fragment_stream_validator to validate the
mutation fragments emitted from the reader, e.g., check if
partition_start and partition_end are paired when the reader is done. If
not, fail the stream session and send
stream_mutation_fragments_cmd::error instead of
stream_mutation_fragments_cmd::end_of_stream to isolate the problematic
sstables on the sender node.

Refs: #6478
(cherry picked from commit a521c429e1)
2020-06-23 12:47:35 +03:00
Alejo Sanchez
194ff1d226 lwt: validate before constructing metadata
LWT batch conditions can't span multiple tables.
This was detected in batch_statement::validate() called in ::prepare().
But ::cas_result_set_metadata() was built in the constructor,
causing a bitset assert/crash in a reported scenario.
This patch moves validate() to the constructor before building metadata.

Closes #6332

Tested with https://github.com/scylladb/scylla-dtest/pull/1465

[avi: adjust spelling of exception message to 4.1 spelling]

Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
(cherry picked from commit d1521e6721)
2020-06-21 18:20:41 +03:00
Gleb Natapov
b8f7fb35e1 cql transport: do not log broken pipe error when a client closes its side of a connection abruptly
Fixes #5661

Message-Id: <20200615075958.GL335449@scylladb.com>
(cherry picked from commit 7ca937778d)
2020-06-21 13:08:58 +03:00
Amnon Heiman
f7d53ff607 api/storage_service.cc: stream result of token_range
The get token range API can become big which can cause large allocation
and stalls.

This patch replaces the implementation so that it streams the results
using the http stream capabilities instead of serialization and sending
one big buffer.

Fixes #6297

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
(cherry picked from commit 7c4562d532)
2020-06-21 12:57:15 +03:00
Rafael Ávila de Espíndola
eb190643f8 configure: Reduce the dynamic linker path size
gdb has a SO_NAME_MAX_PATH_SIZE of 512, so we use that as the path
size.

Fixes: #6494

Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200528202741.398695-2-espindola@scylladb.com>
(cherry picked from commit aa778ec152)
2020-06-21 12:26:51 +03:00
Piotr Sarna
3f8345f1b8 alternator: fix the return type of PutItem
Even if there are no attributes to return from PutItem requests,
we should return a valid JSON object, not an empty string.

Fixes #6568
Tests: unit(dev)

(cherry picked from commit 8fc3ca855e)
2020-06-21 12:21:19 +03:00
Piotr Sarna
891a3fa243 alternator: fix returning UnprocessedKeys unconditionally
Client libraries (e.g. PynamoDB) expect the UnprocessedKeys
and UnprocessedItems attributes to appear in the response
unconditionally - it's hereby added, along with a simple test case.

Fixes #6569
Tests: unit(dev)

(cherry picked from commit 3aff52f56e)
2020-06-21 12:19:18 +03:00
Tomasz Grabiec
db31542805 row_cache: Fix undefined behavior on key linearization
This is relevant only when using partition or clustering keys which
have a representation in memory which is larger than 12.8 KB (10% of
LSA segment size).

There are several places in code (cache, background garbage
collection) which may need to linearize keys because of performing key
comparison, but it's not done safely:

 1) the code does not run with the LSA region locked, so pointers may
get invalidated on linearization if it needs to reclaim memory. This
is fixed by running the code inside an allocating section.

 2) LSA region is locked, but the scope of
with_linearized_managed_bytes() encloses the allocating section. If
allocating section needs to reclaim, linearization context will
contain invalidated pointers. The fix is to reorder the scopes so
that linearization context lives within an allocating section.

Example of 1 can be found in
range_populating_reader::handle_end_of_stream() where it performs a
lookup:

  auto prev = std::prev(it);
  if (prev->key().equal(*_cache._schema, *_last_key->_key)) {
     it->set_continuous(true);

but handle_end_of_stream() is not invoked under allocating section.

Example of 2 can be found in mutation_cleaner_impl::merge_some() where
it does:

  return with_linearized_managed_bytes([&] {
  ...
    return _worker_state->alloc_section(region, [&] {

Fixes #6637.
Refs #6108.

Tests:

  - unit (all)

Message-Id: <1592218544-9435-1-git-send-email-tgrabiec@scylladb.com>
(cherry picked from commit e81fc1f095)
2020-06-21 11:56:31 +03:00
Yaron Kaikov
b443b2574a release: prepare for 4.1.0 2020-06-18 14:42:57 +03:00
Asias He
2ee321d88e gossip: Do not send shutdown message when a node is in unknown status
When a replacing node is in early boot up and is not in HIBERNATE state
yet, if the node is killed by a user, the node will wrongly send a
shutdown message to other nodes. This is because UNKNOWN is not in
SILENT_SHUTDOWN_STATES, so in gossiper::do_stop_gossiping, the node will
send shutdown message. Other nodes in the cluster will call
storage_service::handle_state_normal for this node, since NORMAL and
SHUTDOWN status share the same status handler. As a result, other nodes
will incorrectly think the node is part of the cluster and the replace
operation is finished.

Such problem was seen in replace_node_no_hibernate_state_test dtest:

   n1, n2 are in the cluster
   n2 is dead
   n3 is started to replace n2, but n3 is killed in the middle
   n3 announces SHUTDOWN status wrongly
   n1 runs storage_service::handle_state_normal for n3
   n1 gets tokens for n3, which are empty, because n3 hasn't gossiped tokens yet
   n1 skips updating normal tokens for n3, but thinks n3 has replaced n2
   n4 starts to replace n2
   n4 checks the tokens for n2 in storage_service::join_token_ring (Cannot
      replace token {} which does not exist!) or
      storage_service::prepare_replacement_info (Cannot replace_address {}
      because it doesn't exist in gossip)

To fix, we add UNKNOWN into SILENT_SHUTDOWN_STATES and avoid sending
shutdown message.

Tests: replace_address_test.py:TestReplaceAddress.replace_node_no_hibernate_state_test
Fixes: #6436
(cherry picked from commit dddde33512)
2020-06-16 15:03:48 +03:00
Avi Kivity
4563f4b992 tools: toolchain: regenerate for gnutls 3.6.14
CVE-2020-13777.

Fixes #6627.

Toolchain source image registry disambiguated due to tighter podman defaults.
2020-06-15 07:49:21 +03:00
Kamil Braun
81dc8eeec7 cdc: rename CDC description tables
Commit 968177da04 has changed the schema
of cdc_topology_description and cdc_description tables in the
system_distributed keyspace.

Unfortunately this was a backwards-incompatible change: these tables
would always be created, irrespective of whether or not "experimental"
was enabled. They just wouldn't be populated with experimental=off.

If the user now tries to upgrade Scylla from a version before this change
to a version after this change, it will work as long as CDC is protected
by the experimental flag and the flag is off.

However, if we drop the flag, or if the user turns experimental on,
weird things will happen, such as nodes refusing to start because they
try to populate cdc_topology_description while assuming a different schema
for this table.

The simplest fix for this problem is to rename the tables. This fix must
get merged in before CDC goes out of experimental.
If the user upgrades his cluster from a pre-rename version, he will simply
have two garbage tables that he is free to delete after upgrading.

sstables and digests need to be regenerated for schema_digest_test since
this commit effectively adds new tables to the system_distributed keyspace.
This doesn't result in schema disagreement because the table is
announced to all nodes through the migration manager.

(cherry picked from commit d89b7a0548)
Fixes #6537.
2020-06-14 09:15:36 +03:00
Raphael S. Carvalho
2d72f7d8e5 compaction: Disable garbage collected writer if interposer consumer is used
GC writer, used for incremental compaction, cannot be currently used if interposer
consumer is used. That's because compaction assumes that GC writer will be operated
only by a single compaction writer at a given point in time.
With interposer consumer, multiple writers will concurrently operate on the same
GC writer, leading to a race condition which can potentially result in use-after-free.

Let's disable GC writer if interposer consumer is enabled. We're not losing anything
because GC writer is currently only needed on strategies which don't implement an
interposer consumer. Resharding will always disable GC writer, which is the expected
behavior because it doesn't support incremental compaction yet.
The proper fix, which allows GC writer and interposer consumer to work together,
will require more time to implement and test, and for that reason, I am postponing
it as #6472 is a showstopper for the current release.

Fixes #6472.

tests: mode(dev).

[Raphael: Fixed compilation failure in unit test test_bug_6472 for backport]

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Reviewed-by: Glauber Costa <glauber@scylladb.com>
(cherry picked from commit 097a5e9e07)
Message-Id: <20200610203928.86717-1-raphaelsc@scylladb.com>
2020-06-11 13:21:56 +03:00
Takuya ASADA
c6ee86b512 aws: update enhanced networking supported instance list
Sync enhanced networking supported instance list to latest one.

Reference: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/enhanced-networking.html

Fixes #6540

(cherry picked from commit 969c4258cf)
2020-06-09 16:02:09 +03:00
Hagit Segev
67348cd6e8 release: prepare for 4.1.rc2 2020-06-08 16:37:36 +03:00
Israel Fruchter
44cc4843f1 fix "scylla_coredump_setup: Remove the coredump create by the check"
In 28c3d4 `out()` was used without `shell=True`, and the splitting of arguments
failed because of the complex commands in the cmd (pipe and such)

Fixes #6159

(cherry picked from commit a2bb48f44b)
2020-06-04 20:54:51 +03:00
Israel Fruchter
f1f5586bf6 scylla_coredump_setup: Remove the coredump create by the check
We generate a coredump as part of "scylla_coredump_setup" to verify that
coredumps are working. However, we need to *remove* that test coredump
to avoid people and test infrastructure reporting those coredumps.

Fixes #6159

(cherry picked from commit 28c3d4f8e8)
2020-06-03 16:52:51 +03:00
Amos Kong
3a447cd755 active the coredump directory mount during coredump setup
Currently we use a systemd mount (var-lib-systemd-coredump.mount) to mount
default coredump directory (/var/lib/systemd/coredump) to
(/var/lib/scylla/coredump). The /var/lib/scylla had been mounted to a big
storage, so we will have enough space for coredump after the mount.

Currently in coredump_setup, we only enable var-lib-systemd-coredump.mount,
but do not start it. The directory won't be mounted after coredump_setup, so the
coredump will still be saved to the default coredump directory.
The mount will only take effect after reboot.

Fixes #6566

(cherry picked from commit abf246f6e5)
2020-06-03 09:25:59 +03:00
Pekka Enberg
176aa91be5 Revert "scylla_coredump_setup: Fix incorrect coredump directory mount"
This reverts commit e77dad3adf because it's
incorrect.

Amos explains:

"Quote from https://www.freedesktop.org/software/systemd/man/systemd.mount.html

 What=

   Takes an absolute path of a device node, file or other resource to
   mount. See mount(8) for details. If this refers to a device node, a
   dependency on the respective device unit is automatically created.

 Where=

   Takes an absolute path of a file or directory for the mount point; in
   particular, the destination cannot be a symbolic link. If the mount
   point does not exist at the time of mounting, it is created as
   directory.

 So the mount point is '/var/lib/systemd/coredump' and
 '/var/lib/scylla/coredump' is the file to mount, because /var/lib/scylla
 had mounted a second big storage, which has enough space for Huge
 coredumps.

 Bentsi or other touched problem with old scylla-master AMI, a coredump
 occurred but not successfully saved to disk for enospc.  The directory
 /var/lib/systemd/coredump wasn't mounted to /var/lib/scylla/coredump.
 They WRONGLY thought the wrong mount was caused by the config problem,
 so he posted a fix.

 Actually scylla-ami-setup / coredump wasn't executed on that AMI, err:
 unit scylla-ami-setup.service not found Because
 'scylla-ami-setup.service' config file doesn't exist or is invalid.

 Details of my testing: https://github.com/scylladb/scylla/issues/6300#issuecomment-637324507

 So we need to revert Bentsi's patch, it changed the right config to wrong."

(cherry picked from commit 9d9d54c804)
2020-06-03 09:25:49 +03:00
Avi Kivity
4a3eff17ff Revert "Revert "config: Do not enable repair based node operations by default""
This reverts commit 71d0d58f8c. Repair-based
node operations are still not ready.
2020-06-02 18:08:03 +03:00
Nadav Har'El
2e00f6d0a1 alternator: fix support for bytes type in Query's KeyConditions
Our parsing of values in a KeyConditions parameter of Query was done naively.
As a result, we got bizarre error messages "condition not met: false" when
these values had incorrect type (this is issue #6490). Worse - the naive
conversion did not decode base64-encoded bytes value as needed, so
KeyConditions on bytes-typed keys did not work at all.

This patch fixes these bugs by using our existing utility function
get_key_from_typed_value(), which takes care of throwing sensible errors
when types don't match, and decoding base64 as needed.

Unfortunately, we didn't have test coverage for many of the KeyConditions
features including bytes keys, which is why this issue escaped detection.
A patch will follow with much more comprehensive tests for KeyConditions,
which also reproduce this issue and verify that it is fixed.

Refs #6490
Fixes #6495

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200524141800.104950-1-nyh@scylladb.com>
(cherry picked from commit 6b38126a8f)
2020-05-31 13:53:45 +03:00
Nadav Har'El
bf509c3b16 alternator: add mandatory configurable write isolation mode
Alternator supports four ways in which write operations can use quorum
writes or LWT or both, which we called "write isolation policies".

Until this patch, Alternator defaulted to the most generally safe policy,
"always_use_lwt". This default could have been overridden for each table
separately, but there was no way to change this default for all tables.
This patch adds a "--alternator-write-isolation" configuration option which
allows changing the default.

Moreover, @dorlaor asked that users must *explicitly* choose this default
mode, and not get "always_use_lwt" without noticing. The previous default,
"always_use_lwt" supports any workload correctly but because it uses LWT
for all writes it may be disappointingly slow for users who run write-only
workloads (including most benchmarks) - such users might find the slow
writes so disappointing that they will drop Scylla. Conversely, a default
of "forbid_rmw" will be faster and still correct, but will fail on workloads
which need read-modify-write operations - and surprise users that need these
operations. So Dor asked that *none* of the write modes be made the
default, and users must make an informed choice between the different write
modes, rather than being disappointed by a default choice they weren't
aware of.

So after this patch, Scylla refuses to boot if Alternator is enabled but
a "--alternator-write-isolation" option is missing.

The patch also modifies the relevant documentation, adds the same option to
our docker image, and modifies the test-running script
test/alternator/run to run Scylla with the old default mode (always_use_lwt),
which we need because we want to test RMW operations as well.

Fixes #6452

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200524160338.108417-1-nyh@scylladb.com>
(cherry picked from commit c3da9f2bd4)
2020-05-31 13:42:11 +03:00
Avi Kivity
84ef30752f Update seastar submodule
* seastar e708d1df3a...78f626af6c (1):
  > reactor: don't mlock all memory at once

Fixes #6460.
2020-05-31 13:34:42 +03:00
Avi Kivity
f1b71ec216 Point seastar submodule at scylla-seastar.git
This allows us to backport seastar patches to the 4.1 branch.
2020-05-31 13:34:42 +03:00
Piotr Sarna
93ed536fba alternator: wait for schema agreement after table creation
In order to be sure that all nodes acknowledged that a table was
created, the CreateTable request will now only return after
seeing that schema agreement was reached.
Rationale: alternator users check if the table was created by issuing
a DescribeTable request, and assume that the table was correctly
created if it returns nonempty results. However, our current
implementation of DescribeTable returns local results, which is
not enough to judge if all the other nodes acknowledge the new table.
CQL drivers are reported to always wait for schema agreement after
issuing DDL-changing requests, so there should be no harm in waiting
a little longer for alternator's CreateTable as well.

Fixes #6361
Tests: alternator(local)

(cherry picked from commit 5f2eadce09)
2020-05-31 13:18:11 +03:00
Nadav Har'El
ab3da4510c docs, alternator: improve description of status of global tables support
The existing text did not explain what happens if additional DCs are added
to the cluster, so this patch improves the explanation of the status of
our support for global tables, including that issue.

Fixes #6353

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200513175908.21642-1-nyh@scylladb.com>
(cherry picked from commit f3fd976120)
2020-05-31 13:13:13 +03:00
Asias He
bb8fcbff68 repair: Abort the queue in write_end_of_stream in case of error
In write_end_of_stream, it does:

1) Write write_partition_end
2) Write empty mutation_fragment_opt

If 1) fails, 2) will be skipped, the consumer of the queue will wait for
the empty mutation_fragment_opt forever.

Found this issue when injecting random exceptions between 1) and 2).

Refs #6272
Refs #6248

(cherry picked from commit b744dba75a)
2020-05-27 20:11:30 +03:00
Hagit Segev
af43d0c62d release: prepare for 4.1.rc1 2020-05-26 18:57:30 +03:00
Amnon Heiman
8c8c266f67 storage_service: get_range_to_address_map prevent use after free
The implementation of get_range_to_address_map has a default behaviour,
when getting an empty keyspace, it uses the first non-system keyspace
(first here is basically, just a keyspace).

The current implementation has two issues, first, it uses a reference to
a string that is held on a stack of another function. In other words,
there's a use after free that is not clear why we never hit.

The second, it calls get_non_system_keyspaces twice. Though this is not
a bug, it's redundant (get_non_system_keyspaces uses a loop, so calling
that function does have a cost).

This patch solves both issues, by changing the implementation to hold a
string instead of a reference to a string.

Second, it stores the results from get_non_system_keyspaces and reuses
them, which is more efficient and holds the returned values on the local
stack.

Fixes #6465

Signed-off-by: Amnon Heiman <amnon@scylladb.com>
(cherry picked from commit 69a46d4179)
2020-05-25 12:48:11 +03:00
Nadav Har'El
6d1301d93c alternator: better error messages when 'forbid_rmw' mode is on
When the 'forbid_rmw' write isolation policy is selected, read-modify-write
are intentionally forbidden. The error message in this case used to say:

	"Read-modify-write operations not supported"

Which can lead users to believe that this operation isn't supported by this
version of Alternator - instead of realizing that this is in fact a
configurable choice.

So in this patch we just change the error message to say:

	"Read-modify-write operations are disabled by 'forbid_rmw' write isolation policy. Refer to https://github.com/scylladb/scylla/blob/master/docs/alternator/alternator.md#write-isolation-policies for more information."

Fixes #6421.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200518125538.8347-1-nyh@scylladb.com>
(cherry picked from commit 5ef9854e86)
2020-05-25 08:49:48 +03:00
Tomasz Grabiec
be545d6d5d sstables: index_reader: Fix overflow when calculating promoted index end
When index file is larger than 4GB, offset calculation will overflow
uint32_t and _promoted_index_end will be too small.

As a result, promoted_index_size calculation will underflow and the
rest of the page will be interpreted as a promoted index.

The partitions which are in the remainder of the index page will not
be found by single-partition queries.

Data is not lost.

Introduced in 6c5f8e0eda.

Fixes #6040
Message-Id: <20200521174822.8350-1-tgrabiec@scylladb.com>

(cherry picked from commit a6c87a7b9e)
2020-05-24 09:45:42 +03:00
Rafael Ávila de Espíndola
a1c15f0690 repair: Make sure sinks are always closed
In a recent next failure I got the following backtrace

    function=function@entry=0x270360 "seastar::rpc::sink_impl<Serializer, Out>::~sink_impl() [with Serializer = netw::serializer; Out = {repair_row_on_wire_with_cmd}]") at assert.c:101
    at ./seastar/include/seastar/core/shared_ptr.hh:463
    at repair/row_level.cc:2059

This patch changes a few functions to use finally to make sure the sink
is always closed.

Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200515202803.60020-1-espindola@scylladb.com>
(cherry picked from commit 311fbe2f0a)

Ref #6414
2020-05-20 09:00:10 +03:00
Asias He
4d68c53389 repair: Fix race between write_end_of_stream and apply_rows
Consider: n1, n2, n1 is the repair master, n2 is the repair follower.

=== Case 1 ===
1) n1 sends missing rows {r1, r2} to n2
2) n2 runs apply_rows_on_follower to apply rows, e.g., {r1, r2}, r1
   is written to sstable, r2 is not written yet, r1 belongs to
   partition 1, r2 belongs to partition 2. It yields after row r1 is
   written.
   data: partition_start, r1
3) n1 sends repair_row_level_stop to n2 because error has happened on n1
4) n2 calls wait_for_writer_done() which in turn calls write_end_of_stream()
   data: partition_start, r1, partition_end
5) Step 2 resumes to apply the rows.
   data: partition_start, r1, partition_end, partition_end, partition_start, r2

=== Case 2 ===
1) n1 sends missing rows {r1, r2} to n2
2) n2 runs apply_rows_on_follower to apply rows, e.g., {r1, r2}, r1
   is written to sstable, r2 is not written yet, r1 belongs to partition
   1, r2 belongs to partition 2. It yields after partition_start for r2
   is written but before _partition_opened is set to true.
   data: partition_start, r1, partition_end, partition_start
3) n1 sends repair_row_level_stop to n2 because error has happened on n1
4) n2 calls wait_for_writer_done() which in turn calls write_end_of_stream().
   Since _partition_opened[node_idx] is false, partition_end is skipped,
   end_of_stream is written.
   data: partition_start, r1, partition_end, partition_start, end_of_stream

This causes unbalanced partition_start and partition_end in the stream
written to sstables.

To fix, serialize the write_end_of_stream and apply_rows with a semaphore.

Fixes: #6394
Fixes: #6296
Fixes: #6414
(cherry picked from commit b2c4d9fdbc)
2020-05-20 08:07:53 +03:00
Piotr Dulikowski
7d1f352be2 hinted handoff: don't keep positions of old hints in rps_set
When sending hints from one file, rps_set field in send_one_file_ctx
keeps track of commitlog positions of hints that are being currently
sent, or have failed to be sent. At the end of the operation, if sending
of some hints failed, we will choose position of the earliest hint that
failed to be sent, and will retry sending that file later, starting from
that position. This position is stored in _last_not_complete_rp.

Usually, this set has a bounded size, because we impose a limit of at
most 128 hints being sent concurrently. Because we do not attempt to
send any more hints after a failure is detected, rps_set should not have
more than 128 elements at a time.

Due to a bug, commitlog positions of old hints (older than
gc_grace_seconds of the destination table) were inserted into rps_set
but not removed after checking their age. This could cause rps_set to
grow very large when replaying a file with old hints.

Moreover, if the file mixed expired and non-expired hints (which could
happen if it had hints to two tables with different gc_grace_seconds),
and sending of some non-expired hints failed, then positions of expired
hints could influence the calculation of _last_not_complete_rp, and more hints
than necessary would be resent on the next retry.

This simple patch removes commitlog position of a hint from rps_set when
it is detected to be too old.

Fixes #6422

(cherry picked from commit 85d5c3d5ee)
2020-05-20 08:05:51 +03:00
Piotr Dulikowski
0fe5335447 hinted handoff: remove discarded hint positions from rps_set
Related commit: 85d5c3d

When attempting to send a hint, an exception might occur that results in
that hint being discarded (e.g. keyspace or table of the hint was
removed).

When such an exception is thrown, position of the hint will already be
stored in rps_set. We are only allowed to retain positions of hints that
failed to be sent and needed to be retried later. Dropping a hint is not
an error, therefore its position should be removed from rps_set - but
current logic does not do that.

Because of that bug, hint files with many discardable hints might cause
rps_set to grow large when the file is replayed. Furthermore, leaving
positions of such hints in rps_set might cause more hints than necessary
to be re-sent if some non-discarded hints fail to be sent.

This commit fixes the problem by removing positions of discarded hints
from rps_set.

Fixes #6433

(cherry picked from commit 0c5ac0da98)
2020-05-20 08:03:20 +03:00
Avi Kivity
8a026b8b14 Revert "compaction_manager: allow early aborts through abort sources."
This reverts commit e8213fb5c3. It results
in an assertion failure in remove_index_file_test.

Fixes #6413.

(cherry picked from commit 5b971397aa)
2020-05-13 18:26:34 +03:00
Yaron Kaikov
0760107b9f release: prepare for 4.1.rc0 2020-05-11 11:32:01 +03:00
Nadav Har'El
7da949026d doc, alternator: shorten description of "tags" compatibility
The "current compatibility with DynamoDB" section in alternator.md is where
we should list very briefly our state of compatibility - it's not the right
place to explain implementation details or track obscure bugs. I've
significantly shortened the "Tags" section because, in brief, we do
fully support tags and should say that we do.

I moved the two bugs mentioned in the text into the bug tracker:
Refs #6389
Refs #6391

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200507125022.22608-1-nyh@scylladb.com>
2020-05-07 17:48:34 +02:00
Tomasz Grabiec
2078016f84 test: memory_footprint: Avoid invalid identifiers as columnnames
Column name should not start with a digit, as can be the case with
random_string().

Message-Id: <1588860648-15796-1-git-send-email-tgrabiec@scylladb.com>
2020-05-07 17:33:34 +03:00
Pavel Emelyanov
ef181fb2d0 test: Add option to flush memtables for perf_simple_query
The test in question measures the speed of memtables, not
the row_cache. With this option it can do both.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20200507140603.12350-1-xemul@scylladb.com>
2020-05-07 16:09:40 +02:00
Ivan Prisyazhnyy
84e25e8ba4 api: support table auto compaction control
The patch implements:

- /storage_service/auto_compaction API endpoint
- /column_family/autocompaction/{name} API endpoint

Those APIs allow to control and request the status of background
compaction jobs for the existing tables.

The implementation introduces the table::_compaction_disabled_by_user.
Then the CompactionManager checks if it can push the background
compaction job for the corresponding table.

New members
===

    table::enable_auto_compaction();
    table::disable_auto_compaction();
    bool table::is_auto_compaction_disabled_by_user() const

Test
===
Tests: unit(sstable_datafile_test autocompaction_control_test), manual

    $ ninja build/dev/test/boost/sstable_datafile_test
    $ ./build/dev/test/boost/sstable_datafile_test --run_test=autocompaction_control_test -- -c1 -m2G --overprovisioned --unsafe-bypass-fsync 1 --blocked-reactor-notify-ms 2000000

The test tries to submit a compaction job after playing
with autocompaction control table switch. However, there is
no reliable way to hook pending compaction task. The code
assumed that with_scheduling_group() closure will never
preempt execution of the stats check.

Revert
===
Reverts commit c8247ac. In previous version the execution
sometimes resulted into the following error:

    test/boost/sstable_datafile_test.cc(1076): fatal error: in "autocompaction_control_test":
    critical check cm->get_stats().pending_tasks == 1 || cm->get_stats().active_tasks == 1 has failed

This version adds a few sstables to the cf, starts
the compaction and awaits until it is finished.

API change
===

- `/column_family/autocompaction/` always returned `true` while answering to the question: if the autocompaction disabled (see https://github.com/scylladb/scylla-jmx/blob/master/src/main/java/org/apache/cassandra/db/ColumnFamilyStore.java#L321). now it answers to the question: if the autocompaction for specific table is enabled. The question logic is inverted. The patch to the JMX is required. However, the change is decent because all old values were invalid (it always reported all compactions are disabled).
- `/column_family/autocompaction/` got support for POST/DELETE per table

Fixes
===

Fixes #1488
Fixes #1808
Fixes #440

Signed-off-by: Ivan Prisyazhnyy <ivan@scylladb.com>
Reviewed-by: Glauber Costa <glauber@scylladb.com>
2020-05-07 16:23:38 +03:00
Nadav Har'El
e9aa1173e0 doc, alternator: better documentation for write isolation policies
Alternator supports four different write isolation policies, the default
being to do all the writes with LWT, but these policies were only briefly
explained in alternator.md.

This patch significantly expands on this explanation, better explaining
the tradeoffs involved in these four options, and when each might make
sense (if at all).

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200506235152.18190-1-nyh@scylladb.com>
2020-05-07 13:59:38 +02:00
Nadav Har'El
f12989ff73 alternator/test: minor cleanup in test_key_condition_expression.py
Some minor cleanups, mostly comments, in test_key_condition_expression.py

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200506212849.16207-1-nyh@scylladb.com>
2020-05-07 13:58:44 +02:00
Botond Dénes
791acc7f38 sstables: sstable_reader: fix read range upper bound calculation for reverse slices
The single-key sstable reader uses the clustering ranges from the slice
to determine the upper bound of the disk read-range using the index.
For this it simply uses the end bound of the last clustering range. For
reverse reads however the clustering ranges in the slice are in reverse
order, so this will in fact be the upper bound of the smallest range.
Depending on whether the distance between the clustering ranges is big
enough for the sstable reader to use the index to skip between them,
this will lead to either reading too little data or an assert failure.

This patch fixes the problematic function `get_slice_upper_bound()` to
consider reverse reads as well.

Initially I thought there will be more mishandling of reverse slices,
but actually `mutation_fragment_filter`, the component doing the actual
slicing of rows, is already reverse-slice aware.

A unit test which reproduces the assert failure is also added.

Fixes: #6171

Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200507114956.271799-1-bdenes@scylladb.com>
2020-05-07 14:52:04 +03:00
Avi Kivity
bef8e5e930 Merge "Don't invalidate row cache when adding GC SStable to SSTable Set" from Raphael
"
Garbage collected SSTables, created by incremental compaction process,
are being added to the SSTable set using a function that invalidates
row cache using the range of the SSTable itself. That's incorrect
because data in GC SSTables come from preexisting SSTables in set,
meaning the state of data isn't changed and so no need for
invalidation at all. Incorrect invalidation like this is a source of
read performance issues. This problem is fixed by including GC
SSTables to the descriptor which is used to specify changes to the
SSTable set, which is the correct thing to do given that a midway
failure could leave the set in an incorrect state.

Fixes #5956.
Fixes #6275.

tests: unit(dev)
"

* 'fix_issue_5956_v4' of github.com:raphaelsc/scylla:
  sstables/compaction: Don't invalidate row cache when adding GC SSTable to SSTable set
  sstables/compaction: Change meaning of compaction_completion_desc input and output fields
  sstables/compaction: Clean up code around garbage_collected_sstable_writer
2020-05-07 14:10:49 +03:00
Glauber Costa
e8213fb5c3 compaction_manager: allow early aborts through abort sources.
The shutdown process of compaction manager starts with an explicit call
from the database object. However, that can only happen once everything is
already initialized. This works well today, but I am soon to change
the resharding process to operate before the node is fully ready.

One can still stop the database in this case, but reshardings will
have to finish before the abort signal is processed.

This patch passes the existing abort source to the construction of the
compaction_manager and subscribes to it. If the abort source is
triggered, the compaction manager will react to it firing and all
compactions it manages will be stopped.

We still want the database object to be able to wait for the compaction
manager, since the database is the object that owns the lifetime of
the compaction manager. To make that possible we'll use a future
that is returned from stop(): no matter what triggered the abort, either
an early abort during initial resharding or a database-level event like
drain, everything will shut down in the right order.

The abort source is passed to the database, which is responsible for
constructing the compaction manager.

Tests: unit (dev), manual start+stop, manual drain + stop

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20200506184749.98288-1-glauber@scylladb.com>
2020-05-07 13:24:47 +03:00
Asias He
71d0d58f8c Revert "config: Do not enable repair based node operations by default"
This reverts commit b8ac10c451.

The repair based node operations will be enabled by default in 4.1.
Revert the patch which disables it by default.
2020-05-07 13:17:35 +03:00
Avi Kivity
fbf2194b31 Merge 'cql3: Fix detection of bound variables in tuples' from Juliusz
This is unrelated to counters, but happens to fix #4209

`tuple::delayed_value::contains_bind_marker` used to check that
ALL terms are bound (not that ANY of them is bound). As a result,
scylla would crash in prepare codepath for collections of tuples.
After this fix `invalid_request_exception` is thrown instead.

* jul-stas-4209-crash-on-counter-shards-set:
  boost/tests: test for bound variable in a list of tuple literals
  cql3: fix detection of bound variables in tuples
2020-05-07 13:13:51 +03:00
Botond Dénes
2e09a0317c types, compound: pass std::current_exception() to on_internal_error()
So that nested exceptions are not lost. Also, marshal exceptions, the
ones we have in these places, already have a backtrace, so might as well
use that, instead of creating a new one, losing unwound frames.

Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200507091405.244544-1-bdenes@scylladb.com>
2020-05-07 11:25:25 +02:00
Juliusz Stasiewicz
7b48d8c33c boost/tests: test for bound variable in a list of tuple literals
This test checks that the list literals of tuples with some (but
not all!) bind markers are rejected.
2020-05-07 11:03:53 +02:00
Pavel Solodovnikov
55d89d2cbe lwt: add cql tests to test delete+insert behavior on the same row in one batch
Add a couple of cql tests regarding conditional batches:

 1. Verify that "delete" takes priority over "insert"
    when applied to the same row within the same batch.
 2. Test that a workaround for the issue works as expected (i.e.
    delete only individual cells instead of the full record).

Tests: unit(dev)
Fixes: #6273

Signed-off-by: Pavel Solodovnikov <pa.solodovnikov@scylladb.com>
Message-Id: <20200506201200.176590-1-pa.solodovnikov@scylladb.com>
2020-05-07 10:53:22 +02:00
Tomasz Grabiec
b0f2d2bee0 Merge "lwt: fix linearisability issues with reads and writes with non met conditions" form Gleb
Fixes #6299.
2020-05-07 10:49:01 +02:00
Juliusz Stasiewicz
b46d7cf8d1 cql3: fix detection of bound variables in tuples
`tuple::delayed_value::contains_bind_marker` used to check that
ALL terms are bound (not that ANY of them is bound). As a result,
scylla would crash in prepare codepath for collections. After this
fix `invalid_request_exception` is thrown instead.

Fixes #4209
2020-05-07 10:44:52 +02:00
Benny Halevy
b2f50224d9 table: database_sstable_write_monitor: revert charges in destructor
We must unregister the monitor upon destruction to prevent use-after-free
from `compaction_backlog_tracker::backlog` path.

This is similar to ~compaction_read_monitor as implemented
in commit ca284174d0

Fixes #6385

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20200506214419.569655-1-bhalevy@scylladb.com>
2020-05-07 10:39:39 +02:00
Nadav Har'El
0214f0ad60 main: really enable the "--start-native-transport" option
In commit da3bf20e71 we supposedly enabled
support for Cassandra's "start_native_transport" option which can be set to
0 to run Scylla without listening on the CQL port. This can be useful, for
example, if a user only wants the DynamoDB or Redis APIs but not CQL.

Unfortunately, the option was still marked "Unused", so it wasn't really
enabled as a valid command line option. This patch fixes that, and
documents the start_native_transport option in docs/protocols.md, where
we document the different protocols, ports, and options to configure them.

Fixes #6387.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200506174850.13616-1-nyh@scylladb.com>
2020-05-07 11:09:18 +03:00
Avi Kivity
2b0c317dec test: lib: exception_utils: fix crash with fmt-6.2.0
fmt, the formatting library we use, detects types with conversion
to std::string_view (and formats them as strings) and types that
support operator<<(std::ostream, const T&) (and performs custom
formatting on them). However, if <fmt/ostream.h> is not included, the latter is
not done.

The problem happens with seastar::sstring, which implements both,
and debug mode, which disables inlining. Some translation units
do include <fmt/ostream.h>, and so generate code to do custom
formatting. exception_utils.cc doesn't, and so generates code
to format via string_view conversion. At link time, the
compiler picks one of the generated functions and includes it
in the final binary; it happened to pick one generated outside
exception_utils.cc, using custom formatting.

However, there is also code in fmt to encode which path fmt
chose - string_view or custom. This code is constexpr and so
is evaluated in exception_utils.cc. The result is that the
function to perform formatting of seastar::sstring uses custom
formatting, while the descriptor containing the method used
says it is formatting via string_view. This is enough to cause
a crash.

The problem is limited to debug mode, since in other modes
all this code is inlined, and so is consistent within the
translation unit.

We need a more general fix (hopefully in fmt), but for now a
simple fix is to add the missing include.

Ref https://github.com/fmtlib/fmt/issues/1662
2020-05-07 08:59:02 +03:00
Avi Kivity
6f1a8cfeea Merge 'Use special partitioner for CDC Log' from Piotr
"
CDC has to create CDC streams that are co-located with corresponding BaseTable data. This is not always easy. Especially for small vnodes. This PR introduces new partitioner which allows us to easily find such stream ids that the stream belongs to a given vnode and shard.

The idea is that a partitioner accepts only keys that are a blob composed of two int64 numbers. The first number is the token of the key.

Tests: unit(dev), dtests(CDC)
"

* haaawk-cdc_partitioner:
  cdc:use CDCPartitioner for CDC Log
  dht: Add find_first_token_for_shard
  dht: use long_token in token::to_int64
  cdc: add CDCPartitioner
  stream_id: add token_from_bytes static function
  i_partitioner: Stop distinguishing whether keys order is preserved
2020-05-06 20:29:27 +03:00
Pavel Solodovnikov
1d3f9174c5 cql3: avoid using shared_ptr's in unrecognized_entity_exception
Using shared_ptr's in `unrecognized_entity_exception` can lead
to cross-cpu deletion of a pointer which will trigger an assert
`_cpu == std::this_thread::get_id()' when shared_ptr is disposed.

Copy `column_identifier` to the exception object and avoid using
an instance of `cql3::relation`: just get a string representation
from it since nothing more is used in associated exception
handling code.

Fixes: #6287
Tests: unit(dev, debug), dtest(lwt_destructive_ddl_test.py:LwtDestructiveDDLTest.test_rename_column)

Signed-off-by: Pavel Solodovnikov <pa.solodovnikov@scylladb.com>
Message-Id: <20200506155714.150497-1-pa.solodovnikov@scylladb.com>
2020-05-06 19:02:36 +03:00
Piotr Sarna
f48e414eab db, view: remove duplicate entries from pending endpoints
When generating view updates, an endpoint can appear both
as a primary paired endpoint for the view update, and as a pending
endpoint (due to range movements). In order not to generate
the same update twice for the same endpoint, the paired endpoint
is removed from the list of pending endpoints if present.

Fixes #5459
Tests: unit(dev),
       dtest(TestMaterializedViews.add_dc_during_mv_insert_test)
2020-05-06 16:42:56 +03:00
Benny Halevy
682fb3acfd api: storage_service: serialize true_snapshot_size
Following up on 91b71a0b1a
We also need to serialize storage_service::true_snapshots_size
with snapshot-modifying operations.

It seems like it was assumed that get_snapshot_details
is done under run_snapshot_list_operation, but the one called
here is the table method, not the api::storage_service::get_snapshot_details.

Fixes #5603

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20200506115732.483966-1-bhalevy@scylladb.com>
2020-05-06 15:33:38 +03:00
Pavel Solodovnikov
b183530f2c cql3: use lw_shared_ptr instead of shared_ptr for column_condition
Both `cql3::column_condition` and `cql3::column_condition::raw`
classes are marked as `final`: it's safe to use lw_shared_ptr
instead of generic `seastar::shared_ptr`.

Tests: unit(dev, debug)

Signed-off-by: Pavel Solodovnikov <pa.solodovnikov@scylladb.com>
Message-Id: <20200428202249.82785-1-pa.solodovnikov@scylladb.com>
2020-05-06 13:11:07 +03:00
Nadav Har'El
ddb483461a test/alternator: xfailing tests for FilterExpression feature
This patch adds a comprehensive, hopefully complete, test for the
yet-unimplemented FilterExpression feature. FilterExpression is the
modern syntax which allows filtering the results of Query and Scan requests.
The patch includes 50 tests spanning more than 700 lines of code,
testing (hopefully) all the various FilterExpression features,
sub-cases, syntax peculiarities, and so on.

As usual, all included tests pass when run against DynamoDB
("pytest --aws") and xfail when run against Scylla.

This test should be helpful to understand how to implement
FilterExpression correctly, as well as test the future implementation.

Refs #5038.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200503165639.15320-1-nyh@scylladb.com>
2020-05-06 12:56:20 +03:00
Botond Dénes
6de51db84a tools: introduce scylla_types
We often have to examine raw values, obtained from various sources, like
sstables, logs and coredumps. For some types it is quite simple to
convert raw hex values to human readable ones manually (integers), for
others it is very hard or simply not practical. This command-line tool
aims to ease working with raw values, by providing facilities to print
them in human readable form and compare them. We can extend it with more
functions as needed.

Examples:
$ scylla_types -a print -t Int32Type b34b62d4
-1286905132

$ scylla_types -a compare -t 'ReversedType(TimeUUIDType)' b34b62d46a8d11ea0000005000237906 d00819896f6b11ea00000000001c571b
b34b62d4-6a8d-11ea-0000-005000237906 > d0081989-6f6b-11ea-0000-0000001c571b

Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200505124914.104827-1-bdenes@scylladb.com>
2020-05-06 12:56:20 +03:00
Avi Kivity
bf2ab10b6a Update seastar submodule
* seastar 3c2e27811...e708d1df3 (10):
  > Merge "Fix a few issues found by clang's asan" from Rafael
  > seastar: app_template: allow a description to be provided for the app
  > membarrier: fix madvise(MADV_DONTNEED) failure and crash with --lock-memory
Fixes #6346
  > rpc::compressor: Fix static init fiasco with names
  > fair_queue: express all internal fair_queue quantities as fair_queue_tickets
  > net: remove API v1 compatibility layer (variadic future in networking)
  > testing: Move parts of the exchanger out of line
  > on_internal_error: add overload taking an std::exception_ptr
  > tuple_utils: Add a missing include
  > Merge "Fix use of uninitialized found by valgrind" from Rafael
2020-05-06 12:56:20 +03:00
Raphael S. Carvalho
a214ccdf89 sstables/compaction: Don't invalidate row cache when adding GC SSTable to SSTable set
Garbage collected SSTable is incorrectly added to SSTable set with a function
that invalidates row cache. This problem is fixed by adding GC SStable
to set using mechanism which replaces old sstables with new sstables.

Also, adding GC SSTable to set in a separate call is not correct.
We should make sure that GC SSTable reaches the SSTable set at the same time
its respective old (input) SSTable is removed from the set, and that's done
using a single request call to table.

Fixes #5956.
Fixes #6275.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2020-05-05 12:03:19 -03:00
Raphael S. Carvalho
8f4458f1d5 sstables/compaction: Change meaning of compaction_completion_desc input and output fields
input_sstables is renamed to old_sstables and is about old SSTables that should be
deleted and removed from the SSTable set.
output_sstables is renamed to new_sstables and is about new SSTable that should be
added to the SSTable set, replacing the old ones.

This will allow us, for example, to add auxiliary SSTables to SSTable set using
the same call which replaces output SSTables by input SSTables in compaction.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2020-05-05 12:03:08 -03:00
Raphael S. Carvalho
cc5e0d8da8 sstables/compaction: Clean up code around garbage_collected_sstable_writer
This cleanup allows us to get rid of the ugly compaction::create_new_sstable(),
and reduce complexity by getting rid of observable.

garbage_collected_sstable_writer::data is introduced to allow compaction to
directly communicate with the GC writer, which is stored in mutation_compaction,
making it unreachable after the compaction has started. By making compaction
store GC writer's data and using that same data to create g__c__s__w,
compaction is able to communicate with GC writer without the complexity of
observable utility. This move is important for the subsequent work which
will fix a couple of issues regarding management of GC SSTables.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2020-05-05 12:02:41 -03:00
Piotr Sarna
b8df958811 alternator: deduplicate logs on boot
Alternator server used to print a startup log line for each shard,
which is redundant and creates churn for nodes with many cores.
Instead of all that, a single line is now printed once alternator
server properly boots.

Fixes #6347
Tests: manual(boot), unit(dev)
2020-05-05 16:19:18 +03:00
Gleb Natapov
4622c61a37 lwt: linearise reads
Currently the following scenario may happen:

Consider 3 nodes A, B and C and a LWT failed write operation that
managed to get V accepted on A. The value is read twice. The first read
accesses B and C and returns nothing. The next one accesses A and B, notices
the failed round, completes it, and returns value V. Since two consecutive
reads without any writes in the middle return different values, this
breaks linearisability.

This happens because a read does not do a full paxos round. The patch
makes the read code reuse the same logic as a write by writing a dummy
value, which ensures that a complete paxos round is used.
2020-05-05 15:37:42 +03:00
Gleb Natapov
0c2db6f42d lwt: linearise unmet condition operations
Currently the following scenario may happen:

Consider 3 nodes A, B and C and a LWT failed write operation that
managed to get V accepted on A. The next operation may be conditioned on the
value being V, but it may access nodes B and C first and fail. Retrying
the same operation without any writes in the middle may now access A
and B and succeed, since it will notice V and will complete the previous
transaction. Having two different outcomes for the same operation without
any writes in the middle breaks linearisability.

This happens because when the condition is unmet we abandon the paxos round,
so this patch makes us complete it with an empty value. Now if the first
conditional write after the failure accesses B and C, it will write an accepted
ballot there with a value greater than the one of V, and V will never be
replayed again.
2020-05-05 12:38:31 +03:00
Gleb Natapov
0fed86e4c6 lwt: change cas_request::apply signature
Change the way query result is passed from getting a reference to a
result to getting a foreign_ptr<lw_shared_ptr<query::result>>. This will
allow cas_request to keep it without copying.
2020-05-05 12:38:23 +03:00
Benny Halevy
580d397d2e test: database_test: do_with_some_data: retain tmpdir for test duration
Currently, the test seems to use the tmpdir class in a wrong way,
just to get a path to a temporary directory.

It should keep the tmpdir object around for the duration of the test
so the temporary directory will be automatically removed when the test
completes.

Refs #6344

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20200504153810.202218-1-bhalevy@scylladb.com>
2020-05-05 11:37:18 +03:00
Piotr Sarna
1c4e8f5030 alternator: fix checking max item depth
Maximum item depth accepted by DynamoDB is 32, and alternator
chose 39 as its arbitrary value in order to provide 7 shining
new levels absolutely free of charge. Unfortunately, our code
which checks the nesting level in rapidjson parsing bumps
the counter by 2 for every object, which is due to rapidjson's
internal implementation. In order to actually support
at least 32 levels, the threshold is simply doubled.
This commit comes with a test case which ensures that
32-nested items are accepted both by alternator and DynamoDB.
The test case failed for alternator before the fix.

Fixes #6366
Tests: unit(dev), alternator(local, remote)
2020-05-04 23:46:20 +03:00
Glauber Costa
c5cdd77f8e gossip_test: start the compaction manager explicitly
Right now the compaction_manager needs to be started explicitly.
We may change it in the future, but right now that's how it is.

Everything works now even without it, because compaction_manager::stop
happens to work even if it was not started. But it is technically
illegal.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20200504143048.17201-1-glauber@scylladb.com>
2020-05-04 17:40:32 +03:00
Bentsi Magidovich
e77dad3adf scylla_coredump_setup: Fix incorrect coredump directory mount
The issue is that the mount is /var/lib/scylla/coredump ->
/var/lib/systemd/coredump. But we need to do the opposite in order to
save the coredump on the partition that Scylla is using:
/var/lib/systemd/coredump-> /var/lib/scylla/coredump

Fixes #6301
2020-05-04 15:47:45 +03:00
Avi Kivity
f3bcd4d205 Merge 'Support SSL Certificate Hot Reloading' from Calle
"
Fixes #6067

Makes the scylla endpoint initializations that support TLS use reloadable certificate stores, watching used cert + key files for changes, and reload iff modified.

Tests in separate dtest set.
"

* elcallio-calle/reloadable-tls:
  transport: Use reloadable tls certificates
  redis: Use reloadable tls certificates
  alternator: Use reloadable tls certificates
  messaging_service: Use reloadable TLS certificates
2020-05-04 15:11:16 +03:00
Piotr Sarna
bec95a0605 treewide: use thread-safe variant of localtime
In order to ensure thread-safety, all usages of localtime()
are replaced with localtime_r(), which may accept a local
buffer.

Tests: unit(dev)
Fixes #6364
Message-Id: <ad4a0c0e1707f0318325718715a3a647e3ebfdfe.1588592156.git.sarna@scylladb.com>
2020-05-04 14:46:08 +03:00
Calle Wilund
70aca26a3e transport: Use reloadable tls certificates 2020-05-04 11:32:21 +00:00
Calle Wilund
bacf2fa981 redis: Use reloadable tls certificates 2020-05-04 11:32:21 +00:00
Calle Wilund
cc9bb6454c alternator: Use reloadable tls certificates 2020-05-04 11:32:21 +00:00
Calle Wilund
08d069f78d messaging_service: Use reloadable TLS certificates
Changes messaging service rpc to use reloadable tls
certificates iff tls is enabled.

Note that this means that the service cannot start
listening at construction time if TLS is active,
and the user needs to call start_listen_ex to initialize
and actually start the service.

Since "normal" messaging service is actually started
from gms, this route too is made a continuation.
2020-05-04 11:32:21 +00:00
Piotr Sarna
fb7fa7f442 alternator: fix signature timestamps
Generating timestamps for auth signatures used a non-thread-safe
::gmtime function instead of thread-safe ::gmtime_r.

Tests: unit(dev)
Fixes #6345
2020-05-04 14:12:11 +03:00
Piotr Sarna
05ec95134a clocks-impl: switch to thread-safe time conversion
std::gmtime() has a sad property of using a global static buffer
for returning its value. This is not thread-safe, so its usage
is replaced with gmtime_r, which can accept a local buffer.
While no regressions were observed in this particular area of code,
a similar bug caused failures in alternator, so it's better to simply
replace all std::gmtime calls with their thread-safe counterpart.

Message-Id: <39e91c74de95f8313e6bb0b12114bf12c0e79519.1588589151.git.sarna@scylladb.com>
2020-05-04 14:11:38 +03:00
Takuya ASADA
57f3f82ed1 redis: add EX option for set command
Add EX option for SET command, to set TTL for the key.
The behavior of SET EX is the same as the SETEX command; it just uses a different syntax.

see: https://redis.io/commands/set
2020-05-04 13:58:18 +03:00
Eliran Sinvani
a346e862c1 Auth: return correct error code when role is not found
Scylla returns the wrong error code (0000 - server internal error)
in response to trying to do authentication/authorization operations
that involves a non-existing role.
This commit changes those cases to return error code 2200 (invalid
query) which is the correct one and also the one that Cassandra
returns.
Tests:
    Unit tests (Dev)
    All auth and auth_role dtests
2020-05-04 12:57:27 +03:00
Glauber Costa
55f5ca39a9 sstable_test: rework test to use a thread
The compaction_manager test lives inside a thread and it is not taking
advantage of it, with continuations all over.

One of the side effects of it is that the test is calling stop() twice
on the compaction_manager.  While this works today, it is not good
practice. A change I am making is just about to break it.

This patch converts the test to fully use .get() instead of chained
continuations and in doing so also guarantees that the compaction
manager will be RAII-stopped just once, from a defer object.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20200503161420.8346-2-glauber@scylladb.com>
2020-05-03 19:54:04 +03:00
Piotr Sarna
bf5f247bc5 db: set gc grace period to 0 for local system tables
Local system tables from `system` namespace use LocalStrategy
replication, so they do not need to be concerned about gc grace
period. Some system tables already set gc grace period to 0,
but other ones, including system.large_partitions, did not.
That may result in millions of tombstones being needlessly
kept for these tables, which can cause read timeouts.

Fixes #6325
Tests: unit(dev), local(running cqlsh and playing with system tables)
2020-05-03 17:41:50 +03:00
Avi Kivity
9952cdfec1 Merge "scylla-gdb.py: improve finding references to intrusive container elements" from Botond
"
Intrusive containers often have references between container elements
that point to some non-first word of the element. These references
currently fly below the radar of `scylla find` and `scylla
generate-object-graph`, as they are looking for references to only the
first word of the objects. So objects that are members of an intrusive
container often appear to have no inbound references at all.

This patch-set improves support for finding such references by looking
for references to non-first words of objects.

It also includes some generic, minor improvements to scylla
generate_object_graph.
"

* 'scylla-gdb.py-scylla-generate-object-graph-linked-lists/v1' of https://github.com/denesb/scylla:
  scylla-gdb.py: scylla generate_object_graph: make label of initial vertice bold
  scylla-gdb.py: scylla generate_object_graph: remove redundant lookup
  scylla-gdb.py: scylla generate_object_graph: print "to" offsets
  scylla-gdb.py: scylla generate-object-graph: use value-range to find references
  scylla-gdb.py: scylla find: allow finding ranges of values
  scylla-gdb.py: find_in_live(): return pointer_metadata instances
2020-05-03 16:22:22 +03:00
Glauber Costa
70e5252a5d table: no longer accept online loading of SSTable files in the main directory
Loading SSTables from the main directory is possible, to be compatible with
Cassandra, but extremely dangerous and not recommended.

From the beginning, we recommend using a separate, upload/ directory.
In all this time, perhaps due to how the feature's usefulness is reduced
in Cassandra due to the possible races, I have never seen anyone coming
from Cassandra doing procedures involving refresh at all.

Loading SSTables from the main directory forces us to disable writes to
the table temporarily until the SSTables are sorted out. If we get rid of
this, we can get rid of the disabling of the writes as well.

We can't do it now because if we want to be nice to the odd user that may
be using refresh through the main directory without our knowledge we should
at least error out.

This patch, then, does that: it errors out if SSTables are found in the main
directory. It will not proceed with the refresh, and direct the user to the
upload directory.

The main loop in reshuffle_sstables is left in place structurally for now, but
most of it is gone. The test for it is deleted.

After a period of deprecation we can start ignoring these SSTables and get rid
of the lock.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20200429144511.13681-1-glauber@scylladb.com>
2020-05-03 08:40:38 +03:00
Glauber Costa
e44b2826ab compaction: avoid abandoned futures when using interposers
When using interposers, cancelling compactions can leave futures
that are not waited for (resharding, twcs)

The reason is when consume_end_of_stream gets called, it tries to
push end_of_stream into the queue_reader_handle. Because cancelling
a compaction is done through an exception, the queue_reader_handle
is terminated already at this time. Trying to push to it generates
another exception and prevents us from returning the future right
below it.

This patch adds a new method is_terminated() and if we detect
that the queue_reader_handle is already terminated by this point,
we don't try to push. We call it is_terminated() because the check
is to see if the queue_reader_handle has a _reader. The reader is
also set to null on successful destruction.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Reviewed-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200430175839.8292-1-glauber@scylladb.com>
2020-05-01 16:30:23 +03:00
Avi Kivity
122f57871d Update seastar submodule
* seastar 0523b0fac...3c2e27811 (2):
  > future: Add a futurizer::satisfy_with_result_of
  > future: Move concept definitions earlier
2020-05-01 12:55:48 +03:00
Tomasz Grabiec
d78fbf7c16 Merge "storage_service: Make replacing node take writes" from Asias
Background:

Replace operation is used to replace a dead node in the cluster.
Currently during replace operation, the replacing node does not take any
writes. As a result, new writes to a range after the sync for that range
is done, e.g., after streaming for that range is finished, will not be
synced to the replacing node. Hinted hand off or repair after the
replacing operation will help. But it is better if we can make the
writes to the replacing node to avoid any post replacing operation
actions.

After this series and repair based node operation series, the replace
operation will guarantee the replacing node has all the latest copy of
data including the new writes during the replace operation. In short, no
more repairs before or after the replacing operation. Just replacing the
node is enough.

Implementation:

Filter the node being replaced out of the natural endpoints in
storage_proxy, so that:
The node being replaced will not be selected as the target for
normal write or normal read.

Do not depend on the gossip liveness to avoid selecting replacing node
for normal write or normal read when the replacing node has the same
ip address as the node being replaced. No more special handling for
hibernate state in gossip, which makes it simpler and more robust.
Replacing node will be marked as UP.

Put the replacing node in the pending list, so that:
Replacing node will take writes but write to replacing will not be
counted as CL.

Replacing node will not take normal read.

Example:

For example, with RF = 3, n1, n2, n3 in the cluster, n3 is dead and
being replaced by node n4. When n4 starts:

writes to nodes {n1, n2, n3} are changed to
normal_replica_writes = {n1, n2} and pending_replica_writes= {n4}.

reads to nodes {n1, n2, n3} are changed to
normal_replica_reads = {n1, n2} only.

This way, the replacing node n4 now takes writes but does not take reads.

Tests:

Measure the number of writes during the pending period, that is, between
the time the replacing node starts and finishes the replace operation.
Start 5 nodes, n1 to n5.
Stop n5
Start write in the background
Start n6 to replace n5
Get scylla_database_total_writes metrics when the replacing node announces HIBERNATE (replacing) and NORMAL status.
Before:
2020-02-06 08:35:35.921837 Get metrics when other knows replacing node = HIBERNATE
2020-02-06 08:35:35.939493 scylla_database_total_writes: node1={'scylla_database_total_writes': 15483}
2020-02-06 08:35:35.950614 scylla_database_total_writes: node2={'scylla_database_total_writes': 15857}
2020-02-06 08:35:35.961820 scylla_database_total_writes: node3={'scylla_database_total_writes': 16195}
2020-02-06 08:35:35.978427 scylla_database_total_writes: node4={'scylla_database_total_writes': 15764}
2020-02-06 08:35:35.992580 scylla_database_total_writes: node6={'scylla_database_total_writes': 331}
2020-02-06 08:36:49.794790 Get metrics when other knows replacing node = NORMAL
2020-02-06 08:36:49.809189 scylla_database_total_writes: node1={'scylla_database_total_writes': 267088}
2020-02-06 08:36:49.823302 scylla_database_total_writes: node2={'scylla_database_total_writes': 272352}
2020-02-06 08:36:49.837228 scylla_database_total_writes: node3={'scylla_database_total_writes': 274004}
2020-02-06 08:36:49.851104 scylla_database_total_writes: node4={'scylla_database_total_writes': 262972}
2020-02-06 08:36:49.862504 scylla_database_total_writes: node6={'scylla_database_total_writes': 513}

Writes = 513 - 331

After:
2020-02-06 08:28:56.548047 Get metrics when other knows replacing node = HIBERNATE
2020-02-06 08:28:56.560813 scylla_database_total_writes: node1={'scylla_database_total_writes': 290886}
2020-02-06 08:28:56.573925 scylla_database_total_writes: node2={'scylla_database_total_writes': 310304}
2020-02-06 08:28:56.586305 scylla_database_total_writes: node3={'scylla_database_total_writes': 304049}
2020-02-06 08:28:56.601464 scylla_database_total_writes: node4={'scylla_database_total_writes': 303770}
2020-02-06 08:28:56.615066 scylla_database_total_writes: node6={'scylla_database_total_writes': 604}
2020-02-06 08:29:10.537016 Get metrics when other knows replacing node = NORMAL
2020-02-06 08:29:10.553257 scylla_database_total_writes: node1={'scylla_database_total_writes': 336126}
2020-02-06 08:29:10.567181 scylla_database_total_writes: node2={'scylla_database_total_writes': 358549}
2020-02-06 08:29:10.581939 scylla_database_total_writes: node3={'scylla_database_total_writes': 351416}
2020-02-06 08:29:10.595567 scylla_database_total_writes: node4={'scylla_database_total_writes': 350580}
2020-02-06 08:29:10.610548 scylla_database_total_writes: node6={'scylla_database_total_writes': 45460}

Writes = 45460 - 604

As we can see, the replacing node did not take writes before the patch and takes writes after the patch.

Check log of writer handler in storage_proxy
storage_proxy - creating write handler for token: -2642068240672386521,
keyspace_name=ks, original_natrual={127.0.0.1, 127.0.0.5, 127.0.0.2},
natural={127.0.0.1, 127.0.0.2}, pending={127.0.0.6}

The node being replaced, n5=127.0.0.5, is filtered out and the replacing
node, n6=127.0.0.6 is in the pending list.

* asias/replace_take_writes:
  storage_service: Make replacing node take writes
  repair: Use token_metadata with the replacing node in do_rebuild_replace_with_repair
  abstract_replication_strategy: Add get_ranges which takes token_metadata
  abstract_replication_strategy: Add get_natural_endpoints_without_node_being_replaced
  abstract_replication_strategy: Add allow_remove_node_being_replaced_from_natural_endpoints
  token_metadata: Calculate pending ranges for replacing node
  storage_service: Unify handling of replaced node removal from gossip
  storage_service: Update tokens and replace address for replace operation
2020-04-30 19:28:35 +02:00
Pavel Emelyanov
513ce1e6a5 storage_proxy_stats: Make get_ep_stat() noexcept
The .get_ep_stat(ep) call can throw when registering metrics (we have
an issue for it, #5697). This is not expected by its callers, in particular
abstract_write_response_handler::timeout_cb breaks in the middle and
doesn't call the on_timeout() and the _proxy->remove_response_handler(),
which results in a response handler that is neither removed nor released.
In turn, the unreleased response handler doesn't set the _ready future on which
response_wait() waits -> stuck.

Although the issue with .get_ep_stat() should be fixed, an exception in
it mustn't lead to deadlocks, so the fix is to make the get_ep_stat()
noexcept by catching the exception and returning a dummy stat object
instead to let caller(s) finish.

Fixes #5985
Tests: unit(dev)

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20200430163639.5242-1-xemul@scylladb.com>
2020-04-30 19:40:08 +03:00
Avi Kivity
88224619b6 Update seastar submodule
* seastar d0cbf7d1e8...0523b0fac4 (1):
  > Merge "Fix issues found by valgrind" from Rafael
2020-04-30 19:20:37 +03:00
Asias He
b8ac10c451 config: Do not enable repair based node operations by default
Give it some more time to mature. Use the old stream plan based node
operations by default.

Fixes: #6305
Backports: 4.0
2020-04-30 12:37:24 +03:00
Avi Kivity
8925e00e96 Merge 'Fix hang in multishard_writer' from Asias
"
This series fixes a hang in multishard_writer when an error happens. It contains:
- multishard_writer: Abort the queue attached to consumers when producer fails
- repair: Fix hang when the writer is dead

Fixes #6241
Refs: #6248
"

* asias-stream_fix_multishard_writer_hang:
  repair: Fix hang when the writer is dead
  mutation_writer_test: Add test_multishard_writer_producer_aborts
  multishard_writer: Abort the queue attached to consumers when producer fails
2020-04-30 12:27:55 +03:00
Avi Kivity
280854ab46 Merge " Avoid use-after-free of sstable writer" from Rafael
"
The backlog_controller has a timer that periodically accesses the
sstable writers of ongoing writes.

This patch series makes sure we remove entries from the list of ongoing
writes before the corresponding sstable writer is destroyed.

Fixes #6221.
"

* 'espindola/fix-6221-v5' of https://github.com/espindola/scylla:
  sstables: Call revert_charges in compaction_write_monitor::write_failed
  sstables: Call monitor->write_failed earlier.
  sstables: Add write_failed to the write_monitor interface
2020-04-30 12:21:27 +03:00
Pekka Enberg
5c6265d14b Merge 'redis: add setex and ttl commands' from Takuya
"Enabling TTL feature, add setex and ttl commands to use it."

* 'redis_setex_ttl' of git://github.com/syuu1228/scylla:
  redis: add test for setex/ttl
  redis: add ttl command
  redis: add setex command
2020-04-30 09:39:48 +03:00
Pekka Enberg
d4c0d80f13 Merge 'redis: add lolwut test' from Takuya
"Add test for lolwut command, and also fix a bug on lolwut found by the test."

* 'redis_lolwut_test' of git://github.com/syuu1228/scylla:
  redis: lolwut parameter fix
  redis-test: add lolwut test
2020-04-30 09:30:43 +03:00
Piotr Sarna
c7c8bd0978 Update seastar submodule
* seastar 8fae03c2...d0cbf7d1 (6):
  > tests: restore compatibility with C++14 (broken due to std::filesystem)
  > http: make headers case-insensitive
  > on_internal_error: add scoped_no_abort_on_internal_error
  > Merge "make when_all functions noexcept" from Benny
  > chunked_fifo: fix underflow in reserve()
  > doc: document compatibility promises

Fixes #6319
2020-04-30 07:29:23 +02:00
Asias He
7d86a3b208 storage_service: Make replacing node take writes
Background:

Replace operation is used to replace a dead node in the cluster.
Currently during replace operation, the replacing node does not take any
writes. As a result, new writes to a range after the sync for that range
is done, e.g., after streaming for that range is finished, will not be
synced to the replacing node. Hinted hand off or repair after the
replacing operation will help. But it is better if we can make the
writes to the replacing node to avoid any post replacing operation
actions.

After this series and repair based node operation series, the replace
operation will guarantee the replacing node has all the latest copy of
data including the new writes during the replace operation. In short, no
more repairs before or after the replacing operation. Just replacing the
node is enough.

Implementation:

1) Filter the node being replaced out of the natural endpoints in
   storage_proxy, so that:

- The node being replaced will not be selected as the target for
  normal write or normal read.

- Do not depend on the gossip liveness to avoid selecting replacing node
  for normal write or normal read when the replacing node has the same
  ip address as the node being replaced. No more special handling for
  hibernate state in gossip, which makes it simpler and more robust.
  Replacing node will be marked as UP.

2) Put the replacing node in the pending list, so that:

- Replacing node will take writes but write to replacing will not be
  counted as CL.

- Replacing node will not take normal read.

Example:

For example, with RF = 3, n1, n2, n3 in the cluster, n3 is dead and
being replaced by node n4. When n4 starts:

- writes to nodes {n1, n2, n3} are changed to
  normal_replica_writes = {n1, n2} and pending_replica_writes= {n4}.

- reads to nodes {n1, n2, n3} are changed to
  normal_replica_reads = {n1, n2} only.

This way, the replacing node n4 now takes writes but does not take reads.

Tests:

1) Measure the number of writes during the pending period, that is, between
   the time the replacing node starts and finishes the replace operation.

- Start 5 nodes, n1 to n5.
- Stop n5
- Start write in the background
- Start n6 to replace n5
- Get scylla_database_total_writes metrics when the replacing node announces HIBERNATE (replacing) and NORMAL status.

Before:
2020-02-06 08:35:35.921837 Get metrics when other knows replacing node = HIBERNATE
2020-02-06 08:35:35.939493 scylla_database_total_writes: node1={'scylla_database_total_writes': 15483}
2020-02-06 08:35:35.950614 scylla_database_total_writes: node2={'scylla_database_total_writes': 15857}
2020-02-06 08:35:35.961820 scylla_database_total_writes: node3={'scylla_database_total_writes': 16195}
2020-02-06 08:35:35.978427 scylla_database_total_writes: node4={'scylla_database_total_writes': 15764}
2020-02-06 08:35:35.992580 scylla_database_total_writes: node6={'scylla_database_total_writes': 331}
2020-02-06 08:36:49.794790 Get metrics when other knows replacing node = NORMAL
2020-02-06 08:36:49.809189 scylla_database_total_writes: node1={'scylla_database_total_writes': 267088}
2020-02-06 08:36:49.823302 scylla_database_total_writes: node2={'scylla_database_total_writes': 272352}
2020-02-06 08:36:49.837228 scylla_database_total_writes: node3={'scylla_database_total_writes': 274004}
2020-02-06 08:36:49.851104 scylla_database_total_writes: node4={'scylla_database_total_writes': 262972}
2020-02-06 08:36:49.862504 scylla_database_total_writes: node6={'scylla_database_total_writes': 513}

Writes = 513 - 331

After:
2020-02-06 08:28:56.548047 Get metrics when other knows replacing node = HIBERNATE
2020-02-06 08:28:56.560813 scylla_database_total_writes: node1={'scylla_database_total_writes': 290886}
2020-02-06 08:28:56.573925 scylla_database_total_writes: node2={'scylla_database_total_writes': 310304}
2020-02-06 08:28:56.586305 scylla_database_total_writes: node3={'scylla_database_total_writes': 304049}
2020-02-06 08:28:56.601464 scylla_database_total_writes: node4={'scylla_database_total_writes': 303770}
2020-02-06 08:28:56.615066 scylla_database_total_writes: node6={'scylla_database_total_writes': 604}
2020-02-06 08:29:10.537016 Get metrics when other knows replacing node = NORMAL
2020-02-06 08:29:10.553257 scylla_database_total_writes: node1={'scylla_database_total_writes': 336126}
2020-02-06 08:29:10.567181 scylla_database_total_writes: node2={'scylla_database_total_writes': 358549}
2020-02-06 08:29:10.581939 scylla_database_total_writes: node3={'scylla_database_total_writes': 351416}
2020-02-06 08:29:10.595567 scylla_database_total_writes: node4={'scylla_database_total_writes': 350580}
2020-02-06 08:29:10.610548 scylla_database_total_writes: node6={'scylla_database_total_writes': 45460}

Writes = 45460 - 604

As we can see, the replacing node did not take writes before the patch and takes writes after the patch.

2) Check log of writer handler in storage_proxy

storage_proxy - creating write handler for token: -2642068240672386521,
keyspace_name=ks, original_natrual={127.0.0.1, 127.0.0.5, 127.0.0.2},
natural={127.0.0.1, 127.0.0.2}, pending={127.0.0.6}

The node being replaced, n5=127.0.0.5, is filtered out and the replacing
node, n6=127.0.0.6 is in the pending list.

Fixes: #5482
2020-04-30 10:22:30 +08:00
Asias He
e3fbc8fba1 repair: Use token_metadata with the replacing node in do_rebuild_replace_with_repair
We will change the update of tokens in token_metadata in the next patch
so that the tokens of the replacing node are updated to token_metadata
only after the replace operation is done. In order to get the correct
ranges for the replacing node in do_rebuild_replace_with_repair, we need
to use a copy of token_metadata that contains the tokens of the replacing
node.

Refs: #5482
2020-04-30 10:22:30 +08:00
Asias He
b640614aa6 abstract_replication_strategy: Add get_ranges which takes token_metadata
It is useful when the caller wants to calculate ranges using a
custom token_metadata.

It will be used soon in do_rebuild_replace_with_repair for replace
operation.

Refs: #5482
2020-04-30 10:22:30 +08:00
Asias He
37d3d3e051 abstract_replication_strategy: Add get_natural_endpoints_without_node_being_replaced
Similar to natural_endpoints but with the node being replaced filtered out.

Refs: #5482
2020-04-30 10:22:30 +08:00
Asias He
1a75a60cfc abstract_replication_strategy: Add allow_remove_node_being_replaced_from_natural_endpoints
Decide if the replication strategy allows removing the node being replaced from
the natural endpoints when a node is being replaced in the cluster.
LocalStrategy is not allowed to do so because it always returns the node
itself as the natural_endpoints and the node will not appear in the
pending_endpoints.

It is needed by the "Make replacing node take writes" work.

Refs: #5482
2020-04-30 10:22:30 +08:00
Pekka Enberg
eac9e253e7 sstables: Fix open-coded version parsing in make_descriptor()
The make_descriptor() function parses a string representation of sstable
version using a ternary operator. Clean it up by using
sstables::from_string(), which is future-proof when we add support for
later sstable formats.
Message-Id: <20200429082126.15944-1-penberg@scylladb.com>
2020-04-29 16:25:12 +02:00
Asias He
bd6691301e token_metadata: Calculate pending ranges for replacing node
It will be needed soon for making replace node take writes.

Refs: #5482
2020-04-29 16:02:10 +08:00
Asias He
75cf1d18b5 storage_service: Unify handling of replaced node removal from gossip
Currently, after the replacing node finishes the replace operation, it
removes the node being replaced from gossip directly in
storage_service::join_token_ring() with gossiper::replaced_endpoint(),
so the gossip states for the replaced node is gone.

When other nodes know the replace operation is done, they will call
storage_service::remove_endpoint() and gossiper::remove_endpoint() to
quarantine the node but keep the gossip states. To prevent the
replacing node from learning the state of the replaced node again from an
existing node, the replacing node uses 2X quarantine time.

This makes the gossip states for the replaced node different on other
nodes and replacing nodes. It makes it harder to reason about the
gossip states because of the discrepancy of the states between nodes.

To fix, we unify the handling of replaced node on both replacing node
and other nodes. On all the nodes, once the replacing node becomes
NORMAL status, we remove the replaced node from token_metadata and
quarantine it but keep the gossip state. Since the replaced node is no
longer a member of the cluster, the fatclient timer will count and
expire and remove the replaced node from gossip.

Refs: #5482
2020-04-29 16:02:10 +08:00
Asias He
66c1907524 storage_service: Update tokens and replace address for replace operation
The motivation is to make the replacing node have the same view of the
token ring as the rest of the cluster.

If the replacing node has the same ip of the node being replaced, we
should update the tokens in token_metadata when the replace operation
starts, so that this replacing node and the rest of the cluster see the
same token ring.

If the replacing node has the different ip address of the node being
replaced, we should update the tokens in token_metadata only when
replace operation is done, because the other nodes will update the
replacing node's token in token_metadata when the replace operation is
done.

Refs: #5482
2020-04-29 16:02:00 +08:00
Nadav Har'El
ff5615d59d alternator test: drastically reduce time to boot Scylla
The alternator test, test/alternator/run, runs Scylla and runs the
various tests against it. Before this patch, just booting Scylla took
about 26 seconds (for a dev build, on my laptop). This patch reduces
this delay to less than one second!

It turns out that almost the entire delay was artificial, two periods
of 12 seconds "waiting for the gossip to settle", which are completely
unnecessary in the one-node cluster used in the Alternator test.
So a simple "--skip-wait-for-gossip-to-settle 0" parameter eliminates
these long delays completely.

Amusingly, the Scylla boot is now so fast, that I had to change a "sleep 2"
in the test script to "sleep 1", because 2 seconds is now much more than
it takes to boot Scylla :-)

Fixes #6310.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200428145035.22894-1-nyh@scylladb.com>
2020-04-29 07:55:03 +02:00
Benny Halevy
3b31acfa80 exceptions: drop OVERFLOW_ERROR cql binary protocol extension
Client drivers act differently on error codes they don't recognize.
Adding new error codes is considered a protocol extension and
should be negotiated with the client.

This change keeps `overflow_error_exception` internally but uses
the INVALID cql error code to return the error message back to the client
similar to keyspace_not_defined_exception.

We (and cassandra) already use `invalid_request_exception` extensively
to return various errors related to invalid values or types in the query.

Fixes #6264

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Reviewed-by: Gleb Natapov <gleb@scylladb.com>
Message-Id: <20200422130011.108003-1-bhalevy@scylladb.com>
2020-04-28 12:16:00 +03:00
Piotr Sarna
09e4f3b917 alternator: implement ScanIndexForward
The ScanIndexForward parameter is now fully implemented
and can accept ScanIndexForward=false in order to query
the partitions in reverse clustering order.
Note that reading partition slices in reverse order is less
efficient than forward scans and may put a strain on memory
usage, especially for large partitions, since the whole partition
is currently fetched in order to be reversed.

Fixes #5153
2020-04-28 11:44:46 +03:00
Piotr Sarna
be5d3f4733 Merge 'A bunch of refactors in versioned_value and gossiper' from Kamil
1. Remove the `versioned_value::factory` class, it didn't add any value. It just
   forced us to create an object for making `versioned_value`s, for no sensible
   reason.
2. Move some `versioned_value` deserialization code (string -> internal data
   structures) into the versioned_value module. Previously, it was scattered all
   around the place.
3. Make `gossiper::get_seeds` const and return a const reference.

I needed these refactors for a PR I was preparing to fix an issue with CDC. The
attempt of fixing the issue failed (I'm trying something different now), but the
refactors might be useful anyway.

* kbr--vv-refactor:
  gossiper: make `get_seeds` method const and return a const ref
  versioned_value: remove versioned_value::factory class
  gms: move TOKENS string deserialization code into versioned_value
2020-04-28 10:27:45 +02:00
Pavel Solodovnikov
ed7a7554b8 storage_proxy: allow cas() to accept nullptr read_command
This patch allows users of storage_proxy::cas() to supply nullptr
as `query::read_command` which is supposed to skip the procedure
of reading the existing value.

The feature is used in alternator code for Read-Modify-Write
operations: some of them don't require reading previous item
values before updating.

Move `read_nothing_read_command` from alternator code to
storage_proxy layer and fabricate a new no-op command from it when
storage_proxy::cas() is used with nullptr read_command.

This allows to avoid sprinkling if-else branches all over the code
in order to check for null-equality of `cmd`.

We return from storage_proxy::query() very early with an empty
result in case we're given an empty partition_slice (which resides
inside the passed `read_command`) so this approach should be
perfectly fine.

Expand documentation for the `cas()` function to cover new
possible value for `cmd` argument.

Fixes: #6238
Tests: unit(dev, debug)

Signed-off-by: Pavel Solodovnikov <pa.solodovnikov@scylladb.com>
Message-Id: <20200428065235.5714-1-pa.solodovnikov@scylladb.com>
2020-04-28 10:44:19 +03:00
Asias He
35c5ef78b9 repair: Fix hang when the writer is dead
Consider:

When repair master gets data from repair follower:

1) apply_rows_on_master_in_thread is called
2) a repair writer is created with _repair_writer.create_writer
3) the repair writer fails
4) data is written to the queue _mq[node_idx]->push_eventually attached
   with the writer

Since the writer is dead, no one is going to fetch data from the _mq
queue. The apply_rows_on_master_in_thread will block forever.

To fix, when the writer is failed, we should abort the _mq queue.

Refs: #6248
2020-04-28 12:14:32 +08:00
Raphael S. Carvalho
02e046608f api/service: fix segfault when taking a snapshot without keyspace specified
If no keyspace is specified when taking snapshot, there will be a segfault
because keynames is unconditionally dereferenced. Let's return an error
because a keyspace must be specified when column families are specified.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20200427195634.99940-1-raphaelsc@scylladb.com>
2020-04-27 23:37:00 +03:00
Pekka Enberg
3a10bddd7d configure.py: Add '--with-seastar' option
This patch adds a '--with-seastar=<PATH>' option to configure.py, which
allows user to override the default seastar submodule path. This is
useful when building packages from source tarballs, for example.

Message-Id: <20200427165511.6448-1-penberg@scylladb.com>
2020-04-27 20:01:35 +03:00
Rafael Ávila de Espíndola
c7d74a59f5 sstables: Call revert_charges in compaction_write_monitor::write_failed
We still call it in the destructor to cover the successful case. We
can't do that in on_data_write_completed because it is too early.

Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
2020-04-27 08:58:31 -07:00
Rafael Ávila de Espíndola
95ee54f3cc sstables: Call monitor->write_failed earlier.
A writer is destroyed just before consume_in_thread returns, since the
adapter takes ownership of it.

The problem is that a monitor can keep a reference to a
writer_offset_tracker that is owned by that writer.

The monitor is accessed periodically via
backlog_controller::_update_timer. This means we have to deregister
from the list of ongoing writes before the writer is destroyed.

If the write fails, the deregistration happens in write_failed, but it
is currently called after the writer is destroyed.

This patch moves the call to write_failed to the writer destructor as
I could not find a convenient location to put it.

Since the writer is destroyed in consume_in_thread, we could call it
there, but then we also have to update consume.

There is a similar problem with the case where the sstable is written
correctly. That will be fixed in the next patch.

Fixes #6221.

Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
2020-04-27 08:58:31 -07:00
Rafael Ávila de Espíndola
95acfd1d58 sstables: Add write_failed to the write_monitor interface
Only database_sstable_write_monitor needs it so far, but the call
needs to be moved earlier, which requires calling it in code paths
that don't know about database_sstable_write_monitor.

Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
2020-04-27 08:58:31 -07:00
Raphael S. Carvalho
5ac0d31323 test: perf_simple_query: fix test with smp count > 1
that code doesn't run under a thread, so let's futurize it.
the code worked with single cpu because get() returns right away
due to no deferring point.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20200427155303.82763-1-raphaelsc@scylladb.com>
2020-04-27 18:58:25 +03:00
Pavel Emelyanov
108a944e7b ring_position_ext: Add formatter
It's not currently used, but helped when debugging reworked
row cache lookups.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20200427144712.14794-1-xemul@scylladb.com>
2020-04-27 18:01:01 +03:00
Avi Kivity
50e82f523a Update seastar submodule
* seastar 37a22d9de6...8fae03c22d (5):
  > Merge "Reloadable TLS certificates" from Calle
  > future: improve variadic future warning
  > io_queue: deprecated request tracking
  > test: futures_test: adjust to make_ready_future noexcept
  > future: specify make_ready_future as noexcept
2020-04-27 16:35:05 +03:00
Nadav Har'El
858a12755b test.py: run Alternator test with the correct Scylla binary
The Alternator test's run script, test/alternator/run, runs Scylla.
By default, it chooses the last built Scylla executable build/*/scylla.

However, test.py has a "mode" option, that should be able to choose which
build mode to run. Before this patch, this mode option wasn't honored by
the Alternator test, so a "test.py alternator/run" would run the same
Scylla binary (the one last built) three times, instead of running each
of the three build modes.

We fix this in this patch: test.py now passes the "SCYLLA" environment
variable to the test/alternator/run script, indicating the location of the
Scylla binary with the appropriate build mode. The script already supported
this environment variable to override its default choice of Scylla binary.

In test.py, we add to the run_test() function an optional "env" parameter
which can be used to pass additional environment variables to the test.

Fixes #6286

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200427131958.28248-1-nyh@scylladb.com>
2020-04-27 16:23:58 +03:00
Pekka Enberg
fad6712673 dbuild: Improve error message if Docker is not installed
If you run "dbuild" on a freshly installed machine, the error message is
not the most helpful one. Fix it up.

Before:

  $ ./tools/toolchain/dbuild
  ./tools/toolchain/dbuild: line 113: docker: command not found
  ./tools/toolchain/dbuild: line 156: docker: command not found

After:

  $ ./tools/toolchain/dbuild
  dbuild: Please install Docker on this machine to run dbuild.

  Run `./tools/toolchain/dbuild --help' to print the full help message.

Message-Id: <20200426192746.11034-1-penberg@scylladb.com>
2020-04-27 16:22:18 +03:00
Calle Wilund
040ffa6e64 distributed_loader: Add concurrency control override for named keyspaces
Fixes #6202

Distributed loader sstable opening is gated through the
database::sstable_load_concurrency_sem() semaphore
(at a concurrency of 3).

This is (according to creation comment) to reduce memory footprint
during bootstrap, by partially serializing the actual opening of
existing sstables.

However, in certain versions of the product, there exist circular
dependencies between data in some sstables and the ability to actually
read others. Thus when gated as above, we can end up with the
dependents acquiring the semaphore fully, and once stuck waiting for
population of their dependency effectively blocking this from ever
happening.

Since we probably do not want to remove the concurrency control,
and increasing it would only push the problem further away,
we solve the issue by adding the ability to mark certain keyspaces
as "prioritized" (pre-bootstrap), and allow them to populate outside
the normal concurrency control semaphore. Concurrency increase is
however limited to one extra sstable per shard and prio keyspace.

Message-Id: <20200415102431.20816-1-calle@scylladb.com>
2020-04-27 16:21:13 +03:00
Piotr Sarna
d3aba44aea Merge 'cdc: fix the "NoHostAvailable" client error when CL is not met'
from Juliusz.

CL of LOCAL_QUORUM used to be hardcoded into CDC preimage query
and led to an error every time the number of replicas was lower than CL
could require. The solution here is to link the CLs of writes
to base table with the CLs of CDC reads, so the client will get
the (limited) control over the consistency of preimage SELECTs
(instead of constant misleading errors).

The algorithm is as follows:
1. If write that caused CDC activity was done with CL = ANY,
  then do preimage read with CL = ONE.
2. If write that caused CDC activity was done with CL = ALL,
  then do preimage read with CL = QUORUM.
3. SERIAL and LOCAL_SERIAL writes cause preimage read with QUORUM
  and LOCAL_QUORUM, respectively.
4. In other cases do preimage read with the same CL as base write.

To further mitigate the incomprehensible error being sent to client,
I wrapped the preimage's SELECT query in try-catch and
intercept the `unavailable_exception`, which was manifesting as
`NoHostAvailable` in Python and Java drivers. Now client gets a new
error code and a message specific to the issue of CL not being met
by the preimage query.

Fixes #5746

* jul-stas-5746-cdc-replication-factor:
  cdc: fix the "NoHostAvailable" client error when CL is not met
  cdc: CL for preimage select is calculated from base write CL
2020-04-27 14:24:12 +02:00
Juliusz Stasiewicz
d37b3f34f1 cdc: fix the "NoHostAvailable" client error when CL is not met
This commit resolves the client-observable effect of CDC read
consistencies. I wrapped the preimage's SELECT query in try-catch to
intercept the `unavailable_exception`, which led to misleading
`NoHostAvailable` in Python and Java drivers. Now client gets a new
error code and a message specific to the issue of CL not being met
by the preimage query.

Fixes #5746
2020-04-27 13:56:57 +02:00
Piotr Sarna
c32faee657 Merge 'counters: Fix filtering of counters' from Juliusz
Queries with `ALLOW FILTERING` and constraints on counter
values used to be rejected as "unimplemented". The reason
was a missing tri-comparator, which is added in this patch.

Fixes #5635

* jul-stas-5635-filtering-on-counters:
  cql/tests: Added test for filtering on counter columns
  counters: add comparator and remove `unimplemented` from restrictions
2020-04-27 13:53:34 +02:00
Juliusz Stasiewicz
afee590ed7 cql/tests: Added test for filtering on counter columns
Tested predicates: IN, EQ, GE, GT, LE, LT.
Untouched counters are expected to evaluate as 0.
Deleted counters are expected not to appear at all.
2020-04-27 13:36:16 +02:00
Juliusz Stasiewicz
cf2d81bb12 counters: add comparator and remove unimplemented from restrictions
CQL `counter_type_impl` is now made comparable by deserializing it
as an `int64_t`. It allows the use of counters in statement
restrictions.
2020-04-27 13:27:48 +02:00
Avi Kivity
1f902302ad build: replace xxhash submodule with OS package
The xxhash library has been packaged by Fedora, so we can use it
instead of carrying the submodule. This allows us to
receive updates as the OS packages are updated. Build time will
not be reduced since it is a header-only library.

xxhash preserves the hash results across versions so rolling
upgrades will still work.

The frozen toolchain is updated with the new package.

Tests: unit (dev)
2020-04-27 14:00:31 +03:00
Mike Goltsov
068bb3a5bf fix error in fstrim service (scylla_util.py)
On Centos 7 machine:

fstrim.timer is not enabled, only unmasked, due to scylla_fstrim_setup on installation.
When trying to run the scylla-fstrim service manually you get this error:

Traceback (most recent call last):
File "/opt/scylladb/scripts/libexec/scylla_fstrim", line 60, in <module>
main()
File "/opt/scylladb/scripts/libexec/scylla_fstrim", line 44, in main
cfg = parse_scylla_dirs_with_default(conf=args.config)
File "/opt/scylladb/scripts/scylla_util.py", line 484, in parse_scylla_dirs_with_default
if key not in y or not y[k]:
NameError: name 'k' is not defined

It is caused by an error in scylla_util.py

Fixes #6294.
2020-04-27 13:32:11 +03:00
Pavel Solodovnikov
f6e765b70f cql3: pass column_specification via lw_shared_ptr
`column_specification` class is marked as "final": it's safe
to use non-polymorphic pointer "lw_shared_ptr" instead of a
more generic "shared_ptr".

tests: unit(dev, debug)

Signed-off-by: Pavel Solodovnikov <pa.solodovnikov@scylladb.com>
Message-Id: <20200427084016.26068-1-pa.solodovnikov@scylladb.com>
2020-04-27 12:47:42 +03:00
Takuya ASADA
811b256f2b redis: add test for setex/ttl 2020-04-27 13:58:33 +09:00
Takuya ASADA
d845fde560 redis: add ttl command
Add ttl command that returns remaining TTL of the key.

See: https://redis.io/commands/ttl
2020-04-27 13:58:33 +09:00
Takuya ASADA
98cae802c0 redis: add setex command
Add setex to set key with TTL.

See: https://redis.io/commands/setex
2020-04-27 13:58:33 +09:00
Pekka Enberg
7304a795e5 scripts/jobs: Keep memory reserve when calculating parallelism
The "jobs" script is used to determine the amount of compilation
parallelism on a machine. It attempts to ensure each GCC process has at
least 4 GB of memory per core. However, in the worst case scenario, we
could end up having the GCC processes take up all the system memory,
forcing swapping or OOM killer to kick in. For example, on a 4 core
machine with 16 GB of memory, this worst case scenario seems easy to
trigger in practice.

Fix up the problem by keeping a 1 GB of memory reserve for other
processes and calculating parallelism based on that.

Message-Id: <20200423082753.31162-1-penberg@scylladb.com>
2020-04-26 19:38:47 +03:00
Piotr Sarna
e17c237feb alternator: fix integer overflow warning in token generation
When generating tokens for parallel scan, debug mode undefined behavior
sanitizer complained that integer overflow sometimes happens when
multiplying two big values - delta and segment number.
In order to mitigate this warning, the multiplication is now split
into two smaller ones, and the generated machine code remains
identical (verified on gcc and clang via compiler explorer).

Fixes #6280
Tests: unit(dev)
2020-04-26 19:06:07 +03:00
Piotr Sarna
c66661c582 table: bypass cache when generating view updates from streaming
There's no indication that data needed for generating view updates
from staging sstables is going to be immediately useful for the
user, and a large amount of it can push hot rows out of the cache,
thus deteriorating performance.

Fixes #6233
Tests: unit(dev)
2020-04-26 15:43:02 +03:00
Rafael Ávila de Espíndola
0d89bbd57f row_cache_alloc_stress_test: Make sure GCC can't delete a new
We want to test that a std::bad_alloc is thrown, but GCC 10 has a new
optimization (-fallocation-dce) that removes dead allocations.

This patch assigns the value returned by new to a global so that GCC
cannot delete it.

With this all tests in a dev build pass with GCC 10.

Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200424201531.225807-1-espindola@scylladb.com>
2020-04-26 15:22:04 +03:00
Rafael Ávila de Espíndola
543a9ebd9b tests: Wait for a few futures
GCC 10 now warns on these. This fixes the dev build with gcc 10.

Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200424161006.17857-1-espindola@scylladb.com>
2020-04-26 15:20:40 +03:00
Takuya ASADA
df4fac2849 dist: add scylla_memory_setup
Ask the user whether the host is shared with other services, and set
"--lock-memory 1" if it is not shared.

Fixes #1393
2020-04-26 13:34:05 +03:00
Rafael Ávila de Espíndola
ac3c1f6c0f configure: Don't use -static-libgcc
The configure option is --static-stdc++, so it is surprising that it also
enables -static-libgcc.

Also, -static-libgcc doesn't seem to work with debug builds.

This patch removes -static-libgcc which fixes debug builds with
--static-stdc++. Such builds are convenient for testing new versions
of gcc.

Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200424214117.257195-1-espindola@scylladb.com>
2020-04-25 19:47:36 +03:00
Tomasz Grabiec
31ccd3750b Update seastar submodule
* seastar b5fb927...37a22d9 (19):
  > io_queue: bring capacity back
  > tls_test: Remove redundant move
  > httpd_test: Remove unused fields
  > everywhere: Remove unused lambda captures
  > rpc: add Doxygen documentation the protocol class
  > build: Pass --create-cc to seastar-json2code.py
  > seastar-json2code: Add a --create-cc option
  > future: move some static_assert()ions from future.hh to future.cc
  > http server: fix date function on non-English locales
  > everywhere: Add messages to static_assert
  > http server: fix "Date" header format
  > future: Fix invalid static_assert
  > fair_queue: remove legacy capacity configuration
  > reactor: fix private 'pollfn' alias
  > defer: include std headers
  > spinlock: add try_lock method
  > testing: Add missing <iostream> include to seastar_test.cc
  > rpc: Avoid excessive number of reallocations when reading compressed frames
  > timer: document
2020-04-23 20:50:27 +02:00
Pavel Emelyanov
98635b74a6 main: Keep feature_service for storage_proxy
Fixes #6250

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20200423165608.32419-1-xemul@scylladb.com>
2020-04-23 20:46:36 +02:00
Pavel Emelyanov
83fe0427d2 api/cache_service: Relax getting partitions count
This patch has two goals -- speed up the total partitions
calculations (walking databases is faster than walking tables),
and get rid of row_cache._partitions.size() call, which will
not be available on new _partitions collection implementation.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20200423133900.27818-1-xemul@scylladb.com>
2020-04-23 17:47:58 +02:00
Pavel Emelyanov
6ede253479 api/cache_service: Fix get_row_capacity calculation
Current code gets table->row_cache->cache_tracker->region and sums
up the region's used space for all tables found.

The problem is that all row_cache-s share the same cache_tracker
object from the database, thus the resulting number is not correct.

Fix this by walking cache_tracker-s from databases instead.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20200423133755.27187-1-xemul@scylladb.com>
2020-04-23 17:05:52 +03:00
Pavel Emelyanov
d3b6f66f50 row_cache: Remove unused invalidate_unwrapped()
Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20200423133557.27053-1-xemul@scylladb.com>
2020-04-23 17:04:31 +03:00
Rafael Ávila de Espíndola
e6f4996e44 atomic_vetor: Don't pass references to callbacks
This is more strict than it needs to be, but it avoids any bugs like
the one fixed by the previous patch.

Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200422182304.120906-2-espindola@scylladb.com>
2020-04-23 16:06:37 +03:00
Rafael Ávila de Espíndola
d8555513a9 gms: Don't keep references to reallocated vector entries
These callbacks can block a seastar thread and the underlying vector
can be reallocated concurrently.

This is no different than if it was a plain std::vector and the
solution is similar: use values instead of references.

Fixes #6230

Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200422182304.120906-1-espindola@scylladb.com>
2020-04-23 16:06:36 +03:00
Rafael Ávila de Espíndola
fbcf741c2d cql functions: Use switch to find the cast function to use
This produces more compact code and avoids the anti-pattern of
building a map with statically known values. If the values are given
to GCC via a switch statement it can do a much better job at compile
time than libstdc++ can at runtime.

Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200422224905.198794-1-espindola@scylladb.com>
2020-04-23 11:46:09 +03:00
Nadav Har'El
1f75efb556 alternator: use RF=3 even if some nodes are temporarily down
Alternator is supposed to use RF=3 for new tables. Only when the cluster is
smaller than 3 nodes do we use RF=1 (and warn about it) - this is useful for
testing.

However, our implementation incorrectly tested the number of *live* nodes in
the cluster instead of the total number of nodes. As a result, if a 3-node
cluster had one node down, and a new table was created, it was created with
RF=1, and immediately could not be written because when RF=1, any node down
means part of the data is unavailable.

This patch fixes this: The total number of nodes in the cluster - not the
number of live nodes - is consulted. The three-node-cluster-with-a-dead-node
setup above creates the table with RF=3, and it can be written because two
living nodes out of three are enough when RF=3 and we do quorum writes and
reads.

We have a dtest to reproduce this bug (and its fix), and it's also easy to
reproduce manually by starting a 3-node cluster, killing one of the nodes,
and then running "pytests". Before this patch, the tests can create tables
but then fail to write to them. After this patch, the tests succeed on the
same cluster with the dead node.

Fixes #6267

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200422182035.15106-2-nyh@scylladb.com>
2020-04-23 08:23:05 +02:00
Nadav Har'El
08c39bde1a gossiper: add convenience function for getting number of nodes
The gossiper has convenience functions get_up_endpoint_count() and
get_down_endpoint_count(), but strangely no function to get the total
number. Even though it's easy to calculate the total by summing up their
results, it is inefficient and also inconvenient because each of these
functions returns a future.

So let's add another function, get_all_endpoint_count(), to get the
total number of nodes. We will use this function in the next patch.

Signed-off-by: Nadav Har'El <n...@scylladb.com>
Message-Id: <20200422182035.15106-1-nyh@scylladb.com>
2020-04-23 08:23:05 +02:00
Nadav Har'El
86fadd700f docs: Alternator parallel scan is supported now
After fixing issue #6260, the "parallel scan" feature in Alternator is
supported, so drop the sentence in alternator.md saying that it isn't.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200422090738.21648-1-nyh@scylladb.com>
2020-04-23 08:16:16 +02:00
Nadav Har'El
92e36c5df5 test/alternator: increase timeout on Scylla boot
The Alternator test boots Scylla to test against it. We set an arbitrary
timeout for this boot to succeed: 100 seconds. This 100 seconds is
significantly more than the 25 seconds it takes on my laptop, and I thought
we'd never reach it. But it turns out that in some setups - running the
very slow debug build on slow and overcommitted nodes - 100 seconds is
not enough.

So this patch doubles the timeout to 200 seconds.

Note that this "200 seconds" is just a timeout, and doesn't affect normal
runs: Both a successful boot and a failed boot are recognized as soon as
they happen, and we never unnecessarily wait the entire 200 seconds.

Fixes #6271.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200422193920.17079-1-nyh@scylladb.com>
2020-04-23 07:47:21 +02:00
Piotr Jastrzebski
0416d70c9f cdc: use CDCPartitioner for CDC Log
This will allow deterministic stream_id generation
and would remove the risk of not being able to generate
a stream id for some vnode.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-04-22 18:25:51 +02:00
Piotr Jastrzebski
1d1c6af72a dht: Add find_first_token_for_shard
This new function finds the first token in range (start, end] that
belongs to given shard.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-04-22 18:24:54 +02:00
Piotr Jastrzebski
c82adb7906 dht: use long_token in token::to_int64
Previous implementation of to_int64 wasn't handling dht::minimum_token
and dht::maximum_token.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-04-22 16:12:00 +02:00
Tomasz Grabiec
c59ec8d97f Merge "Avoid some memory copies in lwt" from Gleb
* seastar-dev.git gleb/lwt-shared-proposal:
 lwt: pass paxos::proposal as a shared pointer everywhere
 lwt: do not copy proposal in paxos_state::accept
 lwt: make load_paxos_state to take partition_key_view instead of a
    reference
2020-04-22 13:43:03 +02:00
Gleb Natapov
97af6bb0bd lwt: make load_paxos_state to take partition_key_view instead of a reference
Some callers have partition_key_view, but not partition_key, so they need
to create a temporary and copy just to pass a reference. Change it by
accepting a view.
2020-04-22 13:51:43 +03:00
Gleb Natapov
c970da3811 lwt: do not copy proposal in paxos_state::accept
A proposal is passed as a reference and all callers have it in stable
memory until the call ends, so it is safe to use the reference
everywhere.
2020-04-22 13:51:43 +03:00
Gleb Natapov
fbb04698d0 lwt: pass paxos::proposal as a shared pointer everywhere
paxos::proposal reference is passed into a lot of functions and sometimes
it has to be copied to prolong its lifetime. Create it as a shared
pointer and pass it everywhere to avoid those copies.
2020-04-22 13:51:43 +03:00
Calle Wilund
525b283326 commitlog::read_log_file: Preserve subscription across reading
Fixes #6265

Return type for read_log_file was previously changed from
subscription to future<>, returning the previously returned
subscription's result of done(). But it did not preserve the
subscription itself, which in turn will cause us to (in
work::stream), call back into a deleted object.

Message-Id: <20200422090856.5218-1-calle@scylladb.com>
2020-04-22 12:12:11 +03:00
Asias He
8b7189f2dd mutation_writer_test: Add test_multishard_writer_producer_aborts
Without the patch "multishard_writer: Abort the queue attached to consumers
when producer fails", the test would hang forever.

Fixes #6241
2020-04-22 16:28:07 +08:00
Piotr Sarna
dbb9574aa2 alternator: allow parallel scan
Parallel scans can be performed by providing Segment and TotalSegments
attributes to Scan request, which can be used to split the work among
many workers.
This patch makes the parallel scan test succeed, so the xfail is removed.

Fixes #5059
2020-04-22 11:06:15 +03:00
Botond Dénes
e778b072b1 read_command: use bool_class for is_first_page parameter
The constructor of `read_command` is used both by IDL and clients in the
code. However, this constructor has a parameter that is not used by IDL:
`read_timestamp`. This requires that this parameter is the very last in
the list and that new parameters that are used by IDL are added before
it. One such new parameter was `bool is_first_page`. Adding this
parameter right before the read timestamp one created a situation where
the last parameter (read_timestamp) implicitly converts to the one
before it (is_first_page). This means that some call sites passing
`read_timestamp` were now silently converting this to `is_first_page`,
effectively dropping the timestamp.

This patch aims to rectify this, while also avoiding similar accidents
in the future, by making `is_first_page` a `bool_class` which doesn't
have any implicit conversions defined. This change does not break the
ABI as `bool_class` is also sent as a `bool` on the wire.

Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Tests: unit(dev)
Message-Id: <20200422073657.87241-1-bdenes@scylladb.com>
2020-04-22 11:01:22 +03:00
Rafael Ávila de Espíndola
45ee52724c cql functions: Don't use a std::function for casts
Casts only depend on their operands, so a plain function pointer is
sufficient. This allows replacing all the make_castas_* functions that
return a lambda with plain castas_* functions that do the casting.

Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200413162014.23884-2-espindola@scylladb.com>
2020-04-22 10:44:56 +03:00
Glauber Costa
1f9c37fb5e view_updating_consumer: move reference to a pointer
It is currently not possible to wrap the view_updating_consumer in an
std::optional. I intend to do it to allow for compactions to optionally
generate view updates.

The reason for that is that view_updating_consumer has a reference as a
member, which makes the move assignment operator not be implicitly
generated.

This patch fixes it by keeping a pointer instead of a reference.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20200421123648.8328-1-glauber@scylladb.com>
2020-04-22 10:05:35 +03:00
Botond Dénes
7dabf75682 service: messaging_service: resolve rpc set_logger deprecation warning
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200407091413.310764-1-bdenes@scylladb.com>
2020-04-22 10:05:35 +03:00
Piotr Jastrzebski
7884eada1a cdc: add CDCPartitioner
This is a special partitioner that will be used by
CDC Log. It works only with partition key that is blob
composed of two ints. The first int is a token this
partitioner will map the key to. The second int is there
to make it possible to create multiple keys that are different
from each other but map to the same token.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-04-21 15:50:22 +02:00
Piotr Jastrzebski
330cd162f0 stream_id: add token_from_bytes static function
This function will be used by CDCPartitioner to
extract token from partition key.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-04-21 15:50:22 +02:00
Piotr Jastrzebski
ae1f14095f i_partitioner: Stop distinguishing whether keys order is preserved
Scylla inherited a concept of partitioners that preserve order of keys from
the origin but it is not used for anything. Moreover, none of the existing
partitioners preserves keys order. The only partitioner that did this in the
past was ByteOrderedPartitioner and Scylla does not support it any more.

For a partitioner to preserve an order of the keys means that if there are two
keys A and B such that A < B then token(A) < token(B) where token(X) is a token
the partitioner assigns to key X.

This patch removes dht::i_partitioner::preserves_order with all its overrides.
The only place that was using this member function was a check in thrift server
and it is safe to remove the check because the check was only done
to differentiate the error message for partitioners that do and do not preserve
the order of the keys.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-04-21 15:50:22 +02:00
Botond Dénes
c9d3053e91 test/boost: castas_fcts_test: add test for identity casts
aa9a582f4 allowed all types to be cast to themselves, but didn't add a
unit test for this. This patch rectifies this.

Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200421125902.1709684-1-bdenes@scylladb.com>
2020-04-21 15:10:28 +02:00
Juliusz Stasiewicz
c70311f73e cdc: CL for preimage select is calculated from base write CL
CL of LOCAL_QUORUM used to be hardcoded into CDC preimage query
and led to an error when number of replicas was lower than CL
would require. The solution here is to link the CLs of writes
to base table with the CLs of CDC reads, so the client will get
the (limited) control over the consistency of preimage SELECTs
(instead of getting error every time).

The algorithm is as follows:
1. If write that caused CDC activity was done with CL = ANY,
  then do preimage read with CL = ONE.
2. If write that caused CDC activity was done with CL = ALL,
  then do preimage read with CL = QUORUM.
3. SERIAL and LOCAL_SERIAL writes cause preimage read with QUORUM
  and LOCAL_QUORUM, respectively.
4. In other cases do preimage read with the same CL as base write.
2020-04-21 14:33:36 +02:00
Avi Kivity
2482e53de9 test: alternator: configure scylla for test environment in terms of cpu and disk
Currently, the alternator tests configure scylla to use all the
logical cores in the host system, but only 1GB of RAM. This can lead
to a small amount of memory per core.

It also uses the default disk configuration, which is safe, but can be
very slow on mechanical or non-enterprise disks.

Change to use a fixed --smp 2 configuration, and add --overprovisioned
for maximum flexibility (no spinning). Use --unsafe-bypass-fsync
for faster performance on non-enterprise or mechanical disks, assuming
that the test data is not important.

Fixes #6251.
Message-Id: <20200420154112.123386-1-avi@scylladb.com>
2020-04-20 18:50:46 +03:00
Nadav Har'El
44a1daf025 merge: Allow accessing Scylla system tables from alternator
Merged patch series from Piotr Sarna:

This series allows reading rows from Scylla's system tables
via alternator by using a virtual interface.
If a Query or Scan request intercepts a table name with the following
pattern: .scylla.alternator.KEYSPACE_NAME.TABLE_NAME, it will read
the data from Scylla's KEYSPACE_NAME.TABLE_NAME table.
The interface is expected to only return data for Scylla system tables
and trying to access regular tables via this interface is expected
to return an error.
This series comes with tests (alternator-test, scylla_only).

Fixes #6122
Tests: alternator-test(local, remote (to verify that scylla_only works))

Piotr Sarna (5):
  alternator: add fallback serialization for all types
  alternator: add fetching static columns if they exist
  alternator: add a way of accessing system tables from alternator
  alternator-test: add scylla-only test for querying system tables
  docs: add an entry about accessing Scylla system tables

 alternator-test/test_system_tables.py | 61 +++++++++++++++++++++++++++
 alternator/executor.cc                | 38 ++++++++++++++++-
 alternator/executor.hh                |  1 +
 alternator/serialization.cc           | 11 +++--
 docs/alternator/alternator.md         | 15 +++++++
 5 files changed, 122 insertions(+), 4 deletions(-)
 create mode 100644 alternator-test/test_system_tables.py
2020-04-20 18:21:20 +03:00
Piotr Sarna
03f41b9d96 db: remove trailing whitespace
Found when backporting a patch to 3.3.
Message-Id: <fa406597deaacff56dbba99fa167715b041bbb52.1587375123.git.sarna@scylladb.com>
2020-04-20 12:58:55 +02:00
Kamil Braun
d73a21057a gossiper: make get_seeds method const and return a const ref 2020-04-20 12:57:16 +02:00
Kamil Braun
1f7290a0ff versioned_value: remove versioned_value::factory class
If there was a Most Useless Abstraction award, this would be a good
candidate.
2020-04-20 12:57:16 +02:00
Kamil Braun
113384b6f8 gms: move TOKENS string deserialization code into versioned_value
And do the same with CDC_STREAMS_TIMESTAMP.

The code that took a list of tokens represented as a string inside
versioned_value (for gossiping) and deserialized it into
an `unordered_set<dht::token>` lived in the storage_service module,
while the code that did the serializing (set -> string) lived in
versioned_value. There was a similar situation with the CDC generation
timestamp.

To increase maintainability and reusability, the deserialization code is
now placed next to the serialization code in versioned_value.

Furthermore, the `make_full_token_string`, `make_token_string`, and
`make_cdc_streams_timestamp_string` (serialization functions) are moved
out of versioned_value::factory and made static methods of
versioned_value instead.
2020-04-20 12:57:13 +02:00
Tomasz Grabiec
e648e314e5 Merge "Drop only learnt value on PRUNE" from Gleb
It is unsafe to remove the entire row, so only drop the learnt value from
system.paxos table.

Fixes: #6154
2020-04-20 12:06:04 +02:00
Asias He
d86958d3b2 multishard_writer: Abort the queue attached to consumers when producer fails
We have this in multishard_writer:

future<uint64_t> multishard_writer::operator()() {
    return distribute_mutation_fragments().finally([this] {
        return wait_pending_consumers();
    }).then([this] {
        return _consumed_partitions;
    });
}

The wait_pending_consumers which waits for the consumers to finish is
called even when distribute_mutation_fragments fails.

When distribute_mutation_fragments fails and the failure is due to the
producer failing, consumers can wait for data which will never come because
the producer has failed already. This can cause a deadlock.

To fix, when distribute_mutation_fragments fails, we should abort the
queues that are attached to the readers used by the consumers.

Fixes #6241
2020-04-20 14:53:24 +08:00
Piotr Jastrzebski
2aaf81bf7c dht: Exclude -2^63 value from get_random_token
-2^63 is a value reserved for min/max token boundaries and shouldn't be used for
regular tokens. This patch fixes get_random_token to never create token with
value -2^63. On the way dht::get_random_number template method is removed
because it was exclusively used by get_random_token.

Also use uniform_int_distribution with int64_t instead of uint64_t by using
correct constructor parameter that guarantees values between -2^63+1 and 2^63-1
inclusively.

Tests: unit(dev)

Fixes #6237.
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
Message-Id: <0a1a939355f5005039d5c2c7c513bad94cf60be2.1587302093.git.piotr@scylladb.com>
2020-04-19 18:17:35 +03:00
Gleb Natapov
73391420fb lwt: drop only most recently learnt value during prune.
It turned out we cannot drop the information about most recent commit
entirely since it is used to cut off already outdated accepted values.
Otherwise the following scenario can happen:

1. cas1 prepares on A, B, C, gets one accept from A
2. cas2 prepares on B, C, gets 2 accepts on B and C, learns on B, C
3. cas3 initiates a prepare on A, learns about cas1's accept,
4. cas2 learns on A, prunes on A, B, C

Now cas3 will replay cas1's value because it does not know that it is
less than the already committed one (removed during step 4).

The patch drops only the committed value and keeps the information about
the latest committed ballot.

Fixed #6154
2020-04-19 17:12:15 +03:00
Gleb Natapov
d3d31d66d4 lwt: treated accepted ballot as a promised
PAXOS node is allowed to accept a proposal without promising it
first as long as its ballot is greater than already promised one. Treat
such accepted ballot as promised since 'learn' stage removes accepted
ballot, but we still want to remember it as the latest promised one.

The goal is to be closer to formal PAXOS specification.
2020-04-19 17:12:03 +03:00
Raphael S. Carvalho
c350b864e8 compaction: Short-circuit TWCS interposer if only a single time window is needed
If we know in advance that only a single window is needed, the TWCS interposer
can be short-circuited.

perf_sstable shows up to ~14% performance regression in compaction with interposer
enabled for a table with schema containing 10 columns.

no interposer (50k partitions)
81090.77 +- 33.82 partitions / sec (100 runs, 1 concurrent ops)

TWCS interposer (50k partitions)
71149.80 +- 26.06 partitions / sec (100 runs, 1 concurrent ops)

no interposer (100k partitions)
83791.13 +- 22.65 partitions / sec (100 runs, 1 concurrent ops)

TWCS interposer (100k partitions)
72147.81 +- 13.39 partitions / sec (100 runs, 1 concurrent ops)

command used:
./build/dev/test/perf/perf_sstable --num_columns 10 --partitions 100000 \
--iterations 100 --mode compaction --sstables 1 --testdir /home/fedora/xfs \
--smp 1 --cpuset 3-3 --poll-mode

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20200409194235.6004-3-raphaelsc@scylladb.com>
2020-04-19 17:06:05 +03:00
Raphael S. Carvalho
3edff36cd2 compaction: Fix partition estimation with TWCS interposer
Max and min windows are microsecond timestamps, which should be divided
by window size in microseconds to properly estimate window count
based on provided mutation_source_metadata.

Found this problem after properly setting mutation_source_metadata with
min and max metadata on behalf of regular compaction.

Fixes #6214.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20200409194235.6004-2-raphaelsc@scylladb.com>
2020-04-19 17:04:48 +03:00
Avi Kivity
1e2b3f7eb4 Merge "memory_footprint_test improvements" from Tomasz
"
Includes:

 - code cleanups
 - support for measuring data stores with more than one partition
 - measure sstable footprint for all supported formats
 - less verbose mode by default
"

* tag 'memory-footprint-test-improvement-v2' of github.com:tgrabiec/scylla:
  test: memory_footprint: Silence logging by default
  test: memory_footprint: Introduce --partition-count option
  test: memory_footprint: Run under a cql_test_env
  test: memory_footprint: Calculate sstable size for each format version
  sstables: Move all_sstable_versions to version.hh
2020-04-19 17:03:02 +03:00
Piotr Sarna
9c15604659 treewide: deprecate passing explicit order in schema building
In order to avoid confusion with regard to whose responsibility
it is to sort the key columns (see #5856), the interface which allows
adding columns to the builder with explicit column id
is moved to a private function. An internal with_column_ordered()
overload is maintained to be used for internal operations,
but it's encouraged to use simpler with_column() in new code.

Fixes #6235
Tests: unit(dev)
2020-04-19 16:19:17 +03:00
Botond Dénes
a4aa753f0f schema: schema(): use std::stable_sort() to sort key columns
When multiple key columns (clustering or partition) are passed to
the schema constructor, all having the same column id, the expectation
is that these columns will retain the order in which they were passed to
`schema_builder::with_column()`. Currently however this is not
guaranteed as the schema constructor sorts key columns by column id with
`std::sort()`, which doesn't guarantee that equally comparing elements
retain their order. This can be an issue for indexes, the schemas of
which are built independently on each node. If there is any room for
variance in the key column order, this can result in different
nodes having incompatible schemas for the same index.
The fix is to use `std::stable_sort()` which guarantees that the order
of equally comparing elements won't change.

This is a suspected cause of #5856, although we don't have hard proof.

Fixes: #5856
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
[avi: upgraded "Refs" to "Fixes", since we saw that std::sort() becomes
      unstable at 17 elements, and the failing schema had a
      clustering key with 23 elements]
Message-Id: <20200417121848.1456817-1-bdenes@scylladb.com>
2020-04-19 13:42:44 +03:00
Nadav Har'El
7e7c688946 docs/alternator/alternator.md: fix typos
Fix a couple of typos in the Alternator documentation.
Fixes scylladb/scylla-doc-issues#280
Fixes scylladb/scylla-doc-issues#281

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200419091900.23030-1-nyh@scylladb.com>
2020-04-19 11:19:26 +02:00
Piotr Sarna
a6cf0bfa7d table: switch to correct io_priority for streaming view updates
The io_priority parameter used when generating view updates from
streaming is used by the sstable reader, so it should use the I/O priority
for streaming *read* operations, not streaming *write* operations.

Fixes #6231
Tests: unit(dev)
2020-04-19 09:56:43 +03:00
Rafael Ávila de Espíndola
f3fd466156 dht: Use get_random_number<uint64_t> instead of int64_t in token::get_random_token
I bisected the opposite change in
9c202b52da as the cause of issue 6193. I
don't know why. Maybe get_random_number<signed_type> is buggy?

In any case, reverting to uint64_t solves the issue.

Fixes #6193

Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200418001611.440733-1-espindola@scylladb.com>
2020-04-19 09:46:06 +03:00
Alejo Sanchez
bd849764e0 utils: error injection sleep add support for manual_clock
Requested by @tgrabiec in previous patch (already merged).

Adds support for sleep using manual clock.
Add test.

NOTE: Removes system_clock support (and test) as sleep is not explicitly
      instantiated in seastar/src/core/reactor.cc

Branch URL: https://github.com/alecco/scylla/tree/error_injection_5_manual_clock

Tests: unit ({dev})

Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Message-Id: <20200417081518.868900-1-alejo.sanchez@scylladb.com>
2020-04-17 11:45:05 +02:00
Tomasz Grabiec
92771e904a test: memory_footprint: Silence logging by default 2020-04-17 11:34:13 +02:00
Tomasz Grabiec
1df63b60c3 test: memory_footprint: Introduce --partition-count option 2020-04-17 11:34:13 +02:00
Tomasz Grabiec
7c2f6dd75e test: memory_footprint: Run under a cql_test_env 2020-04-17 11:34:13 +02:00
Tomasz Grabiec
04c093cbec test: memory_footprint: Calculate sstable size for each format version 2020-04-17 11:34:12 +02:00
Tomasz Grabiec
3e74dd4df3 sstables: Move all_sstable_versions to version.hh 2020-04-17 11:34:02 +02:00
Rafael Ávila de Espíndola
3586324a61 sstables: Delete never overwritten methods
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Reviewed-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20200417012330.246071-1-espindola@scylladb.com>
2020-04-17 09:16:16 +03:00
Avi Kivity
2039b79664 commitlog: filter out files in the commitlog directory which don't have the correct prefix
Commitlog replay is given a filename prefix to filter files against, but it
ignores it. As a result we will replay anything in that directory, including
recycled segments, which is wasteful.

Fix by adding a check for the prefix.

Tests: unit (dev), manual test that regular commitlog files are not
       filtered.
Message-Id: <20200416174542.133230-1-avi@scylladb.com>
2020-04-17 08:44:32 +03:00
Kamil Braun
3d811e2f95 sstables: freeze types nested in collection types in legacy sstables
Some legacy `mc` SSTables (created in Scylla 3.0) may contain incorrect
serialization headers, which don't wrap frozen UDTs nested inside collections
with the FrozenType<...> tag. When reading such SSTable,
Scylla would detect a mismatch between the schema saved in schema
tables (which correctly wraps UDTs in the FrozenType<...> tag) and the schema
from the serialization header (which doesn't have these tags).

SSTables created in Scylla versions 3.1 and above, in particular in
Scylla versions that contain this commit, create correct serialization
headers (which wrap UDTs in the FrozenType<...> tag).

This commit does two things:
1. for all SSTables created after this commit, include a new feature
   flag, CorrectUDTsInCollections, presence of which implies that frozen
   UDTs inside collections have the FrozenType<...> tag.
2. when reading a Scylla SSTable without the feature flag, we assume that UDTs
   nested inside collections are always frozen, even if they don't have
   the tag. This assumption is safe to be made, because at the time of
   this commit, Scylla does not allow non-frozen (multi-cell) types inside
   collections or UDTs, and because of point 1 above.

There is one edge case not covered: if we don't know whether the SSTable
comes from Scylla or from C*. In that case we won't make the assumption
described in 2. Therefore, if we get a mismatch between schema and
serialization headers of a table which we couldn't confirm to come from
Scylla, we will still reject the table. If any user encounters such an
issue (unlikely), we will have to use another solution, e.g. using a
separate tool to rewrite the SSTable.

Fixes #6130.
2020-04-16 18:44:56 +03:00
Avi Kivity
141bd44982 Update seastar submodule
* seastar f846a348b...b5fb92739 (3):
  > Merge 'file utils infrastructure' from Benny
  > future: future_state: make exception constructors noexcept
  > timer: add scheduling_group awareness
Fixes #6170.
2020-04-16 15:20:50 +03:00
Nadav Har'El
606ae0744c docs, alternator: alternator.md cleanup
Clean up the alternator.md document, by:

* Updating out-of-date information that outstayed its welcome.
* When Scylla does have a feature but it's just not supported via the
  DynamoDB API (e.g., CDC and on-demand backups) mention that.
* Remove mention of Alternator being experimental and users should not
  store important data on it :-)
* Miscellaneous cleanups.

Fixes #6179.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200412094641.27186-1-nyh@scylladb.com>
2020-04-16 13:39:28 +02:00
Rafael Ávila de Espíndola
3b8e84731b configure: Make the stack usage warning more strict
All the dev and release warnings at the previous level have been fixed,
so tighten the warning a bit.

Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200413212241.365022-1-espindola@scylladb.com>
2020-04-16 09:02:22 +03:00
Vlad Zolotarov
b83e84b467 db::hints:: optimize with_file_update_mutex()
Avoid extra shared_ptr copy.

Signed-off-by: Vlad Zolotarov <vladz@scylladb.com>
Reviewed-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20200311214313.2988-1-vladz@scylladb.com>
2020-04-16 09:01:40 +03:00
Piotr Sarna
71ac6ebcc5 Merge 'prepare the view building generator to work through a compaction' from Glauber
There is no reason to read a single SSTable at a time from the staging
directory. Moving SSTables from staging directory essentially involves
scanning input SSTables and creating new SSTables (albeit in a different
directory).

We have a mechanism that does that: compactions. In a follow up patch, I
will introduce a new specialization of compaction that moves SSTables
from staging (potentially compacting them if there are plenty).

In preparation for that, some signatures have to be changed and the
view_updating_consumer has to be more compaction friendly. Meaning:
    - Operating with an sstable vector
    - taking a table reference, not a database

Because this code is a bit fragile and the reviewer set is fundamentally
different from anything compaction related, I am sending this separately

* glommer-view_build:
  staging: potentially read many SSTables at the same time
  view_build_test: make sure it works with smp > 1
2020-04-15 18:07:09 +02:00
Glauber Costa
4e6400293e staging: potentially read many SSTables at the same time
There is no reason to read a single SSTable at a time from the staging
directory. Moving SSTables from staging directory essentially involves
scanning input SSTables and creating new SSTables (albeit in a different
directory).

We have a mechanism that does that: compactions. In a follow up patch, I
will introduce a new specialization of compaction that moves SSTables
from staging (potentially compacting them if there are plenty).

In preparation for that, some signatures have to be changed and the
view_updating_consumer has to be more compaction friendly. Meaning:
- Operating with an sstable vector
- taking a table reference, not a database

Because this code is a bit fragile and the reviewer set is fundamentally
different from anything compaction related, I am sending this separately

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2020-04-15 11:26:44 -04:00
Glauber Costa
94d6b75a27 view_build_test: make sure it works with smp > 1
This test doesn't work with higher smp counts, because it relies on
dealing with keys named 'a' and 'b' and creates SSTables containing one
of them manually. This throws an exception if we happen to execute on
a shard that doesn't own the tokens corresponding to those keys.

This patch avoids that problem by pre-selecting keys that we know to
belong to the current shard in which the test is executed.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2020-04-15 10:53:32 -04:00
Konstantin Osipov
18b9bb57ac lwt: rename metrics to match accepted terminology
Rename inherited metrics cas_propose and cas_commit
to cas_accept and cas_learn respectively.

A while ago we made a decision to stick to widely accepted
terms for Paxos rounds: prepare, accept, learn. The rest
of the code is using these terms, so rename the metrics
to avoid confusion/technical debt.

While at it, rename a few internal methods and functions.

Fixes #6169

Message-Id: <20200414213537.129547-1-kostja@scylladb.com>
2020-04-15 12:20:30 +02:00
Piotr Jastrzebski
20bc93b941 cdc: Stop storing CDC options in scylla tables
Initially we were storing CDC options in scylla tables but then we realized
that we can use schema extensions. Extensions are more flexible and cause fewer
problems with schema digest.

The transition was done in 4.0 and with that we stopped reading 'cdc' column
in scylla tables. Commit 861c7b5626 removed
the code that used to read 'cdc' column.

Since no Scylla node should be reading 'cdc' column, we can always keep
it empty now. This will allow removal of schema::cdc_options in the future.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-04-15 06:56:44 +02:00
Benny Halevy
35892e4557 db::commitlog: close file if wrapping failed
When I/O error (e.g. EMFILE / ENOSPC) happens we hit
an assert in ~append_challenged_posix_file_impl(): Assertion `_closing_state == state::closed' failed.

Commit 6160b9017d added close on failure
of the lambda defined in allocate_segment_ex, but it doesn't handle an error
after the file is opened/created while it is wrapped with commitlog_file_extensions.

Refs #5657

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Reviewed-by: Calle Wilund <calle@scylladb.com>
Message-Id: <20200414115231.298632-1-bhalevy@scylladb.com>
2020-04-14 16:14:28 +03:00
Calle Wilund
a62d75fed5 commitlog_test: Ensure "when_over_disk_limit" reads segment list only once
Fixes #6195

test_commitlog_delete_when_over_disk_limit reads current segment list
in flush handler, to compare with result after allowing deletion of
segment. However, it might be called more than once in rare cases,
because timing and us using rather small sizes.

Reading the list the second time however is not a good idea, because
it might just very well be exactly the same as what we read in the
test check code, and we actually overwrite the list we want to
check against. Because callback is on timer. And test is not.

Message-Id: <20200414114322.13268-1-calle@scylladb.com>
2020-04-14 15:31:08 +03:00
Avi Kivity
40459fea0e Merge "compound-compat: composite::iterator: cover error paths with on_internal_error()" from Botond
"
This is a continuation of recent efforts to cover more and more internal
de-serialization paths with `on_internal_error()`. Errors like this
should always be investigated but this can only be done with a core.
This patch covers the error paths of `composite::iterator` with
`on_internal_error()`. As we need this patch to investigate a 4.0
blocker issue (#6121) it only does the minimal amount of changes needed
to allow generating a core for de-serialization failures of composites.
There are a few FIXMEs left in the code that I plan to address in a
follow-up.

Ref: #6121
"

* 'compound-on-internal-error/v1' of https://github.com/denesb/scylla:
  compound_compat: composite::iterator cover error-paths with on_internal_error()
  compound_compat: composite_view: add is_valid()
2020-04-14 14:06:54 +03:00
Avi Kivity
ba6653f60c Update seastar submodule
* seastar cce2ddac1...f846a348b (3):
  > rpc: always shutdown socket when stopping a client
Fixes #6060.
  > reactor: Deprecate cpu_id
  > httpd: switch main() to use seastar::async
2020-04-14 13:31:48 +03:00
Piotr Dulikowski
ff80b7c3e2 cdc: do not change frozen list type in cdc log table
For a column of type `frozen<list<T>>` in base table, a corresponding
column of type `frozen<map<timeuuid, T>>` is created in cdc log.

Although a similar change of type takes place in case of non-frozen
lists, this is unneeded in case of frozen lists - frozen collections are
atomic, therefore there is no need for complicated type that will be
able to represent a column update that depends on its previous value
(e.g. appending elements to the end of the list).

Moreover, only cdc log table creation logic performs this type change
for frozen lists. The logic of `transformer::transform`, which is
responsible for creation of mutations to cdc log, assumes that atomic
columns will have their types unchanged in cdc log table. It simply
copies new value of the column from original mutation to the cdc log
mutation. A serialized frozen list might be copied to a field that is of
frozen map type, which may cause the field to become impossible to
deserialize.

This patch causes frozen list base table columns to have a corresponding
column in cdc log with the same type.

A test is added which asserts that the type of cdc log columns is not
changed in the case of frozen base columns.

Tests: unit(dev)
Fixes #6172
2020-04-14 09:44:22 +02:00
Piotr Sarna
0638699ffd Merge 'test.py: run Alternator tests' from Nadav
We have in alternator-test a set of over 340 functional tests for
Alternator. These tests are written in Python using the pytest
framework, expect Scylla to be running and connect to it using the
DynamoDB API with the "boto3" library (the AWS SDK for Python).

We have a script alternator-test/run which does everything needed
to run all these tests: Starts Scylla with the appropriate parameters
in a temporary directory, runs all the tests against it, and makes
sure the temporary directory is removed (regardless of whether the
tests succeeded or failed).

The goal of this small patch series is to integrate these Alternator
tests into test.py in a *simple* way. The idea is that we add *one*
test which just runs the aforementioned "run" script which does its
own business.

The changes we needed to do in this series to achieve this are:

1. Make the alternator-test/run script pick a unique IP address on which
   to listen, instead of always using 127.0.0.1. This allows running
   this test in parallel with dtest tests, or even parallel to itself.

2. Move the alternator-test directory to test/alternator. This is
   the directory where test.py expects all the tests to live in.
   It also makes sense - since we already have multiple subdirectories
   in test/, to put the Alternator tests there too.

3. Add a new test suite type, "Run". A "Run" suite is simply a directory
   with a script called "run", and this script is run to run the entire
   suite, and this script does its own business.

4. Tests (such as the new "Run" ones) that can be killed gently and clean
   up after themselves, should be killed with SIGTERM instead of
   SIGKILL.

After this series, to run the Alternator tests from test.py, do:

        ./test.py --mode dev alternator

Note that in this version, the "--mode" has no effect - test/alternator/run
always runs the latest compiled Scylla, regardless of the chosen mode.
This can be fixed later.

The Alternator tests can still be run manually and individually against
a running Scylla or DynamoDB as before - just go to the test/alternator
directory and run "pytest" with the desired parameters.

Fixes #6046

* nyh/alternator-test-v3:
  alternator-test: make Alternator tests runnable from test.py
  test.py: add xunit XML output file for "Run" tests
  test.py: add new test type "Run"
  test.py: flag for aborting tests with SIGTERM, not SIGKILL
  alternator-test: change "run" script to pick random IP address
  alternator-test: add "--url" option to choose Alternator's URL
2020-04-14 07:56:37 +02:00
Kamil Braun
5a454663fd sstables: move definition of column_translation::state::build to a .cc file 2020-04-13 17:45:25 +03:00
Asias He
13a9c5eaf7 repair: Send reason for node operations
Since 956b092012 (Merge "Repair based node
operation" from Asias), repair is used by other node operations like
bootstrap, decommission and so on.

Send the reason for the repair, so that we can handle the materialized
view update correctly according to the reason of the operation. We want
to trigger the view update only if the repair is used by repair
operation. Otherwise, the view table will be handled twice, 1) when the
view table is synced using repair 2) when the base table is synced using
repair and view table update is triggered.

Fixes #5930
Fixes #5998
2020-04-13 13:47:26 +03:00
Takuya ASADA
f24c13f2d1 redis: lolwut parameter fix
Currently, lolwut with some parameters outputs a broken square,
such as "lolwut 10 1 1":

127.0.0.1:6379> lolwut 10 1 1
⠀⡤⠤⠤⠤⠤⠤⠤⠤⠤
⠀⡇⠀⠀⠀⠀⠀⠀⠀⠀
⠀⡇⠀⠀⠀⠀⠀⠀⠀⠀
⠀⡇⠀⠀⠀⠀⠀⠀⠀⠀

It's because we pass incorrect parameters to draw_schotter().
2020-04-13 10:46:45 +09:00
Takuya ASADA
b37ea9c27f redis-test: add lolwut test
Add test for lolwut command.
2020-04-13 10:46:45 +09:00
Calle Wilund
a14a28cdf4 gms::inet_address: Fix sign extension error in custom address formatting
Fixes #5808

Seems some gcc:s will generate the code as sign extending. Mine does not,
but this should be more correct anyhow.

Added small stringify test to serialization_test for inet_address
2020-04-12 17:48:44 +03:00
Avi Kivity
a4a5b77bd5 Merge 'Match Cassandra's null prohibitions' from Dejan
"
We currently allow null on the right-hand side of certain relations, while Cassandra prohibits it.  Since our handling of these null values is mostly incorrect, it's better to match Cassandra in prohibiting it.

See the discussion (https://github.com/scylladb/scylla/pull/5763#discussion_r405557323).

NB: any reverse mismatch (Scylla prohibiting something that Cassandra allows) is left remaining.  For example, we forbid null bounds on clustering columns, which Cassandra allows.

Tests: unit (dev)
"

* dekimir-match-cass-null:
  restrictions: Forbid null bound for nonkey columns
  restrictions: Forbid null equality
2020-04-12 17:44:31 +03:00
Nadav Har'El
4e2bf28b84 alternator-test: make Alternator tests runnable from test.py
To make the tests in alternator-test runnable by test.py, we need to
move the directory alternator-test/ to test/alternator, because test.py
only looks for tests in subdirectories of test/. Then, we need to create
a test/alternator/suite.yaml saying that this test directory is of type
"Run", i.e., it has a single run script "run" which runs all its tests.

The "run" script had to be slightly modified to be aware of its new
location relative to the source directory.

To run the Alternator tests from test.py, do:

	./test.py --mode dev alternator

Note that in this version, the "--mode" has no effect - test/alternator/run
always runs the latest compiled Scylla, regardless of the chosen mode.

The Alternator tests can still be run manually and individually against
a running Scylla or DynamoDB as before - just go to the test/alternator
directory (instead of alternator-test previously) and run "pytest" with
the desired parameters.

Fixes #6046

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2020-04-12 16:27:45 +03:00
Nadav Har'El
0cccb5a630 test.py: add xunit XML output file for "Run" tests
Assumes that "Run" tests can take the --junit-xml=<path> option, and
pass it to ask the test to generate an XML summary of the run to a file
like testlog/dev/xml/run.1.xunit.xml.

This option is honored by the Alternator tests.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2020-04-12 16:26:50 +03:00
Nadav Har'El
0ae3136900 test.py: add new test type "Run"
This patch adds a new test type, "Run". A test subdirectory of type "Run"
has a script called "run" which is expected to run all the tests in that
directory.

This will be used, in the next patch, by the Alternator functional tests.
These tests indeed have a "run" script, which runs Scylla and then runs
*all* of Alternator's tests, finishing fairly quickly (in less than a
minute). All of that will become one test.py test.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2020-04-12 16:26:50 +03:00
Nadav Har'El
36e44972f1 test.py: flag for aborting tests with SIGTERM, not SIGKILL
Today, if test.py is interrupted with SIGINT or SIGTERM, the ongoing test
is killed with SIGKILL. Some types of tests - such as Alternator's test -
may depend on being killed politely (e.g., with SIGTERM) to clean up
files.

We cannot yet change the signal to SIGTERM for all tests, because Seastar
tests often don't deal well with signals, but we can at least add a flag
that certain test types - that know they can be killed gently - will use.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2020-04-12 16:26:50 +03:00
Nadav Har'El
24fcc0c0ff alternator-test: change "run" script to pick random IP address
Before this patch, the Alternator tests "run" script ran Scylla on a fixed
listening address, 127.0.0.1. There is a problem that there might be other
concurrent runs of Scylla using the same IP address - e.g., CCM (used by
dtest) uses exactly this IP address for its first node.

Luckily, Linux's loopback device actually allows us to pick any of over
a million addresses in 127.0.0.0/8 to listen on - we don't need to use
127.0.0.1 specifically. So the code in this patch picks an address in
127.1.*.*, so it cannot collide with CCM (which uses 127.0.0.* for up to
255 nodes). Moreover, the last two bytes of the listen address are picked
based on the process ID of the run script; This allows multiple copies
of this script to run concurrently - in case anybody wishes to do that.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2020-04-12 16:26:31 +03:00
Nadav Har'El
1aec4baa51 alternator-test: add "--url" option to choose Alternator's URL
The "--aws" and "--local" test options choose between two useful default
URLs - Amazon's, or http://localhost:8000 for a local installation.
However, sometimes one wants to run Scylla on a different IP address or
port, so in this patch we add a "--url" option to choose a specific URL to
connect to. For example, "--url http://127.1.2.3:1234".

We will later use this option in the alternator-test/run script, to pick
a random IP address on which to run Scylla, and then run the test against
this address.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
2020-04-12 16:25:04 +03:00
Pekka Enberg
c8247aced6 Revert "api: support table auto compaction control"
This reverts commit 1c444b7e1e. The test
it adds sometimes fails as follows:

  test/boost/sstable_datafile_test.cc(1076): fatal error: in "autocompaction_control_test":
  critical check cm->get_stats().pending_tasks == 1 || cm->get_stats().active_tasks == 1 has failed

Ivan is working on a fix, but let's revert this commit to avoid blocking
next promotion failing from time to time.
2020-04-11 17:56:02 +03:00
Takuya ASADA
679fb5887a redis: add exists command
Add exists command that returns key availability.

see: https://redis.io/commands/exists
2020-04-11 12:45:54 +02:00
Israel Fruchter
e3d764bb58 dist/docker: make docker-entrypoint.py pass signals to supervisord
Stopping docker currently didn't pass the signals to supervisord,
hence scylla wasn't gracefully shut down.

Fixes #6150
2020-04-11 12:45:54 +02:00
Piotr Sarna
ea827d42b9 test: move config to heap in config_test
... in order to get rid of a large stack warning.
Tests: unit(dev)
Message-Id: <010517a6029a70de069d5952cc853f5724280eea.1586422630.git.sarna@scylladb.com>
2020-04-09 11:22:49 +02:00
Piotr Sarna
dea5bc41ff docs: add an entry about accessing Scylla system tables
A paragraph explaining how to access Scylla system tables
via alternator HTTP(S) interface is added.
2020-04-09 09:41:30 +02:00
Piotr Sarna
e4b1da4047 alternator-test: add scylla-only test for querying system tables
The first test case checks that system tables are readable via
Scan/Query requests.
The second test case checks that it's not possible to read user tables
by using the virtual interface.
The third test case checks that creating a table which looks like
an internal system table pattern (.scylla.alternator.KS_NAME.TABLE_NAME)
is not possible and returns a validation error.
2020-04-09 09:41:30 +02:00
Piotr Sarna
53bbef1e6c alternator: add a way of accessing system tables from alternator
Scylla's system tables often provide interesting information for
clients. In order to be able to access this information without CQL,
a notion of virtual tables is introduced to alternator.
When a table named .scylla.alternator.KS_NAME.TABLE_NAME is accessed
with read-only operation - Query or Scan, Scylla's internal
KS_NAME.TABLE_NAME table will be queried instead. For instance,
if a user wants to read about system_auth.roles, the Scan request
should target the following table: ".scylla.alternator.system_auth.roles".

Fixes #6122
2020-04-09 09:41:30 +02:00
Piotr Sarna
09d09ddefb alternator: add fetching static columns if they exist
Until now, the list of static column ids was always empty for alternator
tables anyway, so the list wasn't fetched. However, with the virtual
interface of fetching Scylla internal tables, we need to list the ids
of selected static columns explicitly to avoid segfaults - since we
select the whole row, static columns included.
2020-04-09 09:41:30 +02:00
Piotr Sarna
df02fc6b06 alternator: add fallback serialization for all types
While most types (e.g. boolean) are not valid key types for alternator users,
system tables derived from Scylla may still use this type for keys,
e.g. system_auth.roles. Note that types which are not directly
supported by alternator (e.g. double) will not be representable
out-of-the-box - instead, they simply fall back to string, which is both
human-readable and supported by alternator.
2020-04-09 09:41:30 +02:00
Dejan Mircevski
1ab04ac861 restrictions: Forbid null bound for nonkey columns
Cassandra prohibits null bounds for non-key columns.  Match that
prohibition.

Signed-off-by: Dejan Mircevski <dejan@scylladb.com>
2020-04-08 16:35:47 -04:00
Ivan Prisyazhnyy
1c444b7e1e api: support table auto compaction control
This patch adds API endpoint /column_family/autocompaction/{name}
that listen to GET and POST requests to pick and control table
background compactions.

To implement that the patch introduces "_compaction_disabled_by_user"
flag that affects if CompactionManager is allowed to push background
compactions jobs into the work.

It introduces

    table::enable_auto_compaction();
    table::disable_auto_compaction();
    bool table::is_auto_compaction_disabled_by_user() const

to control auto compaction state.

Fixes #1488
Fixes #1808
Fixes #440
Tests: unit(sstable_datafile_test autocompaction_control_test), manual
2020-04-08 21:18:38 +03:00
Dejan Mircevski
4f262e31d2 restrictions: Forbid null equality
Cassandra prohibits `=null` for both column values and map values.
Match that prohibition.

Signed-off-by: Dejan Mircevski <dejan@scylladb.com>
2020-04-08 13:57:49 -04:00
Botond Dénes
aa9a582f4a cql3: functions/castas_fcts: allow self-casting any type
Casting a type to itself doesn't make sense, but it is harmless so allow
it instead of reporting a confusing error message that makes even less
sense:

    InvalidRequest: Error from server: code=2200 [Invalid query]
    message="org.apache.cassandra.db.marshal.BooleanType cannot be cast
    to org.apache.cassandra.db.marshal.BooleanType"

Note that some types already supported self-casting, this patch just
extends this to all types in a forward compatible way.

Fixes: #5102

Tests: unit(dev), manual test casting boolean to boolean.
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200408135041.854981-1-bdenes@scylladb.com>
2020-04-08 18:52:36 +03:00
Piotr Sarna
123edfc10c alternator: fix failure on incorrect table name with no indexes
If a table name is not found, it may still exist as a local index,
but the check tried to fetch a local index name regardless if it was
present in the request, which was a nullptr dereference bug.

Fixes #6161
Tests: alternator-test(local, remote)
Message-Id: <428c21e94f6c9e450b1766943677613bd46cbc68.1586347130.git.sarna@scylladb.com>
2020-04-08 15:33:48 +03:00
Botond Dénes
196dd5fa9b treewide: throw std::bad_function_call with backtraces
We typically use `std::bad_function_call` to throw from
mandatory-to-implement virtual functions, that cannot have a meaningful
implementation in the derived class. The problem with
`std::bad_function_call` is that it carries absolutely no information
w.r.t. where was it thrown from.

I originally wanted to replace `std::bad_function_call` in our codebase
with a custom exception type that would allow passing in the name of the
function it is thrown from to be included in the exception message.
However after I ended up also including a backtrace, Benny Halevy
pointed out that I might as well just throw `std::bad_function_call` with
a backtrace instead. So this is what this patch does.

All users are various unimplemented methods of the
`flat_mutation_reader::impl` interface.

Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200408075801.701416-1-bdenes@scylladb.com>
2020-04-08 13:54:06 +02:00
Avi Kivity
a490cb669b Update seastar submodule
* seastar fd9af3a26...cce2ddac1 (6):
  > rpc: fix build failures in C++14 mode due to std::string_view
  > util/backtrace: introduce make_backtraced_exception_ptr()
  > future: make do_for_each noexcept
  > fair_queue rename the fair_queue_descriptor and change its default init
  > future: do_with: make noexcept
  > io_queue: batch communication with the fair_queue for ready requests
2020-04-08 13:54:06 +02:00
Botond Dénes
f0530c7d41 configure.py: add {mode}-test, {mode}-check, test and check targets
The test target builds all tests and runs them. The check target
compiles all the headers in addition to this. The {mode} variants do
these just for the respective mode.

Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200407132641.598412-1-bdenes@scylladb.com>
Reviewed-by: Pekka Enberg <penberg@scylladb.com>
2020-04-08 13:54:06 +02:00
Calle Wilund
65a6ebbd73 cdc: Postimage must check iff we have (pre-)image row data for non-touched columns
Fixes #6143

When doing post-image generation, we also write values for columns not
in delta (actual update), based on data selected in pre-image row.

However, if we are doing initial update/insert with only a subset of
columns, when the pre-image result set is nil, this cannot be done.

Adds check to non-touched column post-image code. Also uses the
pre-image value extractor to handle non-atomic sets properly.

Tests updated.
2020-04-08 13:48:54 +02:00
Tomasz Grabiec
55240e9db2 Merge "Fix open-ended tombstone issues in alternator" from Piotr Sarna
This miniseries provides workarounds for open-ended range tombstones
reportedly appearing in alternator tables. The issue was that
row tombstones created for tables without clustering keys look
like open-ended range tombstones, which confuses the LA/KA format
writer.

Tests: alternator-test(local)

Fixes #6035
Refs #6157
2020-04-08 13:43:40 +02:00
Pavel Solodovnikov
3206c1bf66 paxos_state: introduce error injections for testing timeouts in paxos stages
The following sleep injections are added to paxos_state:
 * paxos_state_prepare_timeout (timeouts in paxos_state::prepare)
 * paxos_state_accept_timeout (timeouts in paxos_state::accept)
 * paxos_state_learn_timeout (timeouts in paxos_state::learn)

Tests: unit ({dev}), unit ({debug})

Signed-off-by: Pavel Solodovnikov <pa.solodovnikov@scylladb.com>
Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Message-Id: <20200403092107.181057-1-alejo.sanchez@scylladb.com>
2020-04-08 10:47:15 +02:00
Piotr Sarna
a4da07f8b3 alternator-test: mark identical gsi test as skipped
Creating an index on a table with only the partition key
can lead to open-ended range tombstones appearing,
if the indexed column is also the very same partition key -
which is quite a useless case, but it's allowed both by alternator
and DynamoDB. In order to make the tests pass when KA/LA sstables
are used, this test case is hereby skipped until further notice.

Refs #6157
2020-04-08 08:11:39 +02:00
Piotr Sarna
0a2d7addc0 alternator: use partition tombstone if there's no clustering key
As @tgrabiec helpfully pointed out, creating a row tombstone
for a table which does not have a clustering key in its schema
creates something that looks like an open-ended range tombstone.
That's problematic for KA/LA sstable formats, which are incapable
of writing such tombstones, so a workaround is provided
in order to allow using KA/LA in alternator.

Fixes #6035
2020-04-08 08:08:45 +02:00
Glauber Costa
54a0928a85 systemd: disable start timeout
I am about to change resharding to block the start of the node. Being a
somewhat slow operation, the timeout of 900 sec is guaranteed to trigger
in large nodes with lots of data.

This patch effectively disables the start timeout, while keeping the
stop timeout unchanged.

My preference would have been to use a timeout extension mechanism
during resharding. Systemd actually has such mechanism, where we can
send a message through sd_notify asking the timeout to be extended.
However such mechanism is not present in SystemD v219, used by RHEL7.
That means for RHEL7 we need a different way to deal with the timeout
anyway.

The second preference is also obviously to write "infinity" as the
timeout value. But guess what? SystemD v219 also has a bug in which
infinity is interpreted as zero
(https://bugzilla.redhat.com/show_bug.cgi?id=1446015)

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20200407155754.10020-1-glauber@scylladb.com>
2020-04-08 08:14:35 +03:00
Botond Dénes
e17d8af3c6 compound_compat: composite::iterator cover error-paths with on_internal_error()
But only non-validation error paths. When validating we do expect it to
maybe fail, so we don't want to generate cores for validation.
Validation is in fact a de-serialization pass with some additional
checks. To be able to keep reusing the same code for de-serialization
and validation just with different error handling, introduce a
`strict_mode` flag that can be passed to `composite::iterator`
constructor. When in strict mode (the default) the iterator will convert
any `marshal_exception` thrown during the de-serialization to
`on_internal_error()`.

We don't want anybody to use the iterator in non-strict mode, besides
validation, so the iterator constructors are made private. This is
standard practice for iterators anyway.
2020-04-07 13:18:03 +03:00
Botond Dénes
16246d1c99 frozen_schema: make freezing constructor explicit
Freezing is an expensive operation, that involves serializing the entire
mutation. Having an implicit freezing constructor means this can happen
as part of an implicit type conversion without the programmer even
noticing, even when this is not really necessary.

Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200407080245.234021-1-bdenes@scylladb.com>
2020-04-07 12:00:36 +03:00
Botond Dénes
e0e9b6d9b0 compound_compat: composite_view: add is_valid()
Until now this was open-coded in `sstables::validate_min_max_metadata()`.
We want to cover non-validation compound de-serialization error-paths
with `on_internal_error()` and so we need more control over how
compounds are validated. As a first step we want to centralize
validation in the class itself as in the next patches they will use
private APIs to bypass `on_internal_error()` in the error paths during
validation.
2020-04-07 11:45:45 +03:00
Benny Halevy
89b3974e56 sstables: print invalid boundary type as unsigned int
Otherwise it prints a binary value to the log, corrupting it.
Seen when testing scrub with randomly-corrupted sstable
using scrub_with_one_node_expect_data_loss_test
as of https://github.com/scylladb/scylla-dtest/pull/1414

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20200407055617.1045977-1-bhalevy@scylladb.com>
2020-04-07 10:18:19 +02:00
Benny Halevy
a20c85713b storage_proxy: paxos_response_handler::prune: fixup indentation
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20200405115046.733450-2-bhalevy@scylladb.com>
2020-04-07 08:47:38 +03:00
Benny Halevy
4e37aee8a2 storage_proxy: paxos_response_handler::prune: no need for futurize_apply
parallel_for_each already futurize_invoke's the lambda passed to it
since seastar commit c5e158e5f173e25a62308997a3da4348053b2a0f

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Message-Id: <20200405115046.733450-1-bhalevy@scylladb.com>
2020-04-07 08:47:38 +03:00
Raphael S. Carvalho
044f80b1b5 cql3: don't reset default TTL when not explicitly specified in alter table statement
Any alter table statement that doesn't explicitly set the default time
to live will reset it to 0.

That can be very dangerous for time series use cases, which rely on
all data being eventually expired, and a default TTL of 0 means
data never being expired.

Fixes #5048.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20200402211653.25603-1-raphaelsc@scylladb.com>
2020-04-07 08:47:38 +03:00
Avi Kivity
0bc90756db tools: toolchain: add note explaining how to use podman to build images
podman is compatible with docker, but by default emits a manifest
format that is not understood by old docker clients, so give it
an extra flag to generate the old format instead.

Message-Id: <20200406134526.21521-1-avi@scylladb.com>
2020-04-07 08:47:38 +03:00
Glauber Costa
80f414ed6e sstables: restore ident
Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20200401162722.28780-3-glauber@scylladb.com>
2020-04-06 16:02:31 +03:00
Glauber Costa
463d0ab37c compaction: move rewrite_sstables to the compaction_manager
There is no reason why the table code has to be aware of the efforts of
rewriting (cleanup, scrub, upgrade) an SSTable versus compacting it.

Rewrite is special, because we need to do it one SSTable at a time,
without lumping it together. However, the compaction manager is totally
capable of doing that itself. If we do that, the special
"table::rewrite_sstables" can be killed.

This code would maybe be better off as a thread, where we wouldn't need
to keep state. However there are some methods like maybe_stop_on_error()
that expect a future so I am leaving this be for now. This is a cleanup
that can be done later.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20200401162722.28780-2-glauber@scylladb.com>
2020-04-06 16:02:30 +03:00
Nadav Har'El
ac43a9e2aa merge: Fix generating base keys from empty indexing paging state
Merged pull request https://github.com/scylladb/scylla/pull/6136 from
Piotr Sarna:

An empty partition/clustering key pair is a valid state of the
query paging state. Unfortunately, recent attempts at debugging
a flaky test (#5856) resulted in introducing an assertion (7616290)
which breaks when trying to generate a key from such a pair.
In order to keep the assertion (since it still makes sense in its
scope), but at the same time translate empty keys properly,
empty keys are now explicitly processed at the beginning of the
function.
This behaviour was 100% reproducible in a secondary index dtest below.

Fixes #6134
Refs #5856
Tests: unit(dev),
dtest(TestSecondaryIndexes.test_truncate_base)
2020-04-06 15:23:39 +03:00
Takuya ASADA
3ce6cdc6d8 install.sh: support --upgrade
To use install.sh as Scylla install script w/o using .rpm/.deb package,
we need to provide a way to upgrade Scylla version, not just install.

With --upgrade option, install.sh does not overwrite config files.
It will install a <filename>.new file in the same directory when the old config file and
the new config file do not contain the same data.
If the old one and the new one are exactly the same, it will do nothing.

To implement this, rewriting api_ui_dir/api_doc_dir path on scylla.yaml
moved from .rpm/.deb scriptlet to install.sh.

Fixes #5874
2020-04-06 15:07:28 +03:00
Takuya ASADA
5f18964763 dist/common/scripts/scylla_coredump_setup: bind-mount coredump directory, add coredump test
On some environments systemd-coredump does not work with a symlinked directory;
we can use a bind-mount instead.
Also, it's better to check that systemd-coredump is working by generating a coredump.

To fix #5916, drop scylla_coredump_setup from .rpm %post scriptlet.

Fixes #5753
Fixes #5916
2020-04-06 15:03:11 +03:00
Avi Kivity
e9e2b75a76 Merge "Allow Major compactions for TWCS" from Glauber
"
This patch makes major compaction aware of time buckets
for TWCS. That means that calling a major compaction with TWCS
will not bundle all SSTables together, but rather split them
based on their timestamps.

There are two motivations for this work:

Telling users not to ever major compact is easier said than
done: in practice due to a variety of circumstances it might
end up being done in which case data will have a hard time
expiring later.

We are about to start working with offstrategy compactions,
which are compactions that work in parallel with the main
compactions. In those cases we may be converting SSTables from
one format to another and it might be necessary to split a single
big STCS SSTable into something that TWCS expects

In order to achieve that, we start by changing the way resharding works:
it will now work with a read interposer, similar to the one TWCS uses for
streaming data. Once we do that, a lot of assumptions that exist in the
compaction code can be simplified and supporting TWCS major
compactions becomes a matter of simply enabling its interposer in the
compaction code as well.

There are many further simplifications that this work exposes:

The compaction method create_new_sstable seems out of place. It is not
used by resharding, and it seems duplicated for normal compactions. We
could clean it up with more refactoring in a later patch.
The whole logic of the feed_writer could be part of the consumer code.
Testing details:

scylla unit tests (dev, release)
sstable_datafile_test (debug)
dtests (resharding_test.py)
manual scylla resharding

Fixes #1431
"

Reviewed-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

* 'twcs-major-v3' of github.com:glommer/scylla:
  compaction: make major compaction time-aware with TWCS
  compaction: do resharding through an interposer
  mutation_writer: introduce shard_based splitting writer
  mutation_writer: factor out part of the code for the timestamp splitter
  compaction: abort if create_new_sstable is called from resharding
2020-04-06 12:54:08 +03:00
Gleb Natapov
e5f7ccc4c8 lwt: fix possible leak of "prune" counter
If get_schema_for_read() fails, the "prune" counter will not be decremented.
The patch fixes it by creating the RAII object earlier. Also restore the releasing
of a mutation in release_mutation(), which was dropped by mistake.

Fixes #6124

Message-Id: <20200405080233.GA22509@scylladb.com>
2020-04-06 11:30:38 +02:00
Nadav Har'El
d9d50362af alternator: remove mentions of experimental status of LWT
Since commit 9948f548a5, the LWT no longer
requires an "experimental" flag, so Alternator documents and scripts
which referred to the need for enabling experimental LWT, are fixed here
to no longer do that.

Fixes #6118.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200405143237.12693-1-nyh@scylladb.com>
2020-04-06 12:12:08 +03:00
Piotr Sarna
8fea5075f2 test: fix manual gossip test
When trying to get rid of a large stack warning for gossip test,
I found out that it actually does not run at all for multiple reasons:
1. It segfaults due to wrong initialization order
2. After fixing that, it segfaults on use-after-free (due to capturing
   a shared pointer by reference instead of by copy)
3. After that, cleanups are in order:
    * seastar thread does not need to be spawned inside another thread;
    * default captures are harmful, so they're made explicit instead;
    * db::config is moved to heap, to finally get rid of the warning.

Tests: manual(gossip)
Message-Id: <feaca415d0d29a16c541f9987645365310663630.1585128338.git.sarna@scylladb.com>
2020-04-06 11:07:10 +02:00
Piotr Sarna
88913e9d44 test: add cases for empty paging state for index queries
In order to check regressions related to #6136 and similar issues,
test cases for handling paging state with empty partition/clustering
key pair are added.
2020-04-06 08:59:40 +02:00
Piotr Sarna
45751ee24f cql3: fix generating base keys from empty index paging state
An empty partition/clustering key pair is a valid state of the
query paging state. Unfortunately, recent attempts at debugging
a flaky test resulted in introducing an assertion which breaks
when trying to generate a key from such a pair.
In order to keep the assertion (since it still makes sense in its
scope), but at the same time translate empty keys properly,
empty keys are now explicitly processed at the beginning of the
function.
This behaviour was 100% reproducible in a secondary index dtest below.

Fixes #6134
Refs #5856
Tests: unit(dev),
       dtest(TestSecondaryIndexes.test_truncate_base)
2020-04-06 07:49:06 +02:00
Avi Kivity
4e6f543676 tools: toolchain: use "docker build --pull" in instructions for building an image
Specify --pull in order to refresh the base image (some Fedora release).
Usually this is not important, because we run `dnf update`. But if the
cached image happens to be a pre-release version of Fedora, the image
will have the update-testing repository enabled, and we may get some
unwanted updates.

It's sad that we need two separate flags for correctness (the other
is --no-cache).
Message-Id: <20200405164227.8210-1-avi@scylladb.com>
2020-04-05 19:48:25 +03:00
Piotr Sarna
0bb211a65f alternator: defuse a serialization path time bomb
The default serialization path for items was subtly broken -
instead of parsing JSON string representation of objects,
it tried to parse a regular string implementation - which is often
also a valid JSON, but nothing guarantees that it actually is.

Tests: alternator-test(local)

Message-Id: <e1668bf4e9029f2675a4ac28bb4598714575efeb.1586096732.git.sarna@scylladb.com>
2020-04-05 18:55:54 +03:00
Nadav Har'El
c1a7a071ea merge: Remove most inclusions of reactor.hh
Merged patch series from Avi Kivity:

This patchset removes most inclusions of reactor.hh, by switching
to new namespace-scoped API:s instead of those using engine()
as a way to get the reactor. With this, we are down to 12 translation
units depending on reactor.hh, mostly for deprecated API:s like
reactor::at_exit().

Avi Kivity (3):
  logalloc: use namespace-scope seastar::idle_cpu_handler and related
    rather than reactor scope
  test: sstable-utils: deinline do_make_keys()
  treewide: replace calls to engine().some_api() with some_api()

 configure.py                                  | 14 +++-----
 auth/common.hh                                |  3 +-
 checked-file-impl.hh                          |  4 +--
 db/system_keyspace_view_types.hh              |  2 +-
 flat_mutation_reader.hh                       |  1 +
 lister.hh                                     |  2 +-
 message/messaging_service.hh                  |  2 +-
 redis/server.hh                               |  2 +-
 sstables/compress.hh                          |  2 +-
 sstables/integrity_checked_file_impl.hh       |  2 +-
 test/lib/sstable_utils.hh                     | 35 ++++---------------
 test/lib/test_services.hh                     |  2 +-
 thrift/server.hh                              |  2 +-
 transport/server.hh                           |  2 +-
 utils/error_injection.hh                      |  3 +-
 utils/joinpoint.hh                            |  2 +-
 utils/loading_cache.hh                        |  2 +-
 utils/logalloc.hh                             |  6 ++--
 utils/rate_limiter.hh                         |  2 +-
 api/system.cc                                 |  1 +
 auth/default_authorizer.cc                    |  2 +-
 auth/password_authenticator.cc                |  2 +-
 database.cc                                   |  1 +
 db/commitlog/commitlog.cc                     |  4 +--
 db/hints/resource_manager.cc                  |  3 +-
 db/system_distributed_keyspace.cc             |  2 +-
 dht/i_partitioner.cc                          |  2 +-
 gms/feature_service.cc                        |  3 +-
 lister.cc                                     |  4 +--
 locator/ec2_snitch.cc                         |  3 +-
 locator/gce_snitch.cc                         |  1 +
 main.cc                                       |  1 +
 reader_concurrency_semaphore.cc               |  2 +-
 redis/server.cc                               |  4 +--
 sstables/sstables.cc                          | 11 +++---
 table.cc                                      |  3 +-
 test/boost/commitlog_test.cc                  |  2 +-
 test/boost/database_test.cc                   |  2 +-
 test/boost/flush_queue_test.cc                |  2 +-
 test/boost/gossip_test.cc                     |  2 +-
 .../gossiping_property_file_snitch_test.cc    |  1 +
 test/boost/loading_cache_test.cc              |  2 +-
 test/boost/sstable_3_x_test.cc                |  1 +
 test/boost/sstable_datafile_test.cc           |  1 +
 test/boost/sstable_test.cc                    |  1 +
 test/lib/sstable_utils.cc                     | 26 ++++++++++++++
 test/manual/gossip.cc                         |  2 +-
 test/manual/hint_test.cc                      |  2 +-
 test/manual/sstable_scan_footprint_test.cc    |  2 +-
 test/perf/perf_mutation.cc                    |  1 +
 test/perf/perf_row_cache_update.cc            |  1 +
 test/perf/perf_sstable.cc                     |  1 +
 test/tools/cql_repl.cc                        |  2 +-
 thrift/server.cc                              |  2 +-
 transport/server.cc                           |  4 +--
 utils/config_file.cc                          |  3 +-
 utils/file_lock.cc                            |  2 +-
 utils/logalloc.cc                             | 14 ++++----
 utils/updateable_value.cc                     |  2 +-
 59 files changed, 119 insertions(+), 98 deletions(-)
2020-04-05 13:47:39 +03:00
Nadav Har'El
dcfdd917e1 merge: Guard against potential races in view builder
Merge patch series from Piotr Sarna:

This series adds extra precautions against potential races
in view building. In particular, it was based on the following scenario:

1. View builder detects that a view V is no longer here, so it schedules
   removing its info from bookkeeping, without any semaphores,
   and this continuation gets preempted immediately.
2. A view is deleted and recreated with the same name - V.
3. View V building is finished.
4. The continuation from (1.) is finally executed, and it removes old view V
   info from bookkeeping - which is a problem, since view building
   bookkeeping is based on *names*, not *uuids* - consequently,
   the new view bookkeeping info is erroneously removed.

The issue is solved by putting startup code (which also does cleanup
from point (1.)) under the same semaphore as other bookkeeping
operations. With that, it will be impossible to execute step (2.)
before (1.) ends, which effectively prevents the race.

Refs #6094 (possibly fixes it too, but since I could not reproduce
            the issue...)

Tests: unit(dev)

Piotr Sarna (4):
  db,view: fix waiting for a view building future
  db,view: remove unneeded implicit capture-by-reference
  db,view: nitpick: change & operator to && for booleans
  db,view: guard view builder startup with a semaphore

 db/view/view.cc | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)
2020-04-05 13:19:23 +03:00
Avi Kivity
88ade3110f treewide: replace calls to engine().some_api() with some_api()
This removes the need to include reactor.hh, a source of compile
time bloat.

In some places, the call is qualified with seastar:: in order
to resolve ambiguities with a local name.

Includes are adjusted to make everything compile. We end up
having 14 translation units including reactor.hh, primarily for
deprecated things like reactor::at_exit().

Ref #1
2020-04-05 12:46:04 +03:00
Avi Kivity
5e32ecb514 test: sstable-utils: deinline do_make_keys()
This hides a call to engine_is_ready() which is only available in
reactor.hh.

Dependencies are adjusted so tests link.

Ref #1.
2020-04-05 12:46:04 +03:00
Avi Kivity
1799cfa88a logalloc: use namespace-scope seastar::idle_cpu_handler and related rather than reactor scope
This allows us to drop a #include <reactor.hh>, reducing compile time.

Several translation units that lost access to required declarations
are updated with the required includes (this can be an include of
reactor.hh itself, in case the translation unit that lost it got it
indirectly via logalloc.hh)

Ref #1.
2020-04-05 12:45:08 +03:00
Piotr Sarna
1a9083b342 db,view: guard view builder startup with a semaphore
The startup routine performs some bookkeeping operations on views,
and so do these events:
 - on_create_view;
 - on_drop_view;
 - on_update_view.
Since the above events are guarded with a semaphore, the startup
routine should also take the same semaphore - in order to ensure
that all bookkeeping operations are serialized.

Refs #6094
2020-04-05 11:41:26 +02:00
Piotr Sarna
8da4a5b78c db,view: nitpick: change & operator to && for booleans
Although it's technically correct to use the bitwise and operator
on booleans as well, it's slightly confusing for the reader.
2020-04-05 11:41:25 +02:00
Piotr Sarna
e49805b7b8 db,view: remove unneeded implicit capture-by-reference
The lambda does not use any other captures, so it does not need to
implicitly capture anything by reference.
2020-04-05 11:41:25 +02:00
Piotr Sarna
3f19865493 db,view: fix waiting for a view building future
The future was marked with a `FIXME: discarded future`, but there's really
no reason not to wait for it, and it was probably meant to be waited for
since its implementation.
2020-04-05 11:41:25 +02:00
Piotr Sarna
76969ea619 test: move config to heap in gossip_test
... in order to get rid of a large stack warning.
Tests: unit(dev)

Message-Id: <da4349b89554265ec419544b63ce084eab25ac0f.1586068467.git.sarna@scylladb.com>
2020-04-05 10:18:14 +03:00
Rafael Ávila de Espíndola
c59a307f17 table_helper: Use CanInvoke instead of CanApply
The CanApply predicate is deprecated.

Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200403225907.7910-1-espindola@scylladb.com>
2020-04-05 08:36:29 +02:00
Tomasz Grabiec
df48b5ec9d gossip: Fix a confusing parameter name
Message-Id: <1585940635-1194-1-git-send-email-tgrabiec@scylladb.com>
2020-04-05 08:24:51 +02:00
Piotr Jastrzebski
a15b32c9d9 token: relax the condition of the sanity check
When we switched token representation to int64_t
we added some sanity checks that byte representation
is always 8 bytes long.

It turns out that for token_kind::before_all_keys and
token_kind::after_all_keys bytes can sometimes be empty
because for those tokens they are just ignored. The check
introduced with the change is too strict and sometimes
throws the exception for tokens before/after all keys
created with empty bytes.

This patch relaxes the condition of the check and always
uses 0 as value of _data for special before/after all keys
tokens.

Fixes #6131

Tests: unit(dev, sct)

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-04-04 15:50:10 +03:00
Rafael Ávila de Espíndola
4db4237310 configure: Delete dead options
These options are not used anywhere.

Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200403173458.119939-1-espindola@scylladb.com>
2020-04-04 14:52:24 +03:00
Rafael Ávila de Espíndola
a10bdb17b3 user_function_test: Test UDF without the corresponding experimental flag
The existing test was not using the db::config it was creating. Use it
and test the produced exception.

Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200403170235.113558-2-espindola@scylladb.com>
2020-04-03 20:00:24 +02:00
Rafael Ávila de Espíndola
3f3634ece1 test: Use feature_config_from_db_config to setup feature_config
This reduces code duplication and uses the same code path that is used
in scylla itself.

Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200403170235.113558-1-espindola@scylladb.com>
2020-04-03 19:59:00 +02:00
Tomasz Grabiec
4578031bd6 Update seastar submodule
* seastar 41c83ec...fd9af3a (7):
  > stall_detector: Delete unused member variable
  > future: Avoid a move in finally_body
  > Merge "Followup cleanups for the apply/invoke split" from Rafael
  > Merge "make trivial future related functions noexcept" from Benny
  > rpc_test: silence depreceted lambda logger warning
  > rpc_demo: stop using variadic futures
  > future: Move two static_asserts to the top
2020-04-03 19:48:00 +02:00
Botond Dénes
9e1d6ada0f types: compare(): cover more paths with on_internal_error()
Currently we call `on_internal_error()` if `tri_compare()` throws
`marshal_exception`. Some compare paths however might go around
`tri_compare()` and call `abstract_type::compare()` directly. Move the
check there to cover these cases too.

Tests: dev
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200403162530.1175801-1-bdenes@scylladb.com>
2020-04-03 18:35:30 +02:00
Rafael Ávila de Espíndola
8d0e40e37b service: Replace engine().cpu_id() with this_shard_id()
Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200403160915.59481-1-espindola@scylladb.com>
2020-04-03 18:18:25 +02:00
Rafael Ávila de Espíndola
891f3f44ee tombstone: Move can_gc_fn to a .cc
This reduces the total size reported by

$ find . -name *.hh.o | xargs du -bc

by 1.3%, from 49911928 to 49249680 bytes.

Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200403153241.34400-1-espindola@scylladb.com>
2020-04-03 18:17:31 +02:00
Glauber Costa
098b215b0d compaction: make major compaction time-aware with TWCS
This patch makes major compaction aware of time buckets
for TWCS. That means that calling a major compaction with TWCS
will not bundle all SSTables together, but rather split them
based on their timestamps.

There are two motivations for this work:
1. Telling users not to ever major compact is easier said than
   done: in practice due to a variety of circumstances it might
   end up being done in which case data will have a hard time
   expiring later.

2. We are about to start working with offstrategy compactions,
   which are compactions that work in parallel with the main
   compactions. In those cases we may be converting SSTables from
   one format to another and it might be necessary to split a single
   big STCS SSTable into something that TWCS expects

With the motivation out of the way, let's talk about the implementation:
The implementation is quite simple and builds upon the previous patches.
It simply specializes the interposer implementation for regular compaction
with a table-specific interposer.

Fixes #1431

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2020-04-03 10:10:10 -04:00
Glauber Costa
55a8b6e3c9 compaction: do resharding through an interposer
Our resharding code is complex, since the compaction object has to keep
track of many output SSTables, the current shard being written.

When implementing TWCS streaming writers, we ran away from such
write-side complexity by implementing an interposer: the interposer
consumes the flat_mutation_reader stream, creating many different writer
streams. We can do a similar thing for resharding SSTables and have each
writer be guaranteed to contain keys for only a specific source shard.

As we do that, we can move the SSTable and sstable_writer information
to the compacting_sstable_writer object. The compaction object will no
longer be responsible for it and can be simplified, paving the way for
TWCS-major, which will go through an interposer as well.

Note that the compaction_writer, which now holds both the SSTable
pointer and the sstable_writer still needs to be optional. This is
because LCS (and potentially others) still want to create more than
one SSTable per source stream. That is done to guarantee that each
SSTable complies with the max_sstable_size parameter, which is
information available in the sstable_writer that is not present at
the level of the flat_mutation_reader. We want to keep it in the writer
side.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2020-04-03 10:10:10 -04:00
Pavel Emelyanov
86296ba557 main: Do not destroy token_metadata
The storage_proxy instances hold references to token_metadata ones and
leave unwaited futures continuing to its query_partition_key_range_concurrent
method.

The latter is called from do_query so it's not that easy to find
out who is leaking. Keep the tokens not freed for a while.

Fixes: #6093
Test: manual start-stop

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
Message-Id: <20200402183538.9674-1-xemul@scylladb.com>
2020-04-03 16:00:08 +02:00
Rafael Ávila de Espíndola
8da235e440 everywhere: Use futurize_invoke instead of futurize<T>::invoke
No functionality change, just simpler.

Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200330165308.52383-1-espindola@scylladb.com>
2020-04-03 15:53:35 +02:00
Gleb Natapov
36a24bbb70 storage_proxy: limit read repair only to replicas that answered during speculative reads
Speculative reader has more targets than needed for CL. In case there is
a digest mismatch the repair runs between all of them, but that violates
provided CL. The patch makes it so that repair runs only between
replicas that answered (there will be CL of them).

Fixes #6123

Reviewed-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20200402132245.GA21956@scylladb.com>
2020-04-02 17:32:08 +03:00
Avi Kivity
a6156a9caf build: make headers check compatible with distcc
distcc doesn't like the -x c++ flag, so create an empty.cc file for
this purpose and compile it.

Also drop the "=" from "--include=", which is also disliked by
distcc.
Message-Id: <20200402124312.48963-1-avi@scylladb.com>
2020-04-02 16:39:30 +03:00
Glauber Costa
8fe10863f4 mutation_writer: introduce shard_based splitting writer
This is similar to the timestamp based splitting writer, except
that it splits data based on the shard where the partition key
is supposed to be placed.

It is similar to the multishard_writer, in the sense that it
creates n streams for n shards, but it does not want to process
the streams in the owner shards. We want to use that in processes
like resharding where it is fine for a foreign shard to deal
with a mutation.

One option would be to augment the multishard_writer to optionally
achieve these properties, but having a separate splitter is both
simpler and faster.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2020-04-02 08:55:16 -04:00
Glauber Costa
a258f111c7 mutation_writer: factor out part of the code for the timestamp splitter
I am about to introduce a new splitter. Therefore, move parts of it
that are common to its own file.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2020-04-02 08:55:16 -04:00
Glauber Costa
a2d7a9c230 compaction: abort if create_new_sstable is called from resharding
I am about to get rid of the _shard attribute in the compaction object,
as I will create different streams of writers for different shards.

In preparation for that, remove the arbitrary _shard reference. Raphael
confirms that resharding should never be calling this, as this method is
used exclusively for garbage collection component of run-based
compaction. Therefore we'll just throw in this case and remove the shard
reference.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2020-04-02 08:55:16 -04:00
Glauber Costa
375cb8a32b compaction: pass current shard to sstable creation function
The shard parameter is ignored for SSTable creation on regular
compaction. It is still good practice and good future proofing
to pass something meaningful here instead of zero. This patch
passes the id of the current shard.

Thanks Botond for pointing that out.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20200402122212.12218-1-glauber@scylladb.com>
2020-04-02 14:43:35 +02:00
Botond Dénes
240b5e0594 frozen_schema: key() remove unused schema parameter
Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200402092249.680210-1-bdenes@scylladb.com>
2020-04-02 14:43:35 +02:00
Pekka Enberg
75b55cea88 Merge "Resharding through compact sstables" from Glauber
"
This patchseries is part of my effort to make resharding less special -
and hopefully less problematic.  The next steps are a bit heavy, so I'd
like to, if possible, get this out of the way.

After these two patches, there is no more need to ever call
reshard_sstables: compact_sstables will do, and it will be able to
recognize resharding compactions.

To do that we need to unify the creator function, which is trivially
done by adding a shard parameter to regular compactions as well: they
can just ignore it. I have considered just making the
compaction_descriptor have a virtual create() function and specializing
it, but because we have to store the creator in the compaction object I
decided to keep the virtual function for now.

In a later cleanup step, if we can for instance store the entire
compaction_descriptor object in the compaction object we could do that.

Reviewed-by: Benny Halevy <bhalevy@scylladb.com>
Reviewed-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Reviewed-by: Botond Dénes <bdenes@scylladb.com>
Tests: unit tests (dev), dtest (resharding.py)
"

* 'resharding-through-compact-sstables' of github.com:glommer/scylla:
  resharding: get rid of special reshard_sstables
  compaction: enhance compaction_descriptor with creator and replace function
2020-04-02 14:43:35 +02:00
Pekka Enberg
43b488a7bc Revert "schema: Default dc_local_read_repair_chance to zero"
This reverts commit fdd2d9de3d because it
breaks one heat-weighted load balancing dtest:

FAIL: heat_weighted_load_balancing_cl_QUORUM_test (heat_weighted_load_balancing_test.HeatWeightedLB)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/home/penberg/src/scylla/scylla-dtest/heat_weighted_load_balancing_test.py", line 182, in heat_weighted_load_balancing_cl_QUORUM_test
    self.run_heat_weighted_load_balancing('QUORUM')
  File "/home/penberg/src/scylla/scylla-dtest/heat_weighted_load_balancing_test.py", line 165, in run_heat_weighted_load_balancing
    self.verify_metrics(metrics, cached=False)
  File "/home/penberg/src/scylla/scylla-dtest/heat_weighted_load_balancing_test.py", line 73, in verify_metrics
    mean_avg, node_mean_avg, key))
AssertionError: 19.0 not found in range(3, 13) : Cache difference between nodes is less then expected: 6469.6/328.2, metric scylla_storage_proxy_coordinator_reads_local_node

I am reverting because it's a test issue, and we should bring this
commit back once the test is fixed.

Gleb Natapov explains:

"dtest result directly depends on replicas we contact. Glauber's patch
make us contacts less replicas, so numbers differ."
2020-04-02 13:43:29 +03:00
Nadav Har'El
55f02c00f2 alternator-test: run: use the Python driver, not cqlsh
The "run" script for the Alternator tests needs to set a system table for
authentication credentials, so we can test this feature.
So far we did this with cqlsh, but cqlsh isn't always installed on build
machines. But install-dependencies.sh already installs the Cassandra driver
for Python, so it makes more sense to use that, so this patch switches to
use it.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200331131522.28056-1-nyh@scylladb.com>
2020-04-02 13:43:29 +03:00
Nadav Har'El
8627ae42a6 install-dependencies.sh: add dependencies for Alternator tests
To run Alternator tests, only two additional dependencies need to be added to
install-dependencies.sh: pytest, and python3-boto3. We also need
python3-cassandra-driver, but this dependency is already listed.

This patch only updates the dependencies for Fedora, which is what we need
for dbuild and our Jenkins setups.

Tested by building a new dbuild docker image and verifying that the
Alternator tests pass.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
[avi: update toolchain image; note this upgrades gcc to 9.3.1]
Message-Id: <20200330181128.18582-1-nyh@scylladb.com>
2020-04-02 13:43:16 +03:00
Piotr Sarna
b3fdb742ae cql3,index: add panic checks to base key generation
In order to be extra sure that we always generate proper
base partition/clustering keys from paging info when executing
an indexed query, additional checks are added - if any of them
triggers, an exception will be thrown.
Created in order to help debug an existing issue:
Refs #5856

Tests: unit(dev)
2020-04-01 18:27:07 +03:00
Gleb Natapov
4d9d226596 lwt: fix cas_now_pruning counter
Due to c&p error cas_now_pruning counter is increased instead of
decreased after an operation completes. Fix it.

Fixes #6116

Message-Id: <20200401142859.GA16953@scylladb.com>
2020-04-01 17:18:33 +02:00
Alejo Sanchez
3a4dd0a856 utils: error injection inject() returning a future
Make inject() return a future.

Suggested by Gleb.
Botond helped on dealing with complex function/lambda overload.

Refs #3295 (closed)

Tests: unit ({dev})

Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Message-Id: <20200331143839.1781424-7-alejo.sanchez@scylladb.com>
2020-04-01 16:22:52 +02:00
Alejo Sanchez
8bae38cef9 utils: error injection support multiple clocks
Use template to support multiple clock classes for time point
for deadline injection.

Refs: #3295   (closed)

Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Message-Id: <20200331143839.1781424-6-alejo.sanchez@scylladb.com>
2020-04-01 16:22:45 +02:00
Alejo Sanchez
71f2f423bc utils: error injection reorder args for exceptions
Move exception factory to end of argument list.

Refs: #3295   (closed)

Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Message-Id: <20200331143839.1781424-5-alejo.sanchez@scylladb.com>
2020-04-01 16:22:38 +02:00
Alejo Sanchez
fd1eb6a466 utils: error injection simplify API
Split error injection C++ API to have

1. sleep duration
2. sleep to deadline (timeout)

TODO: support multiple types of clocks

Refs: #3295   (closed)

Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Message-Id: <20200331143839.1781424-4-alejo.sanchez@scylladb.com>
2020-04-01 16:22:30 +02:00
Avi Kivity
5671b3d7d3 Update seastar submodule
* seastar 36e8dfc89...41c83ec55 (3):
  > api: add file_type() global function
  > json: Add backtrace information for json generation exceptions
  > scheduling: avoid defining friend namespace qualified function scheduling_group_key_id()
2020-04-01 11:16:30 +03:00
Konstantin Osipov
9948f548a5 lwt: remove Paxos from experimental list
Always enable lightweight transactions. Remove the check for the command
line switch from the feature service, assuming LWT is always enabled.

Remove the check for LWT from Alternator.

Note that in order for the cluster to work with LWT, all nodes need
to support it.

Rename LWT to UNUSED in db/config.hh, to keep accepting lwt keyword in
--experimental-features command line option, but do nothing with it.

Changes in v2:
* remove enable_lwt feature flag, it's always there

Closes #6102

test: unit (dev, debug)
Message-Id: <20200401071149.41921-1-kostja@scylladb.com>
2020-04-01 09:12:21 +02:00
Glauber Costa
87dd23db03 compaction: use a larger min_threshold during bootstrap, replace
During bootstrap and replace operations the node can't take reads and
we'd like to see the process ending ASAP. This is because until the
process ends, we keep having to duplicate writes to an extended set. Not
to mention, in the case of a cluster expansion users want to use the
added capacity sooner rather than later.

Streaming generates a lot of compaction activity, that competes with the
bootstrap itself, slowing it down.

Long term, we are moving to treat those compactions differently and
maybe postpone them altogether. However for now we can reduce the amount
of compactions by increasing the minimum threshold of SSTables that have
to accumulate before they are selected for compactions. The default is
2, meaning we will trigger a compaction every time 2 SSTables of about
the same size are found (for STCS, others follow a similar pattern).

Until we have offstrategy infrastructure we don't want the compactions
to stop happening altogether so the reads, when they start, don't
suffer.  This patch sets the minimum threshold to 16 (for the default
max_threshold of 32), meaning we will generate a lot less compaction
activity during streaming. Once streaming is done we revert it to its
original.

Unfortunately there isn't much we can do at the moment about decommission.
During decommission the nodes receiving data are also taking reads and
we don't want SSTables to accumulate.

Fixes #5109

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2020-04-01 10:06:27 +03:00
Glauber Costa
fdd2d9de3d schema: Default dc_local_read_repair_chance to zero
dc_local_read_repair_chance is a legacy of old times: Cassandra itself
now defaults to zero, and we should look into that too.

Most serious production clusters are either repaired through our
asynchronous repair, or don't need repair at all.

Synchronous read repair can help things converging, but it implies an
impact at query time. For clusters that are on an asynchronous repair
schedule this should not be needed.

Fixes #6109

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <20200331183418.21452-1-glauber@scylladb.com>
2020-04-01 08:27:49 +02:00
Glauber Costa
05efd6a5e9 resharding: get rid of special reshard_sstables
There is a method, reshard_sstables(), whose sole purpose is to call a
resharding compaction. There is nothing special about this method: all
the information it needs is now present in the compaction_descriptor.

This patch extends the compaction_options class to recognize resharding
compactions as well, and uses that so that make_compaction() can also
create resharding compactions.

To make that happen we have to create a compaction_descriptor object in
the resharding method. Note however that resharding works by passing an
object very close to the compaction_descriptor around. Once this patch
is merged, a logical next step is to reuse it, and avoid creating the
descriptor right before calling compact_sstables().

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2020-03-31 19:57:53 -04:00
Glauber Costa
e8801cd77b compaction: enhance compaction_descriptor with creator and replace function
There are many differences between resharding and compaction that are
artificial, arising more from the way we ended up implementing it than
necessity. This patch attempts to pass the creator and replacer functions
through the compaction_descriptor.

There is a difference between the creator function for resharding and
regular compaction: resharding has to pass the shard number on behalf
of which the SSTable is created. However regular compactions can just
ignore this. No need to have a special path just for this.

After this is done, the constructor for the compaction object can be
greatly simplified. In further patches I intend to simplify it a bit
further, but some more cleanup has to happen first.

To make that happen we have to construct a compaction_descriptor object
inside the resharding function. This is temporary: resharding currently
works with a descriptor, but at some point that descriptor is lost and
broken into pieces to be passed to this function. The overarching goal
of this work is exactly to be able to keep that descriptor for as long
as possible, which should simplify things a lot.

Callers are patched, but there are plenty for sstable_datafile_test.cc.
For their benefit, a helper function is provided to keep the previous
signature (test only).

Signed-off-by: Glauber Costa <glauber@scylladb.com>
2020-03-31 19:41:25 -04:00
Avi Kivity
dee0b68347 Merge 'Separate sharding and partitioning logic' from Piotr J
"
Currently, both sharding and partitioning logic is encapsulated into partitioners. This is not desirable because these two concepts are totally independent and shouldn't be coupled together in such a way.

This PR separates sharding and partitioning. Partitioning will still live in i_partitioner class and its subclasses. Sharding is extracted to a new class called sharding_info. Both partitioners and sharding_info are still managed by schema class. Partitioner can be accessed with schema::get_partitioner while sharding_info can be accessed with schema::get_sharding_info.

The transition is done in steps:
1. sharding_info class is defined and all the sharding logic is extracted from partitioner to the new class. Temporarily sharding_info is still embedded into i_partitioner and all sharding-related functions in i_partitioner delegate to the embedded sharding_info object.
2. All calls to i_partitioner functions that are related to sharding are gradually switched to calls to sharding_info equivalents.
3. Once everything uses sharding_info, all sharding logic is dropped from i_partitioner.

Tests: unit(dev, release)
"

* haaawk-sharding_info: (32 commits)
  dummy_sharder: rename dummy_sharding_info.* to dummy_sharder.*
  sharding_info: rename the class to sharder
  i_partitioner:remove embeded sharding_info
  i_partitioner: remove unused get_sharding_info
  schema: remove incorrect comment
  schema: make it possible to set sharding_info per schema
  i_partitioner: remove unused shard_count
  multishard_writer: stop calling i_partitioner::shard_count
  i_partitioner: remove sharding_ignore_msb
  partitioner_test: test ranges and sharding_infos
  i_partitioner: remove unused split_ranges_to_shards
  i_partitioner: remove unused shard_of function
  sstable-utils: use sharding_info::shard_of
  create_token_range_from_keys: use sharding info for shard_of
  multishard_mutation_query_test: use sharding info for shard_of
  distribute_reader_and_consume_on_shards: use sharding_info::shard_of
  multishard_mutation_query: use sharding_info::shard_of
  dht::shard_of: use schema::get_sharding_info
  i_partitioner: remove unused token_for_next_shard
  split_range_to_single_shard: use sharding info instead of partitioner
  ...
2020-03-31 13:40:51 +03:00
Alejo Sanchez
4a3b98facc utils: error injection fix deadline test timeout
Rafael reported test_inject_future_sleep_timeout_short failed
sometimes as limit is too close. Bump limit.

Refs #3295 (closed)

Repro:
./test.py --mode=dev -v boost/error_injection_test --repeat 300

Tests: unit ({dev})

Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Message-Id: <20200328204454.1326514-3-alejo.sanchez@scylladb.com>
2020-03-31 11:58:38 +02:00
Alejo Sanchez
e5a2ba32b9 utils: error injection allocate string for remote invoke
Allocate string before sending to other shards.

Reported by Pavel Solodovnikov.

Refs #3295 (closed)

Tests: unit ({dev})

Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Message-Id: <20200328204454.1326514-2-alejo.sanchez@scylladb.com>
2020-03-31 11:58:27 +02:00
Nadav Har'El
fe6cecb26d alternator-test: comment out an error-path test that doesn't work on newer boto3
Unfortunately, the boto3 library doesn't allow us to check some of the
input error cases because it unnecessarily tests its input instead of
just passing it to Alternator and allowing Alternator to report the error.
In this patch we comment out a test case which used to work fine - i.e.,
the error was reported by Alternator - until recent changes to boto3
made it catch the problem without passing it to Alternator :-(

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200330190521.19526-2-nyh@scylladb.com>
2020-03-31 07:58:01 +02:00
Nadav Har'El
db7cebd663 alternator-test: skip one test in test_tag.py if botocore is too old
One of the Alternator tests in test_tag.py checks the feature of creating
a table with a set of tags (as opposed to adding tags to an existing table).
This is a relatively new DynamoDB feature, only added in April 2019, so if
the botocore library is too old, it cannot test this feature, and we have to
skip the test.

Alternator developers should make an effort to keep the botocore library
up-to-date and test the latest DynamoDB features, but it is less important
if some test environments (like Jenkins) cannot verify this specific test
until its distro gets updated - it is more important that the vast majority
of the tests, which do not rely on very new features, get tested.

After this patch, if running on Fedora 30 with
python3-botocore-1.12.101-2.fc30.noarch installed, we get the following
skip message:

$ pytest-3 -rs test_tag.py
...
test_tag.py ..s..x                                                                                                      [100%]
=================================================== short test summary info ===================================================
SKIP [1] /home/nyh/scylla/test/alternator/test_tag.py:114: Botocore version 1.12.136 or above required to run this test

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200330190521.19526-1-nyh@scylladb.com>
2020-03-31 07:57:53 +02:00
Gleb Natapov
8a408ac5a8 lwt: remove entries from system.paxos table after successful learn stage
The learning stage of PAXOS protocol leaves behind an entry in
system.paxos table with the last learned value (which can be large). In
case not all participants learned it successfully next round on the same
key may complete the learning using this info. But if all nodes learned
the value the entry does not serve useful purpose any longer.

The patch adds another round, "prune", which is executed in background
(limited to 1000 simultaneous instances) and removes the entry in
case all nodes replied successfully to the "learn" round.  It uses the
ballot's timestamp to do the deletion, so not to interfere with the
next round. Since deletion happens very close to previous writes it will
likely happen in memtable and will never reach sstable, so that reduces
memtable flush and compaction overhead.

Fixes #5779

Message-Id: <20200330154853.GA31074@scylladb.com>
2020-03-30 21:02:14 +03:00
Piotr Jastrzebski
c44f019eee dummy_sharder: rename dummy_sharding_info.* to dummy_sharder.*
Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-03-30 18:42:33 +02:00
Piotr Jastrzebski
e72696a8e6 sharding_info: rename the class to sharder
Also rename all variables that were named si or sinfo
to sharder.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-03-30 18:42:33 +02:00
Piotr Jastrzebski
2e850421a0 i_partitioner: remove embedded sharding_info
sharding_info embedded into partitioner is no longer
used anywhere and can be removed.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-03-30 18:42:33 +02:00
Piotr Jastrzebski
b46b35c55a i_partitioner: remove unused get_sharding_info
Previous patches have removed all the usages of
this function.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-03-30 18:42:33 +02:00
Piotr Jastrzebski
92cdc21123 schema: remove incorrect comment
partitioner is actually part of schema digest and
is stored locally in internal tables.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-03-30 18:42:33 +02:00
Piotr Jastrzebski
7bd2b8d73f schema: make it possible to set sharding_info per schema
Previously schema::get_sharding_info was obtaining
sharding_info from the partitioner but we want to remove
sharding_info from the partitioner so we need a place
in schema to store it there instead.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-03-30 18:42:33 +02:00
Piotr Jastrzebski
79adee2fae i_partitioner: remove unused shard_count
Previous patches have switched all the calls to
i_partitioner::shard_count to sharding_info::shard_count
and this function can now be removed.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-03-30 18:42:33 +02:00
Piotr Jastrzebski
db3d7df893 multishard_writer: stop calling i_partitioner::shard_count
Replace it with sharding_info::shard_count.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-03-30 18:42:33 +02:00
Piotr Jastrzebski
b7834634ee i_partitioner: remove sharding_ignore_msb
Every place that has previously called this method is now
using sharding_info::sharding_ignore_msb and this function
can be removed.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-03-30 18:42:33 +02:00
Piotr Jastrzebski
fb89841cc5 partitioner_test: test ranges and sharding_infos
Turn test_something_with_some_interesting_ranges_and_partitioners
into test_something_with_some_interesting_ranges_and_sharding_info.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-03-30 18:42:33 +02:00
Piotr Jastrzebski
2aaa33d02e i_partitioner: remove unused split_ranges_to_shards
The function is never called so it can be safely removed.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-03-30 18:42:33 +02:00
Piotr Jastrzebski
bdb7e89048 i_partitioner: remove unused shard_of function
Previous patches switched all the places that called
i_partitioner::shard_of to use sharding_info::shard_of
so i_partitioner::shard_of is no longer used and can
be removed.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-03-30 18:42:33 +02:00
Piotr Jastrzebski
14ad965733 sstable-utils: use sharding_info::shard_of
Create sharding_info with the same parameters as
the partitioner and use it instead of the partitioner.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-03-30 18:42:33 +02:00
Piotr Jastrzebski
dc2e060313 create_token_range_from_keys: use sharding info for shard_of
Replace i_partitioner::shard_of with sharding_info::shard_of

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-03-30 18:42:33 +02:00
Piotr Jastrzebski
c50f7f8143 multishard_mutation_query_test: use sharding info for shard_of
Uses sharding_info::shard_of instead of i_partitioner::shard_of.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-03-30 18:42:33 +02:00
Piotr Jastrzebski
8aabba6041 distribute_reader_and_consume_on_shards: use sharding_info::shard_of
Switches all uses of i_partitioner::shard_of to sharding_info::shard_of.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-03-30 18:42:33 +02:00
Piotr Jastrzebski
d8ac8fd6e8 multishard_mutation_query: use sharding_info::shard_of
This patch replaces all the uses of i_partitioner::shard_of
with sharding_info::shard_of in read_context.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-03-30 18:42:33 +02:00
Piotr Jastrzebski
88364b6c30 dht::shard_of: use schema::get_sharding_info
i_partitioner::shard_of will be removed so we should
use sharding_info::shard_of instead.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-03-30 18:42:33 +02:00
Piotr Jastrzebski
8b6be90310 i_partitioner: remove unused token_for_next_shard
Previous patches have switched all the places that were
using i_partitioner::token_for_next_shard to
sharding_info::token_for_next_shard. Now the function
can be removed from i_partitioner.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-03-30 18:42:33 +02:00
Piotr Jastrzebski
8a6c377352 split_range_to_single_shard: use sharding info instead of partitioner
The function relies only on i_partitioner::shard_count
and i_partitioner::token_for_next_shard. Both are really implemented
in sharding_info so the method can use them directly.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-03-30 18:42:33 +02:00
Piotr Jastrzebski
c5d0887471 schema_builder: remove unused with_partitioner_for_tests_only
After previous patches that switched some tests to use sharding_info
instead of i_partitioner, we now don't need with_partitioner_for_tests_only
and the function can be removed.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-03-30 18:42:33 +02:00
Piotr Jastrzebski
41591f15d2 tests: rename dummy_partitioner.* to dummy_sharding_info.*
dummy_partitioner was renamed to dummy_sharding_info in
the previous patch. This patch cleans up the names of
files. It's done in a separate patch to not obstruct
the diff of previous patch.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-03-30 18:42:33 +02:00
Piotr Jastrzebski
031f589dba multishard_combining_reader: use token_for_next_shard from sharding info not partitioner
Previously this function was accessing sharding logic
through partitioner obtained from the schema.

While converting tests, dummy_partitioner is turned into
dummy_sharding_info.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-03-30 18:42:25 +02:00
Tomasz Grabiec
f2b091967b Merge "migration_manager: Make sync_schema return error when node is down" from Asias
sync_schema is supposed to make sure that this node knows about all
schema changes known by "nodes" that were made prior to this call.

Currently, when a node is down, the sync is silently skipped.

To fix, add a flag to migration_task::run_may_throw to indicate that it
should fail if a node is down.

Fixes #4791
2020-03-30 17:31:57 +02:00
Gleb Natapov
b3db6f5b04 lwt: rename "in_progress_ballot" cell to "promise" in system.paxos table
The value that is stored in "in_progress_ballot" cell is the value of
promised ballot, so call the cell accordingly to avoid confusion
especially as we have a notion of "in progress" proposal in the code
which is not the same as in_progress_ballot here.

We can still do it without care about backwards compatibility since LWT
is still marked as experimental.

Fixes #6087.

Message-Id: <20200326095758.GA10219@scylladb.com>
2020-03-30 12:01:55 +03:00
Avi Kivity
fba6db4a43 Update seastar submodule
* seastar 06a8c8f6e...36e8dfc89 (1):
  > reactor: decouple idle cpu handler from reactor

Ref #1.
2020-03-30 10:49:12 +03:00
Piotr Jastrzebski
274a045649 partitioner_test: use token_for_next_shard from sharding info not partitioner
partitioner_test contains test_partitioner_sharding function
which this patch renames to test_sharding and makes it
use sharding_info instead of the partitioner.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-03-30 09:37:48 +02:00
Piotr Jastrzebski
a3262a2cb2 repair: depend only on sharding logic not on partitioner
repair does not use partitioner and only uses sharding logic.
This means it does not have to depend on i_partitioner and can
instead operate on sharding_info.

This has an important consequence of allowing the repair of
multiple tables having different partitioners at the same time.

All tables repaired together still have to use the same
sharding logic.

To achieve this the change:
1. Removes partitioner field from repair_info
2. repair_info has access to sharding_info through schema
   objects of repaired tables
3. partitioner name is removed from shard_config
4. local and remote partitioners are removed from repair_meta.
   Remote sharding_info is used instead.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-03-30 09:37:48 +02:00
Piotr Jastrzebski
dffa9fc880 dht: remove unimplemented split_range_to_single_shard
This method is not implemented anywhere not to mention the usage.
It is the only reasonable thing to remove it instead of keeping
an unused and unimplemented declaration.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-03-30 09:36:22 +02:00
Piotr Jastrzebski
94ff653b99 selective_token_range_sharder: replace i_partitioner with sharding_info
The class does not depend on partitioning logic but only uses
sharding logic. This means it is possible and desirable to limit its
dependency to only sharding_info.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-03-30 09:36:22 +02:00
Piotr Jastrzebski
ecff322fd5 ring_position_range_vector_sharder: replace i_partitioner with sharding_info
ring_position_range_vector_sharder does not depend on partitioning logic.
It only uses sharding logic so it is not necessary to store i_partitioner
in the class. Reference to sharding_info is enough.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-03-30 09:35:27 +02:00
Piotr Jastrzebski
8a4c1be129 ring_position_range_sharder: replace i_partitioner with sharding_info
ring_position_range_sharder does not depend on partitioning at all.
It only uses sharding so it is enough for the class to take sharding_info
instead of a whole i_partitioner. This patch changes ring_position_range_sharder
class to contain const sharding_info& instead of const i_partitioner&.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-03-30 09:35:27 +02:00
Piotr Jastrzebski
52fe241311 dht: remove unused ring_position_exponential_sharder
The class is not used anywhere so it can be safely removed.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-03-30 09:35:27 +02:00
Piotr Jastrzebski
8d81a2498f schema: add get_sharding_info
At the moment, we have a single sharding logic per node
but we want to be able to set it per table in the future.
To make it easy to change in the future sharding_info
will be managed inside schema and all the other code
will access it through schema::get_sharding_info function.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-03-30 09:35:27 +02:00
Piotr Jastrzebski
ca07f8e84d partitioner: extract sharding fields to a class
This patch creates a new class called sharding_info.
This new class will now be responsible for all
the sharding logic that before was a part of the partitioner.

In the end, sharding and partitioning logic will be fully
separated but this patch starts with just extracting sharding
logic to sharding_info and embedding it into i_partitioner class.
All sharding functions are still present in i_partitioner but now
they just delegate to the corresponding functions of the embedded
sharding_info object.
Following patches will gradually switch all uses of the following
i_partitioner member functions to their equivalents in sharding_info:
1. shard_of
2. token_for_next_shard
3. sharding_ignore_msb
4. shard_count
After that, sharding_info will be removed from i_partitioner and
the two classes will be totally independent.

Signed-off-by: Piotr Jastrzebski <piotr@scylladb.com>
2020-03-30 09:35:27 +02:00
Asias He
ef64f52152 migration_manager: Do not swallow exception in migration_task::run_may_throw
The user of migration_manager::submit_migration_task needs to know if
migration_task::run_may_throw is successful or not.

Do not swallow exception.

Fixes #4791
2020-03-30 14:50:01 +08:00
Avi Kivity
68750b777e priority_manager: deinline constructor
Make the constructor out-of-line and clean up includes made redundant.
This removes an include of Seastar's heavy reactor.hh from a header.

Ref #1
Message-Id: <20200329173711.16949-1-avi@scylladb.com>
2020-03-30 09:34:18 +03:00
Avi Kivity
3159ad4484 Update seastar submodule
* seastar c7b6b84e5...06a8c8f6e (12):
  > scheduling_group_specific: remove inclusion of reactor.hh
  > future: Delete void_futurize_helper
  > future: Delete unused do_void_futurize_helper instantiation
  > core: remove io_queue queued requests metric
  > future: Add assert to set_urgent_state
  > future: Add a comment to set_urgent_state
  > future: Use placement new instead of operator= in set_urgent_state
  > file: use correct io_queue in dup()d files
  > io_queue: fix miscalculation of sizes when I/O queue is not configured.
  > merge: Add log levels to RPC loggers
  > reactor: Replace a call to cpu_id with this_shard_id()
  > reactor: Drop a few redundant calls to engine()
2020-03-29 15:37:45 +03:00
Botond Dénes
0d224210bb database: apply_in_memory(): don't look-up the column-family twice
The column-family is already looked up as the first line in the method.
No need to repeat that lookup in the lambda passed to
`run_when_memory_available()`, we can just capture the reference to the
already obtained column-family object. These objects are safe to
reference, they don't just disappear in the middle of an operation.

Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200327140827.128647-1-bdenes@scylladb.com>
2020-03-27 15:19:32 +01:00
Asias He
743b529c2b gossip: Add an option to force gossip generation
Consider 3 nodes in the cluster, n1, n2, n3 with gossip generation
number g1, g2, g3.

n1, n2, n3 running scylla version with commit
0a52ecb6df (gossip: Fix max generation
drift measure)

One year later, user wants to upgrade n1,n2,n3 to a new version

when n3 does a rolling restart with a new version, n3 will use a
generation number g3'. Because g3' - g2 > MAX_GENERATION_DIFFERENCE and
g3' - g1 > MAX_GENERATION_DIFFERENCE, n1 and n2 will reject n3's
gossip update and mark n3 as down.

Such unnecessary marking of node down can cause availability issues.
For example:

DC1: n1, n2
DC2: n3, n4

When n3 and n4 restart, n1 and n2 will mark n3 and n4 as down, which
causes the whole DC2 to be unavailable.

To fix, we can start the node with a gossip generation within
MAX_GENERATION_DIFFERENCE difference for the new node.

Once all the nodes run the version with commit
0a52ecb6df, the option is no longer
needed.

Fixes #5164
2020-03-27 12:15:21 +01:00
Rafael Ávila de Espíndola
c5795e8199 everywhere: Replace engine().cpu_id() with this_shard_id()
This is a bit simpler and might allow removing a few includes of
reactor.hh.

Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200326194656.74041-1-espindola@scylladb.com>
2020-03-27 11:40:03 +03:00
Nadav Har'El
c639a5ec6f merge: fix two CDC bugs with preimage/postimage
Merged pull request https://github.com/scylladb/scylla/pull/6078 from
Calle Wilund, fixing two CDC preimage/postimage bugs:

Fixes #6073.
Fixes #6070.
2020-03-26 17:38:18 +02:00
Alejo Sanchez
cb26de89a1 tests: port Cassandra CQL tests to cql repl
Port CQL only tests to cql repl from:
  cassandra-dtest/cql_test.py
  cassandra/test/unit/org/apache/cassandra/cql3/validation/operations/BatchTest.java

Refs #5792

Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Message-Id: <20200326103223.1097192-2-alejo.sanchez@scylladb.com>
2020-03-26 15:19:38 +02:00
Alejo Sanchez
febcced4f1 utils: error injection with timeout/deadline
Most of Scylla code runs with a user-supplied query timeout, expressed as
absolute clock (deadline). When injecting test sleeps into such code, we most
often want to not sleep beyond the user supplied deadline. Extend error
injection API to optionally accept a deadline, and, if it is provided,
sleep no more than up to the deadline. If current time is beyond deadline,
sleep injection is skipped altogether.

Signed-off-by: Alejo Sanchez <alejo.sanchez@scylladb.com>
Message-Id: <20200326091600.1037717-2-alejo.sanchez@scylladb.com>
2020-03-26 12:41:10 +01:00
Piotr Sarna
6bcc46b08a cql3: add missing error message context to query processor
When caching a prepared statement fails, an error is logged,
but due to a typo it only prints "failed to cache the entry",
ignoring the specific error message - which this patch fixes.

Message-Id: <9c3c1d9c11d559815268fa977c1fb80b8c4459ca.1585213673.git.sarna@scylladb.com>
2020-03-26 12:46:03 +02:00
Piotr Sarna
1178ac5564 test: move config to heap in sstable_resharding_test
... in order to get rid of a large stack warning.
Tests: unit(dev)
Message-Id: <bca0f854f4e338316c109364257a740a36821b0a.1585129083.git.sarna@scylladb.com>
2020-03-25 14:58:16 +01:00
Piotr Sarna
5ef9dbfa8a test: move config to heap in schema_registry_test
... in order to get rid of a large stack warning.
Tests: unit(dev)

Message-Id: <82b55e8440ade8a3d81880dd66127776b2661112.1585128726.git.sarna@scylladb.com>
2020-03-25 14:19:30 +01:00
Nadav Har'El
a0f025f4ce sstable: LA format is the default, so ignore "LA_SSTABLE" feature flag
The previous patch made the LA format the default. We no longer need to
choose between writing the older KA format or LA, so the LA_SSTABLE
cluster feature has become unnecessary.

Unfortunately, we cannot completely remove this feature: Since commit
4f3ce42163 we cannot remove cluster features
because this node will refuse to join a cluster which already agreed on
features that it lacks - thinking it is an old node trying to join a
new cluster.

So the LA_SSTABLE feature flag remains, and we continue to advertise
that our node supports it. We just no longer care about what other
nodes advertised for it, so we can remove a bit of code that cared.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200324232607.4215-3-nyh@scylladb.com>
2020-03-25 13:00:28 +01:00
Nadav Har'El
91aba40114 sstable: default to LA format instead of KA format
Over the years, Scylla updated the sstable format from the KA format to
the LA format, and most recently to the MC format. On a mixed cluster -
as occurs during a rolling upgrade - we want all the nodes, even new ones,
to write sstables in the format preferred by the old version. The thinking
is that if the upgrade fails, and we want to downgrade all nodes back to
the older version, we don't want to lose data because we already have
too-new sstables.

So the current code starts by selecting the oldest format we ever had - KA,
and only switching this choice to LA and MC after we verify that all the
nodes in the cluster support these newer formats.

But before an agreement is reached on the new format, sstables may already
be created in the antique KA format. This is usually harmless - we can
read this format just fine. However, the KA format has a problem that it is
unable to represent table names or keyspaces with the "-" character in them,
because this character is used to separate the keyspace and table names in
the file name. For CQL, a "-" is not allowed anyway in keyspace or table
names; But for Alternator, this character is allowed - and if a KA table
happens to be created by accident (before the LA or MC formats are chosen),
it cannot be read again during boot, and Scylla cannot reboot.

The solution that this patch takes is to change Scylla's default sstable
format to LA (and, as before, if the entire cluster agrees, the newer MC
format will be used). From now on, new KA tables will never be written.
But we still fully support *reading* the KA format - this is important in
case some very old sstables never underwent compaction.

The old code had, confusingly, two places where the default KA format
was chosen. This patch fixes it so the new default (LA) is specified in
only one place.

Fixes #6071.

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
Message-Id: <20200324232607.4215-2-nyh@scylladb.com>
2020-03-25 13:00:28 +01:00
Rafael Ávila de Espíndola
eca0ac5772 everywhere: Update for deprecated apply functions
Now apply is only for tuples, for varargs use invoke.

This depends on the seastar changes adding invoke.

Signed-off-by: Rafael Ávila de Espíndola <espindola@scylladb.com>
Message-Id: <20200324163809.93648-1-espindola@scylladb.com>
2020-03-25 08:49:53 +02:00
Avi Kivity
088660680c Update seastar submodule
* seastar 92c488706...c7b6b84e5 (6):
  > semaphore: Use futurize_invoke instead of futurize_apply
  > future: specify futurize::make_exception_future as noexcept
  > future: Move ignore out of line
  > future: Split then and then_impl to enable NRVO
  > semaphore_units: allow getting the number of units held
  > Merge "Split futurize::apply into invoke(...) and apply(tuple)" from Rafael
2020-03-25 08:48:00 +02:00
Asias He
7ba821cbc0 migration_manager: Make sync_schema return error when node is down
sync_schema is supposed to make sure that this node knows about all
schema changes known by "nodes" that were made prior to this call.

Currently, when a node is down, the sync is silently skipped.

To fix, add a flag to migration_task::run_may_throw to indicate that it
should fail if a node is down.

Fixes #4791
2020-03-25 10:59:13 +08:00
Calle Wilund
532a8634c6 cdc::log: Only generate pre/post-image when enabled
Fixes #6073

The logic with pre/post image was tangled into looking at "rs"
and would cause pre-image info to be stored even if only post-image
data was enabled.

Now only generate keys (and rows for them) iff explicitly enabled.
And only generate pre-image key iff we have pre-image data.
2020-03-24 15:32:30 +00:00
Calle Wilund
881ebe192b cdc::log: Handle non-atomic column assignments broken into two
Fixes #6070

When mutation splitting was added, non-atomic column assignments were broken
into two invocations of transform. This means the second (actual data assignment)
does not know about the tombstone in first one -> postimage is created as if
we were _adding_ to the collection, not replacing it.

While not pretty, we can handle this knowing that we always get
invoked in timestamp order -> tombstone first, then assign.
So we simply keep track of non-atomic columns deleted across calls
and filter out preimage data post this.

Added test cases for all non-atomics
2020-03-24 14:07:13 +00:00
Botond Dénes
0418a74fa9 querier: consume_page(): resolve FIXME related to non-movable consumer
Now that #3158 is fixed, we can move the consumer to its place after
the `compaction_mutation_state::start_new_page()` call. No need to keep
it as `std::unique_ptr<>`.

Signed-off-by: Botond Dénes <bdenes@scylladb.com>
Message-Id: <20200310185147.207665-1-bdenes@scylladb.com>
2020-03-24 15:28:42 +02:00
Avi Kivity
a314283469 Merge "Minor cleanups to cql3 code regarding shared_ptr's" from Pavel S
"
This small series consists of several changes that aim to
reduce the number of shared_ptr's in cql3 code.

Also it contains a patch that makes CqlParser::query to return
std::unique_ptr<> instead of seastar::shared_ptr<>, which leads
to more understandable code and lays foundation for further
optimizations (e.g. possibly eliminating shared_ptr's in
`prepared_statement` and just moving raw statements in `prepare`
without copying them).

Tests: unit(dev, debug)
"

* 'feature/cql_cleanups_9' of https://github.com/ManManson/scylla:
  cql3: return raw::parsed_statement as unique_ptr
  cql3: de-pointerize arguments to some of CQL grammar rules and definitions.
  cql3: make abstract_marker::make_in_receiver accept cref to column_specification
2020-03-24 14:51:49 +02:00
Calle Wilund
9fee712d62 db::commitlog: Don't write trailing zero block unless needed
Fixes #5899

When terminating (closing) a segment, we write a trailing block
of zero so reader can have an empty region after last used chunk
as end marker. This is due to using recycled, pre-allocated
segments with potentially non-zero data extending over the point
where we are ending the segment (i.e. we are not fully filling
the segment due to a huge mutation or similar).

However, if we reach end of segment writing the final block
(typically many small mutations), the file will end naturally
after the data written, and any trailing zero block would in fact
just extend the file further. While this will only happen once per
segment recycled (independent of how many times it is recycled),
it is still both slightly breaking the disk usage contract and
also potentially causing some disk stalls due to metadata changes
(though of course very infrequent).

We should only write a trailing zero block if we are below the max_size
file size when terminating.

Adds a small size check to commitlog test to verify size bounds.
(Which breaks without the patch)

v2:
- Fix test to take into account that files might be deleted
  behind our backs.
v3:
- Fix test better, by doing verification _before_ segments are
  queued for delete.

Message-Id: <20200226121601.15347-2-calle@scylladb.com>
Message-Id: <20200324100235.23982-1-calle@scylladb.com>
2020-03-24 11:31:55 +01:00
Pavel Solodovnikov
adc6a98b59 cql3: return raw::parsed_statement as unique_ptr
Change CQL parsing routine to return std::unique_ptr
instead of seastar::shared_ptr.

This can help reduce redundant shared_ptr copies even further.

Make some supplementary changes necessary for this transition:
 * Remove enable_shared_from_this base class from the following
   classes: truncate_statement, authorization_statement,
   authentication_statement: these were previously constructing
   prepared_statement instance in `prepare` method using
   `shared_from_this`.
   Make `prepare` methods implementation of inheriting classes
   mirror implementation from other statements (i.e.
   create a shallow copy of the object when preparing into
   `prepared_statement`; this could be further refactored
   to avoid copies as much as possible).
 * Remove unused fields in create_role_statement which led to
   error while using compiler-generated copy ctor (copying
   uninitialized bool values via ctor).

Signed-off-by: Pavel Solodovnikov <pa.solodovnikov@scylladb.com>
2020-03-23 23:19:21 +03:00
Pavel Solodovnikov
df1d687fc6 cql3: de-pointerize arguments to some of CQL grammar rules and definitions.
Make the following rules and definitions accept a reference
instead of shared_ptr's:
 * cfamDefinition
 * cfamColumns
 * pkDef
 * typeColumns
 * ksName
 * cfName
 * idxName
 * properties
 * property

This will reduce a bit the number of countless shared_ptr copies
and moves all over the place in cql3 code.

Signed-off-by: Pavel Solodovnikov <pa.solodovnikov@scylladb.com>
2020-03-23 23:19:21 +03:00
Pavel Solodovnikov
279b52f275 cql3: make abstract_marker::make_in_receiver accept cref to column_specification
These methods just extract some info out of
column_specification, so no need to have another copy of
shared_ptr since it's not stored anywhere inside.

Transform abstract_marker::in_raw::make_in_receiver as well
following the call chain.

Signed-off-by: Pavel Solodovnikov <pa.solodovnikov@scylladb.com>
2020-03-23 23:19:21 +03:00
Botond Dénes
82a019b6e2 scylla-gdb.py: scylla generate_object_graph: make label of initial vertice bold
So it is easily identifiable. Also, generally improve the readability of
labels by moving type names into a new line.
2020-03-19 16:04:03 +02:00
Botond Dénes
a4eb9b8559 scylla-gdb.py: scylla generate_object_graph: remove redundant lookup
Currently the initial vertex of the graph is resolved in both
`_traverse_object_graph_breadth_first()` and its caller
`_do_generate_object_graph()`. This is redundant, so remove the
resolving in the latter.
2020-03-19 16:04:03 +02:00
Botond Dénes
7cb3cc23e6 scylla-gdb.py: scylla generate_object_graph: print "to" offsets
Currently, for edges, only the "from" offset is printed, that is the
offset of the reference in the originating object. Now that we also scan
the non-first word of objects for references to them, we can have
references pointing to the non-first word of objects. To make these
apparent, also print the "to" offset on edges, that is the offset into
the target object where the reference points to. So now edges have tuple
labels: (from, to).
2020-03-19 16:01:59 +02:00
Botond Dénes
d2dfb6509c scylla-gdb.py: scylla generate-object-graph: use value-range to find references
When looking for references to an object in the graph, look for
references to any part of the object, using `scylla_find.find()`:s new
`value_range` parameter.

This way, the graph can be extended beyond objects that are members of
intrusive containers, or just generally don't have any references to
their very first byte.

Allow the user to specify a value-range different than the size of the
object. This is useful if it is known that references to the object will
point to the first N bytes.
2020-03-19 15:41:48 +02:00
Botond Dénes
326c2a408a scylla-gdb.py: scylla find: allow finding ranges of values
One of the most common use-cases of find is finding references to an
object. This works great for normal objects, however not for all of
them, a prominent example being objects that are members of intrusive
collections. These objects will have pointers to them that don't point
to their first byte, instead they point to somewhere in the middle of
the object. To help find such references, find now supports searching
for a range of values. If the new `--value-range` option is used, it
will start searching for the value itself, and if no usages are found it
will increment it with the specified size-class, and search again. This
is repeated until some usages are found or the range is depleted.
`scylla_find.find()` now returns the offset to the value, of which
usages were found. Alternatively one can scan the entire value-range
using the `--find-all` option. When this is used, `scylla_find` will not
stop on the first offset for which references are found.
2020-03-19 15:41:48 +02:00
Botond Dénes
6bf3a0ae8a scylla-gdb.py: find_in_live(): return pointer_metadata instances
find_in_live() currently parses back the output of `scylla ptr`, to
return the address to the beginning of the object and the offset. All
its current callers do the call to `scylla ptr` again to obtain further
information about the object. To avoid this duplicated effort, return
`pointer_metadata` instances from `find_in_live()`, obtained via
`scylla_ptr.analyze()` which is the python API to `scylla ptr`.
2020-03-19 15:41:47 +02:00
1755 changed files with 16298 additions and 5236 deletions

8
.gitmodules vendored
View File

@@ -1,17 +1,17 @@
[submodule "seastar"]
path = seastar
url = ../seastar
url = ../scylla-seastar
ignore = dirty
[submodule "swagger-ui"]
path = swagger-ui
url = ../scylla-swagger-ui
ignore = dirty
[submodule "xxHash"]
path = xxHash
url = ../xxHash
[submodule "libdeflate"]
path = libdeflate
url = ../libdeflate
[submodule "zstd"]
path = zstd
url = ../zstd
[submodule "abseil"]
path = abseil
url = ../abseil-cpp

View File

@@ -1,5 +1,7 @@
This project includes code developed by the Apache Software Foundation (http://www.apache.org/),
especially Apache Cassandra.
It also includes files from https://github.com/antonblanchard/crc32-vpmsum (author Anton Blanchard <anton@au.ibm.com>, IBM).
It includes files from https://github.com/antonblanchard/crc32-vpmsum (author Anton Blanchard <anton@au.ibm.com>, IBM).
These files are located in utils/arch/powerpc/crc32-vpmsum. Their license may be found in licenses/LICENSE-crc32-vpmsum.TXT.
It includes modified code from https://gitbox.apache.org/repos/asf?p=cassandra-dtest.git (owned by The Apache Software Foundation)

View File

@@ -1,7 +1,7 @@
#!/bin/sh
PRODUCT=scylla
VERSION=666.development
VERSION=4.1.11
if test -f version
then

1
abseil Submodule

Submodule abseil added at 2069dc796a

View File

@@ -66,8 +66,9 @@ static std::string format_time_point(db_clock::time_point tp) {
time_t time_point_repr = db_clock::to_time_t(tp);
std::string time_point_str;
time_point_str.resize(17);
::tm time_buf;
// strftime prints the terminating null character as well
std::strftime(time_point_str.data(), time_point_str.size(), "%Y%m%dT%H%M%SZ", std::gmtime(&time_point_repr));
std::strftime(time_point_str.data(), time_point_str.size(), "%Y%m%dT%H%M%SZ", ::gmtime_r(&time_point_repr, &time_buf));
time_point_str.resize(16);
return time_point_str;
}
@@ -128,7 +129,7 @@ future<std::string> get_key_from_roles(cql3::query_processor& qp, std::string us
auth::meta::roles_table::qualified_name(), auth::meta::roles_table::role_col_name);
auto cl = auth::password_authenticator::consistency_for_user(username);
auto timeout = auth::internal_distributed_timeout_config();
auto& timeout = auth::internal_distributed_timeout_config();
return qp.execute_internal(query, cl, timeout, {sstring(username)}, true).then_wrapped([username = std::move(username)] (future<::shared_ptr<cql3::untyped_result_set>> f) {
auto res = f.get0();
auto salted_hash = std::optional<sstring>();

View File

@@ -141,6 +141,11 @@ struct nonempty : public size_check {
// Check that array has the expected number of elements
static void verify_operand_count(const rjson::value* array, const size_check& expected, const rjson::value& op) {
if (!array && expected(0)) {
// If expected() allows an empty AttributeValueList, it is also fine
// that it is missing.
return;
}
if (!array || !array->IsArray()) {
throw api_error("ValidationException", "With ComparisonOperator, AttributeValueList must be given and an array");
}
@@ -365,31 +370,35 @@ bool check_compare(const rjson::value* v1, const rjson::value& v2, const Compara
struct cmp_lt {
template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs < rhs; }
// We cannot use the normal comparison operators like "<" on the bytes
// type, because they treat individual bytes as signed but we need to
// compare them as *unsigned*. So we need a specialization for bytes.
bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) < 0; }
static constexpr const char* diagnostic = "LT operator";
};
struct cmp_le {
// bytes only has <, so we cannot use <=.
template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs < rhs || lhs == rhs; }
template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs <= rhs; }
bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) <= 0; }
static constexpr const char* diagnostic = "LE operator";
};
struct cmp_ge {
// bytes only has <, so we cannot use >=.
template <typename T> bool operator()(const T& lhs, const T& rhs) const { return rhs < lhs || lhs == rhs; }
template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs >= rhs; }
bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) >= 0; }
static constexpr const char* diagnostic = "GE operator";
};
struct cmp_gt {
// bytes only has <, so we cannot use >.
template <typename T> bool operator()(const T& lhs, const T& rhs) const { return rhs < lhs; }
template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs > rhs; }
bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) > 0; }
static constexpr const char* diagnostic = "GT operator";
};
// True if v is between lb and ub, inclusive. Throws if lb > ub.
template <typename T>
bool check_BETWEEN(const T& v, const T& lb, const T& ub) {
if (ub < lb) {
if (cmp_lt()(ub, lb)) {
throw api_error("ValidationException",
format("BETWEEN operator requires lower_bound <= upper_bound, but {} > {}", lb, ub));
}

View File

@@ -187,6 +187,25 @@ static schema_ptr get_table(service::storage_proxy& proxy, const rjson::value& r
}
}
static std::tuple<bool, std::string_view, std::string_view> try_get_internal_table(std::string_view table_name) {
size_t it = table_name.find(executor::INTERNAL_TABLE_PREFIX);
if (it != 0) {
return {false, "", ""};
}
table_name.remove_prefix(executor::INTERNAL_TABLE_PREFIX.size());
size_t delim = table_name.find_first_of('.');
if (delim == std::string_view::npos) {
return {false, "", ""};
}
std::string_view ks_name = table_name.substr(0, delim);
table_name.remove_prefix(ks_name.size() + 1);
// Only internal keyspaces can be accessed to avoid leakage
if (!is_internal_keyspace(sstring(ks_name))) {
return {false, "", ""};
}
return {true, ks_name, table_name};
}
// get_table_or_view() is similar to get_table(), except it returns either
// a table or a materialized view from which to read, based on the TableName
// and optional IndexName in the request. Only requests like Query and Scan
@@ -196,6 +215,17 @@ static std::pair<schema_ptr, table_or_view_type>
get_table_or_view(service::storage_proxy& proxy, const rjson::value& request) {
table_or_view_type type = table_or_view_type::base;
std::string table_name = get_table_name(request);
auto [is_internal_table, internal_ks_name, internal_table_name] = try_get_internal_table(table_name);
if (is_internal_table) {
try {
return { proxy.get_db().local().find_schema(sstring(internal_ks_name), sstring(internal_table_name)), type };
} catch (no_such_column_family&) {
throw api_error("ResourceNotFoundException",
format("Requested resource not found: Internal table: {}.{} not found", internal_ks_name, internal_table_name));
}
}
std::string keyspace_name = executor::KEYSPACE_NAME_PREFIX + table_name;
const rjson::value* index_name = rjson::find(request, "IndexName");
std::string orig_table_name;
@@ -208,12 +238,11 @@ get_table_or_view(service::storage_proxy& proxy, const rjson::value& request) {
throw api_error("ValidationException",
format("Non-string IndexName '{}'", index_name->GetString()));
}
}
// If no tables for global indexes were found, the index may be local
if (!proxy.get_db().local().has_schema(keyspace_name, table_name)) {
type = table_or_view_type::lsi;
table_name = lsi_name(orig_table_name, index_name->GetString());
// If no tables for global indexes were found, the index may be local
if (!proxy.get_db().local().has_schema(keyspace_name, table_name)) {
type = table_or_view_type::lsi;
table_name = lsi_name(orig_table_name, index_name->GetString());
}
}
try {
@@ -544,29 +573,66 @@ static bool validate_legal_tag_chars(std::string_view tag) {
return std::all_of(tag.begin(), tag.end(), &is_legal_tag_char);
}
static const std::unordered_set<std::string_view> allowed_write_isolation_values = {
"f", "forbid", "forbid_rmw",
"a", "always", "always_use_lwt",
"o", "only_rmw_uses_lwt",
"u", "unsafe", "unsafe_rmw",
};
static void validate_tags(const std::map<sstring, sstring>& tags) {
static const std::unordered_set<std::string_view> allowed_values = {
"f", "forbid", "forbid_rmw",
"a", "always", "always_use_lwt",
"o", "only_rmw_uses_lwt",
"u", "unsafe", "unsafe_rmw",
};
auto it = tags.find(rmw_operation::WRITE_ISOLATION_TAG_KEY);
if (it != tags.end()) {
std::string_view value = it->second;
elogger.warn("Allowed values count {} {}", value, allowed_values.count(value));
if (allowed_values.count(value) == 0) {
if (allowed_write_isolation_values.count(value) == 0) {
throw api_error("ValidationException",
format("Incorrect write isolation tag {}. Allowed values: {}", value, allowed_values));
format("Incorrect write isolation tag {}. Allowed values: {}", value, allowed_write_isolation_values));
}
}
}
static rmw_operation::write_isolation parse_write_isolation(std::string_view value) {
if (!value.empty()) {
switch (value[0]) {
case 'f':
return rmw_operation::write_isolation::FORBID_RMW;
case 'a':
return rmw_operation::write_isolation::LWT_ALWAYS;
case 'o':
return rmw_operation::write_isolation::LWT_RMW_ONLY;
case 'u':
return rmw_operation::write_isolation::UNSAFE_RMW;
}
}
// Shouldn't happen as validate_tags() / set_default_write_isolation()
// allow only a closed set of values.
return rmw_operation::default_write_isolation;
}
// This default_write_isolation is always overwritten in main.cc, which calls
// set_default_write_isolation().
rmw_operation::write_isolation rmw_operation::default_write_isolation =
rmw_operation::write_isolation::LWT_ALWAYS;
void rmw_operation::set_default_write_isolation(std::string_view value) {
if (value.empty()) {
throw std::runtime_error("When Alternator is enabled, write "
"isolation policy must be selected, using the "
"'--alternator-write-isolation' option. "
"See docs/alternator/alternator.md for instructions.");
}
if (allowed_write_isolation_values.count(value) == 0) {
throw std::runtime_error(format("Invalid --alternator-write-isolation "
"setting '{}'. Allowed values: {}.",
value, allowed_write_isolation_values));
}
default_write_isolation = parse_write_isolation(value);
}
// FIXME: Updating tags currently relies on updating schema, which may be subject
// to races during concurrent updates of the same table. Once Scylla schema updates
// are fixed, this issue will automatically get fixed as well.
enum class update_tags_action { add_tags, delete_tags };
static future<> update_tags(const rjson::value& tags, schema_ptr schema, std::map<sstring, sstring>&& tags_map, update_tags_action action) {
static future<> update_tags(service::migration_manager& mm, const rjson::value& tags, schema_ptr schema, std::map<sstring, sstring>&& tags_map, update_tags_action action) {
if (action == update_tags_action::add_tags) {
for (auto it = tags.Begin(); it != tags.End(); ++it) {
const rjson::value& key = (*it)["Key"];
@@ -593,24 +659,12 @@ static future<> update_tags(const rjson::value& tags, schema_ptr schema, std::ma
}
validate_tags(tags_map);
std::stringstream serialized_tags;
serialized_tags << '{';
for (auto& tag_entry : tags_map) {
serialized_tags << format("'{}':'{}',", tag_entry.first, tag_entry.second);
}
std::string serialized_tags_str = serialized_tags.str();
if (!tags_map.empty()) {
serialized_tags_str[serialized_tags_str.size() - 1] = '}'; // trims the last ',' delimiter
} else {
serialized_tags_str.push_back('}');
}
sstring req = format("ALTER TABLE \"{}\".\"{}\" WITH {} = {}",
schema->ks_name(), schema->cf_name(), tags_extension::NAME, serialized_tags_str);
return db::execute_cql(std::move(req)).discard_result();
schema_builder builder(schema);
builder.set_extensions(schema::extensions_map{{sstring(tags_extension::NAME), ::make_shared<tags_extension>(std::move(tags_map))}});
return mm.announce_column_family_update(builder.build(), false, std::vector<view_ptr>(), false);
}
static future<> add_tags(service::storage_proxy& proxy, schema_ptr schema, rjson::value& request_info) {
static future<> add_tags(service::migration_manager& mm, service::storage_proxy& proxy, schema_ptr schema, rjson::value& request_info) {
const rjson::value* tags = rjson::find(request_info, "Tags");
if (!tags || !tags->IsArray()) {
return make_exception_future<>(api_error("ValidationException", format("Cannot parse tags")));
@@ -620,7 +674,7 @@ static future<> add_tags(service::storage_proxy& proxy, schema_ptr schema, rjson
}
std::map<sstring, sstring> tags_map = get_tags_of_table(schema);
return update_tags(rjson::copy(*tags), schema, std::move(tags_map), update_tags_action::add_tags);
return update_tags(mm, rjson::copy(*tags), schema, std::move(tags_map), update_tags_action::add_tags);
}
future<executor::request_return_type> executor::tag_resource(client_state& client_state, service_permit permit, rjson::value request) {
@@ -632,7 +686,7 @@ future<executor::request_return_type> executor::tag_resource(client_state& clien
return api_error("AccessDeniedException", "Incorrect resource identifier");
}
schema_ptr schema = get_table_from_arn(_proxy, std::string_view(arn->GetString(), arn->GetStringLength()));
add_tags(_proxy, schema, request).get();
add_tags(_mm, _proxy, schema, request).get();
return json_string("");
});
}
@@ -653,7 +707,7 @@ future<executor::request_return_type> executor::untag_resource(client_state& cli
schema_ptr schema = get_table_from_arn(_proxy, std::string_view(arn->GetString(), arn->GetStringLength()));
std::map<sstring, sstring> tags_map = get_tags_of_table(schema);
update_tags(*tags, schema, std::move(tags_map), update_tags_action::delete_tags).get();
update_tags(_mm, *tags, schema, std::move(tags_map), update_tags_action::delete_tags).get();
return json_string("");
});
}
@@ -681,10 +735,25 @@ future<executor::request_return_type> executor::list_tags_of_resource(client_sta
return make_ready_future<executor::request_return_type>(make_jsonable(std::move(ret)));
}
static future<> wait_for_schema_agreement(db::timeout_clock::time_point deadline) {
return do_until([deadline] {
if (db::timeout_clock::now() > deadline) {
throw std::runtime_error("Unable to reach schema agreement");
}
return service::get_local_migration_manager().have_schema_agreement();
}, [] {
return seastar::sleep(500ms);
});
}
future<executor::request_return_type> executor::create_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request) {
_stats.api_operations.create_table++;
elogger.trace("Creating table {}", request);
std::string table_name = get_table_name(request);
if (table_name.find(INTERNAL_TABLE_PREFIX) == 0) {
return make_ready_future<request_return_type>(api_error("ValidationException",
format("Prefix {} is reserved for accessing internal tables", INTERNAL_TABLE_PREFIX)));
}
std::string keyspace_name = executor::KEYSPACE_NAME_PREFIX + table_name;
const rjson::value& attribute_definitions = request["AttributeDefinitions"];
@@ -864,15 +933,17 @@ future<executor::request_return_type> executor::create_table(client_state& clien
}
return create_keyspace(keyspace_name).then([this, table_name, request = std::move(request), schema, view_builders = std::move(view_builders)] () mutable {
return futurize_apply([&] { return _mm.announce_new_column_family(schema, false); }).then([this, table_info = std::move(request), schema, view_builders = std::move(view_builders)] () mutable {
return futurize_invoke([&] { return _mm.announce_new_column_family(schema, false); }).then([this, table_info = std::move(request), schema, view_builders = std::move(view_builders)] () mutable {
return parallel_for_each(std::move(view_builders), [schema] (schema_builder builder) {
return service::get_local_migration_manager().announce_new_view(view_ptr(builder.build()));
}).then([this, table_info = std::move(table_info), schema] () mutable {
future<> f = make_ready_future<>();
if (rjson::find(table_info, "Tags")) {
f = add_tags(_proxy, schema, table_info);
f = add_tags(_mm, _proxy, schema, table_info);
}
return f.then([table_info = std::move(table_info), schema] () mutable {
return f.then([] {
return wait_for_schema_agreement(db::timeout_clock::now() + 10s);
}).then([table_info = std::move(table_info), schema] () mutable {
rjson::value status = rjson::empty_object();
supplement_table_info(table_info, *schema);
rjson::set(status, "TableDescription", std::move(table_info));
@@ -900,15 +971,24 @@ class attribute_collector {
void add(bytes&& name, atomic_cell&& cell) {
collected.emplace(std::move(name), std::move(cell));
}
void add(const bytes& name, atomic_cell&& cell) {
collected.emplace(name, std::move(cell));
}
public:
attribute_collector() : collected(attrs_type()->get_keys_type()->as_less_comparator()) { }
void put(bytes&& name, bytes&& val, api::timestamp_type ts) {
add(std::move(name), atomic_cell::make_live(*bytes_type, ts, std::move(val), atomic_cell::collection_member::yes));
void put(bytes&& name, const bytes& val, api::timestamp_type ts) {
add(std::move(name), atomic_cell::make_live(*bytes_type, ts, val, atomic_cell::collection_member::yes));
}
void put(const bytes& name, const bytes& val, api::timestamp_type ts) {
add(name, atomic_cell::make_live(*bytes_type, ts, val, atomic_cell::collection_member::yes));
}
void del(bytes&& name, api::timestamp_type ts) {
add(std::move(name), atomic_cell::make_dead(ts, gc_clock::now()));
}
void del(const bytes& name, api::timestamp_type ts) {
add(name, atomic_cell::make_dead(ts, gc_clock::now()));
}
collection_mutation_description to_mut() {
collection_mutation_description ret;
for (auto&& e : collected) {
@@ -988,7 +1068,7 @@ public:
put_or_delete_item(const rjson::value& item, schema_ptr schema, put_item);
// put_or_delete_item doesn't keep a reference to schema (so it can be
// moved between shards for LWT) so it needs to be given again to build():
mutation build(schema_ptr schema, api::timestamp_type ts);
mutation build(schema_ptr schema, api::timestamp_type ts) const;
const partition_key& pk() const { return _pk; }
const clustering_key& ck() const { return _ck; }
};
@@ -1017,20 +1097,29 @@ put_or_delete_item::put_or_delete_item(const rjson::value& item, schema_ptr sche
}
}
mutation put_or_delete_item::build(schema_ptr schema, api::timestamp_type ts) {
mutation put_or_delete_item::build(schema_ptr schema, api::timestamp_type ts) const {
mutation m(schema, _pk);
auto& row = m.partition().clustered_row(*schema, _ck);
// If there's no clustering key, a tombstone should be created directly
// on a partition, not on a clustering row - otherwise it will look like
// an open-ended range tombstone, which will crash on KA/LA sstable format.
// Ref: #6035
const bool use_partition_tombstone = schema->clustering_key_size() == 0;
if (!_cells) {
// a DeleteItem operation:
row.apply(tombstone(ts, gc_clock::now()));
if (use_partition_tombstone) {
m.partition().apply(tombstone(ts, gc_clock::now()));
} else {
// a DeleteItem operation:
m.partition().clustered_row(*schema, _ck).apply(tombstone(ts, gc_clock::now()));
}
return m;
}
// else, a PutItem operation:
auto& row = m.partition().clustered_row(*schema, _ck);
attribute_collector attrs_collector;
for (auto& c : *_cells) {
const column_definition* cdef = schema->get_column_definition(c.column_name);
if (!cdef) {
attrs_collector.put(std::move(c.column_name), std::move(c.value), ts);
attrs_collector.put(c.column_name, c.value, ts);
} else {
row.cells().apply(*cdef, atomic_cell::make_live(*cdef->type, ts, std::move(c.value)));
}
@@ -1048,7 +1137,11 @@ mutation put_or_delete_item::build(schema_ptr schema, api::timestamp_type ts) {
// Scylla proper, to implement the operation to replace an entire
// collection ("UPDATE .. SET x = ..") - see
// cql3::update_parameters::make_tombstone_just_before().
row.apply(tombstone(ts-1, gc_clock::now()));
if (use_partition_tombstone) {
m.partition().apply(tombstone(ts-1, gc_clock::now()));
} else {
row.apply(tombstone(ts-1, gc_clock::now()));
}
return m;
}
@@ -1090,13 +1183,6 @@ static lw_shared_ptr<query::read_command> previous_item_read_command(schema_ptr
return ::make_lw_shared<query::read_command>(schema->id(), schema->version(), partition_slice, query::max_partitions);
}
// Builds a read command for `schema` that selects nothing at all: the slice
// has no clustering ranges, no static columns and no regular columns, and
// default options.
static lw_shared_ptr<query::read_command> read_nothing_read_command(schema_ptr schema) {
    // Because the slice is empty, storage_proxy::query() returns
    // immediately for this command - without any networking.
    const query::partition_slice::option_set no_options;
    auto empty_slice = query::partition_slice({}, {}, {}, no_options);
    return ::make_lw_shared<query::read_command>(schema->id(), schema->version(), empty_slice, query::max_partitions);
}
// Wraps a single partition key into a one-element vector of partition
// ranges: the key is decorated for `schema` and turned into a range built
// from that one decorated key.
static dht::partition_range_vector to_partition_ranges(const schema& schema, const partition_key& pk) {
    dht::partition_range_vector ranges;
    ranges.push_back(dht::partition_range(dht::decorate_key(schema, pk)));
    return ranges;
}
@@ -1142,10 +1228,10 @@ rmw_operation::rmw_operation(service::storage_proxy& proxy, rjson::value&& reque
// the request).
}
std::optional<mutation> rmw_operation::apply(query::result& qr, const query::partition_slice& slice, api::timestamp_type ts) {
if (qr.row_count()) {
std::optional<mutation> rmw_operation::apply(foreign_ptr<lw_shared_ptr<query::result>> qr, const query::partition_slice& slice, api::timestamp_type ts) {
if (qr->row_count()) {
auto selection = cql3::selection::selection::wildcard(_schema);
auto previous_item = describe_item(_schema, slice, *selection, qr, {});
auto previous_item = describe_item(_schema, slice, *selection, *qr, {});
return apply(std::make_unique<rjson::value>(std::move(previous_item)), ts);
} else {
return apply(std::unique_ptr<rjson::value>(), ts);
@@ -1156,22 +1242,9 @@ rmw_operation::write_isolation rmw_operation::get_write_isolation_for_schema(sch
const auto& tags = get_tags_of_table(schema);
auto it = tags.find(WRITE_ISOLATION_TAG_KEY);
if (it == tags.end() || it->second.empty()) {
// By default, fall back to always enforcing LWT
return write_isolation::LWT_ALWAYS;
}
switch (it->second[0]) {
case 'f':
return write_isolation::FORBID_RMW;
case 'a':
return write_isolation::LWT_ALWAYS;
case 'o':
return write_isolation::LWT_RMW_ONLY;
case 'u':
return write_isolation::UNSAFE_RMW;
default:
// In case of an incorrect tag, fall back to the safest option: LWT_ALWAYS
return write_isolation::LWT_ALWAYS;
return default_write_isolation;
}
return parse_write_isolation(it->second);
}
// shard_for_execute() checks whether execute() must be called on a specific
@@ -1192,7 +1265,7 @@ std::optional<shard_id> rmw_operation::shard_for_execute(bool needs_read_before_
// find the appropriate shard to run it on:
auto token = dht::get_token(*_schema, _pk);
auto desired_shard = service::storage_proxy::cas_shard(*_schema, token);
if (desired_shard == engine().cpu_id()) {
if (desired_shard == this_shard_id()) {
return {};
}
return desired_shard;
@@ -1202,11 +1275,6 @@ std::optional<shard_id> rmw_operation::shard_for_execute(bool needs_read_before_
// PutItem, DeleteItem). All these return nothing by default, but can
// optionally return Attributes if requested via the ReturnValues option.
static future<executor::request_return_type> rmw_operation_return(rjson::value&& attributes) {
// As an optimization, in the simple and common case that nothing is to be
// returned, quickly return an empty result:
if (attributes.IsNull()) {
return make_ready_future<executor::request_return_type>(json_string(""));
}
rjson::value ret = rjson::empty_object();
if (!attributes.IsNull()) {
rjson::set(ret, "Attributes", std::move(attributes));
@@ -1222,7 +1290,7 @@ future<executor::request_return_type> rmw_operation::execute(service::storage_pr
stats& stats) {
if (needs_read_before_write) {
if (_write_isolation == write_isolation::FORBID_RMW) {
throw api_error("ValidationException", "Read-modify-write operations not supported");
throw api_error("ValidationException", "Read-modify-write operations are disabled by 'forbid_rmw' write isolation policy. Refer to https://github.com/scylladb/scylla/blob/master/docs/alternator/alternator.md#write-isolation-policies for more information.");
}
stats.reads_before_write++;
if (_write_isolation == write_isolation::UNSAFE_RMW) {
@@ -1252,7 +1320,7 @@ future<executor::request_return_type> rmw_operation::execute(service::storage_pr
auto selection = cql3::selection::selection::wildcard(schema());
auto read_command = needs_read_before_write ?
previous_item_read_command(schema(), _ck, selection) :
read_nothing_read_command(schema());
nullptr;
return proxy.cas(schema(), shared_from_this(), read_command, to_partition_ranges(*schema(), _pk),
{timeout, std::move(permit), client_state, trace_state},
db::consistency_level::LOCAL_SERIAL, db::consistency_level::LOCAL_QUORUM, timeout, timeout).then([this, read_command] (bool is_applied) mutable {
@@ -1331,7 +1399,7 @@ public:
check_needs_read_before_write(_condition_expression) ||
_returnvalues == returnvalues::ALL_OLD;
}
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) override {
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const override {
std::unordered_set<std::string> used_attribute_values;
std::unordered_set<std::string> used_attribute_names;
if (!verify_expected(_request, previous_item) ||
@@ -1343,6 +1411,7 @@ public:
// efficient than throwing an exception.
return {};
}
_return_attributes = {};
if (_returnvalues == returnvalues::ALL_OLD && previous_item) {
// previous_item is supposed to have been created with
// describe_item(), so has the "Item" attribute:
@@ -1409,7 +1478,7 @@ public:
check_needs_read_before_write(_condition_expression) ||
_returnvalues == returnvalues::ALL_OLD;
}
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) override {
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const override {
std::unordered_set<std::string> used_attribute_values;
std::unordered_set<std::string> used_attribute_names;
if (!verify_expected(_request, previous_item) ||
@@ -1421,6 +1490,7 @@ public:
// efficient than throwing an exception.
return {};
}
_return_attributes = {};
if (_returnvalues == returnvalues::ALL_OLD && previous_item) {
rjson::value* item = rjson::find(*previous_item, "Item");
if (item) {
@@ -1502,9 +1572,9 @@ public:
put_or_delete_item_cas_request(schema_ptr s, std::vector<put_or_delete_item>&& b) :
schema(std::move(s)), _mutation_builders(std::move(b)) { }
virtual ~put_or_delete_item_cas_request() = default;
virtual std::optional<mutation> apply(query::result& qr, const query::partition_slice& slice, api::timestamp_type ts) override {
virtual std::optional<mutation> apply(foreign_ptr<lw_shared_ptr<query::result>> qr, const query::partition_slice& slice, api::timestamp_type ts) override {
std::optional<mutation> ret;
for (put_or_delete_item& mutation_builder : _mutation_builders) {
for (const put_or_delete_item& mutation_builder : _mutation_builders) {
// We assume all these builders have the same partition.
if (ret) {
ret->apply(mutation_builder.build(schema, ts));
@@ -1519,9 +1589,8 @@ public:
static future<> cas_write(service::storage_proxy& proxy, schema_ptr schema, dht::decorated_key dk, std::vector<put_or_delete_item>&& mutation_builders,
service::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit) {
auto timeout = default_timeout();
auto read_command = read_nothing_read_command(schema);
auto op = seastar::make_shared<put_or_delete_item_cas_request>(schema, std::move(mutation_builders));
return proxy.cas(schema, op, read_command, to_partition_ranges(dk),
return proxy.cas(schema, op, nullptr, to_partition_ranges(dk),
{timeout, std::move(permit), client_state, trace_state},
db::consistency_level::LOCAL_SERIAL, db::consistency_level::LOCAL_QUORUM,
timeout, timeout).discard_result();
@@ -1597,7 +1666,7 @@ static future<> do_batch_write(service::storage_proxy& proxy,
return parallel_for_each(std::move(key_builders), [&proxy, &client_state, &stats, trace_state, ssg, permit = std::move(permit)] (auto& e) {
stats.write_using_lwt++;
auto desired_shard = service::storage_proxy::cas_shard(*e.first.schema, e.first.dk.token());
if (desired_shard == engine().cpu_id()) {
if (desired_shard == this_shard_id()) {
return cas_write(proxy, e.first.schema, e.first.dk, std::move(e.second), client_state, trace_state, permit);
} else {
stats.shard_bounce_for_lwt++;
@@ -2329,7 +2398,7 @@ public:
update_item_operation(service::storage_proxy& proxy, rjson::value&& request);
virtual ~update_item_operation() = default;
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) override;
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const override;
bool needs_read_before_write() const;
};
@@ -2393,7 +2462,7 @@ update_item_operation::needs_read_before_write() const {
}
std::optional<mutation>
update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) {
update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const {
std::unordered_set<std::string> used_attribute_values;
std::unordered_set<std::string> used_attribute_names;
if (!verify_expected(_request, previous_item) ||
@@ -2773,6 +2842,7 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
[] (std::vector<std::tuple<std::string, std::optional<rjson::value>>> responses) {
rjson::value response = rjson::empty_object();
rjson::set(response, "Responses", rjson::empty_object());
rjson::set(response, "UnprocessedKeys", rjson::empty_object());
for (auto& t : responses) {
if (!response["Responses"].HasMember(std::get<0>(t).c_str())) {
rjson::set_with_string_name(response["Responses"], std::get<0>(t), rjson::empty_array());
@@ -2889,6 +2959,7 @@ static future<executor::request_return_type> do_query(schema_ptr schema,
uint32_t limit,
db::consistency_level cl,
::shared_ptr<cql3::restrictions::statement_restrictions> filtering_restrictions,
query::partition_slice::option_set custom_opts,
service::client_state& client_state,
cql3::cql_stats& cql_stats,
tracing::trace_state_ptr trace_state,
@@ -2908,8 +2979,12 @@ static future<executor::request_return_type> do_query(schema_ptr schema,
auto regular_columns = boost::copy_range<query::column_id_vector>(
schema->regular_columns() | boost::adaptors::transformed([] (const column_definition& cdef) { return cdef.id; }));
auto static_columns = boost::copy_range<query::column_id_vector>(
schema->static_columns() | boost::adaptors::transformed([] (const column_definition& cdef) { return cdef.id; }));
auto selection = cql3::selection::selection::wildcard(schema);
auto partition_slice = query::partition_slice(std::move(ck_bounds), {}, std::move(regular_columns), selection->get_query_options());
query::partition_slice::option_set opts = selection->get_query_options();
opts.add(custom_opts);
auto partition_slice = query::partition_slice(std::move(ck_bounds), std::move(static_columns), std::move(regular_columns), opts);
auto command = ::make_lw_shared<query::read_command>(schema->id(), schema->version(), partition_slice, query::max_partitions);
auto query_state_ptr = std::make_unique<service::query_state>(client_state, trace_state, std::move(permit));
@@ -2939,11 +3014,38 @@ static future<executor::request_return_type> do_query(schema_ptr schema,
});
}
// Returns the token at which segment number `segment` starts when the token
// space is cut into `total_segments` equally wide parts.
//
// The width of one part is UINT64_MAX / total_segments; the result is the
// smallest int64 value advanced by `segment` such widths. The addition is
// carried out in unsigned 64-bit arithmetic before the value is handed to
// dht::token::from_int64().
//
// Preconditions (asserted): total_segments > 1 and
// 0 <= segment < total_segments.
static dht::token token_for_segment(int segment, int total_segments) {
    assert(total_segments > 1 && segment >= 0 && segment < total_segments);
    const uint64_t segment_width = std::numeric_limits<uint64_t>::max() / total_segments;
    const uint64_t offset_from_min = segment_width * segment;
    return dht::token::from_int64(std::numeric_limits<int64_t>::min() + offset_from_min);
}
// Returns the partition range scanned by segment `segment` out of
// `total_segments` segments of a parallel scan.
//
//  - A single segment covers everything (open-ended on both sides).
//  - The first segment has no lower bound and ends at the start token of
//    segment 1.
//  - The last segment starts at its own start token and has no upper bound.
//  - Any other segment runs from its own start token to the start token of
//    the following segment.
//
// Upper bounds are built from ring_position::ending_at() and passed `false`
// as the bound's second argument; lower bounds are built from
// ring_position::starting_at() with the bound's default second argument.
static dht::partition_range get_range_for_segment(int segment, int total_segments) {
    using range_bound = dht::partition_range::bound;

    if (total_segments == 1) {
        return dht::partition_range::make_open_ended_both_sides();
    }

    if (segment == 0) {
        // First segment: only an upper bound.
        dht::token upper_token = token_for_segment(1, total_segments);
        range_bound upper(dht::ring_position::ending_at(upper_token), false);
        return dht::partition_range::make_ending_with(upper);
    }

    dht::token lower_token = token_for_segment(segment, total_segments);

    if (segment == total_segments - 1) {
        // Last segment: only a lower bound.
        range_bound lower(dht::ring_position::starting_at(lower_token));
        return dht::partition_range::make_starting_with(lower);
    }

    // Middle segment: bounded on both sides.
    dht::token upper_token = token_for_segment(segment + 1, total_segments);
    return dht::partition_range::make(
        range_bound(dht::ring_position::starting_at(lower_token)),
        range_bound(dht::ring_position::ending_at(upper_token), false)
    );
}
// TODO(sarna):
// 1. Paging must have 1MB boundary according to the docs. IIRC we do have a replica-side reply size limit though - verify.
// 2. Filtering - by passing appropriately created restrictions to pager as a last parameter
// 3. Proper timeouts instead of gc_clock::now() and db::no_timeout
// 4. Implement parallel scanning via Segments
future<executor::request_return_type> executor::scan(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request) {
_stats.api_operations.scan++;
elogger.trace("Scanning {}", request);
@@ -2954,10 +3056,21 @@ future<executor::request_return_type> executor::scan(client_state& client_state,
return make_ready_future<request_return_type>(api_error("ValidationException",
"FilterExpression is not yet implemented in alternator"));
}
if (get_int_attribute(request, "Segment") || get_int_attribute(request, "TotalSegments")) {
// FIXME: need to support parallel scan. See issue #5059.
return make_ready_future<request_return_type>(api_error("ValidationException",
"Scan Segment/TotalSegments is not yet implemented in alternator"));
auto segment = get_int_attribute(request, "Segment");
auto total_segments = get_int_attribute(request, "TotalSegments");
if (segment || total_segments) {
if (!segment || !total_segments) {
return make_ready_future<request_return_type>(api_error("ValidationException",
"Both Segment and TotalSegments attributes need to be present for a parallel scan"));
}
if (*segment < 0 || *segment >= *total_segments) {
return make_ready_future<request_return_type>(api_error("ValidationException",
"Segment must be non-negative and less than TotalSegments"));
}
if (*total_segments < 0 || *total_segments > 1000000) {
return make_ready_future<request_return_type>(api_error("ValidationException",
"TotalSegments must be non-negative and less or equal to 1000000"));
}
}
rjson::value* exclusive_start_key = rjson::find(request, "ExclusiveStartKey");
@@ -2976,7 +3089,12 @@ future<executor::request_return_type> executor::scan(client_state& client_state,
auto attrs_to_get = calculate_attrs_to_get(request);
dht::partition_range_vector partition_ranges{dht::partition_range::make_open_ended_both_sides()};
dht::partition_range_vector partition_ranges;
if (segment) {
partition_ranges.push_back(get_range_for_segment(*segment, *total_segments));
} else {
partition_ranges.push_back(dht::partition_range::make_open_ended_both_sides());
}
std::vector<query::clustering_range> ck_bounds{query::clustering_range::make_open_ended_both_sides()};
::shared_ptr<cql3::restrictions::statement_restrictions> filtering_restrictions;
@@ -2986,14 +3104,15 @@ future<executor::request_return_type> executor::scan(client_state& client_state,
partition_ranges = filtering_restrictions->get_partition_key_ranges(query_options);
ck_bounds = filtering_restrictions->get_clustering_bounds(query_options);
}
return do_query(schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl, std::move(filtering_restrictions), client_state, _stats.cql_stats, trace_state, std::move(permit));
return do_query(schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl,
std::move(filtering_restrictions), query::partition_slice::option_set(), client_state, _stats.cql_stats, trace_state, std::move(permit));
}
static dht::partition_range calculate_pk_bound(schema_ptr schema, const column_definition& pk_cdef, comparison_operator_type op, const rjson::value& attrs) {
if (attrs.Size() != 1) {
throw api_error("ValidationException", format("Only a single attribute is allowed for a hash key restriction: {}", attrs));
}
bytes raw_value = pk_cdef.type->from_string(attrs[0][type_to_string(pk_cdef.type)].GetString());
bytes raw_value = get_key_from_typed_value(attrs[0], pk_cdef);
partition_key pk = partition_key::from_singular(*schema, pk_cdef.type->deserialize(raw_value));
auto decorated_key = dht::decorate_key(*schema, pk);
if (op != comparison_operator_type::EQ) {
@@ -3018,7 +3137,7 @@ static query::clustering_range calculate_ck_bound(schema_ptr schema, const colum
if (attrs.Size() != expected_attrs_size) {
throw api_error("ValidationException", format("{} arguments expected for a sort key restriction: {}", expected_attrs_size, attrs));
}
bytes raw_value = ck_cdef.type->from_string(attrs[0][type_to_string(ck_cdef.type)].GetString());
bytes raw_value = get_key_from_typed_value(attrs[0], ck_cdef);
clustering_key ck = clustering_key::from_single_value(*schema, raw_value);
switch (op) {
case comparison_operator_type::EQ:
@@ -3032,7 +3151,7 @@ static query::clustering_range calculate_ck_bound(schema_ptr schema, const colum
case comparison_operator_type::GT:
return query::clustering_range::make_starting_with(query::clustering_range::bound(ck, false));
case comparison_operator_type::BETWEEN: {
bytes raw_upper_limit = ck_cdef.type->from_string(attrs[1][type_to_string(ck_cdef.type)].GetString());
bytes raw_upper_limit = get_key_from_typed_value(attrs[1], ck_cdef);
clustering_key upper_limit = clustering_key::from_single_value(*schema, raw_upper_limit);
return query::clustering_range::make(query::clustering_range::bound(ck), query::clustering_range::bound(upper_limit));
}
@@ -3045,9 +3164,7 @@ static query::clustering_range calculate_ck_bound(schema_ptr schema, const colum
if (!ck_cdef.type->is_compatible_with(*utf8_type)) {
throw api_error("ValidationException", format("BEGINS_WITH operator cannot be applied to type {}", type_to_string(ck_cdef.type)));
}
std::string raw_upper_limit_str = attrs[0][type_to_string(ck_cdef.type)].GetString();
bytes raw_upper_limit = ck_cdef.type->from_string(raw_upper_limit_str);
return get_clustering_range_for_begins_with(std::move(raw_upper_limit), ck, schema, ck_cdef.type);
return get_clustering_range_for_begins_with(std::move(raw_value), ck, schema, ck_cdef.type);
}
default:
throw api_error("ValidationException", format("Unknown primary key bound passed: {}", int(op)));
@@ -3429,11 +3546,7 @@ future<executor::request_return_type> executor::query(client_state& client_state
if (rjson::find(request, "FilterExpression")) {
return make_ready_future<request_return_type>(api_error("ValidationException", "FilterExpression is not yet implemented in alternator"));
}
bool forward = get_bool_attribute(request, "ScanIndexForward", true);
if (!forward) {
// FIXME: need to support the !forward (i.e., reverse sort order) case. See issue #5153.
return make_ready_future<request_return_type>(api_error("ValidationException", "ScanIndexForward=false is not yet implemented in alternator"));
}
const bool forward = get_bool_attribute(request, "ScanIndexForward", true);
rjson::value* key_conditions = rjson::find(request, "KeyConditions");
rjson::value* key_condition_expression = rjson::find(request, "KeyConditionExpression");
@@ -3476,7 +3589,10 @@ future<executor::request_return_type> executor::query(client_state& client_state
}
verify_all_are_used(request, "ExpressionAttributeValues", used_attribute_values, "KeyConditionExpression");
verify_all_are_used(request, "ExpressionAttributeNames", used_attribute_names, "KeyConditionExpression");
return do_query(schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl, std::move(filtering_restrictions), client_state, _stats.cql_stats, std::move(trace_state), std::move(permit));
query::partition_slice::option_set opts;
opts.set_if<query::partition_slice::option::reversed>(!forward);
return do_query(schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl,
std::move(filtering_restrictions), opts, client_state, _stats.cql_stats, std::move(trace_state), std::move(permit));
}
future<executor::request_return_type> executor::list_tables(client_state& client_state, service_permit permit, rjson::value request) {
@@ -3567,12 +3683,12 @@ static std::map<sstring, sstring> get_network_topology_options(int rf) {
// manually create the keyspace to override this predefined behavior.
future<> executor::create_keyspace(std::string_view keyspace_name) {
sstring keyspace_name_str(keyspace_name);
return gms::get_up_endpoint_count().then([this, keyspace_name_str = std::move(keyspace_name_str)] (int up_endpoint_count) {
return gms::get_all_endpoint_count().then([this, keyspace_name_str = std::move(keyspace_name_str)] (int endpoint_count) {
int rf = 3;
if (up_endpoint_count < rf) {
if (endpoint_count < rf) {
rf = 1;
elogger.warn("Creating keyspace '{}' for Alternator with unsafe RF={} because cluster only has {} live nodes.",
keyspace_name_str, rf, up_endpoint_count);
elogger.warn("Creating keyspace '{}' for Alternator with unsafe RF={} because cluster only has {} nodes.",
keyspace_name_str, rf, endpoint_count);
}
auto opts = get_network_topology_options(rf);
auto ksm = keyspace_metadata::new_keyspace(keyspace_name_str, "org.apache.cassandra.locator.NetworkTopologyStrategy", std::move(opts), true);

View File

@@ -50,6 +50,7 @@ public:
stats _stats;
static constexpr auto ATTRS_COLUMN_NAME = ":attrs";
static constexpr auto KEYSPACE_NAME_PREFIX = "alternator_";
static constexpr std::string_view INTERNAL_TABLE_PREFIX = ".scylla.alternator.";
executor(service::storage_proxy& proxy, service::migration_manager& mm, smp_service_group ssg)
: _proxy(proxy), _mm(mm), _ssg(ssg) {}

View File

@@ -123,7 +123,7 @@ protected:
std::string print(const rjson::value& value) {
string_buffer buffer;
guarded_yieldable_json_handler<writer, false> writer(buffer, 39);
guarded_yieldable_json_handler<writer, false> writer(buffer, 78);
value.Accept(writer);
return std::string(buffer.GetString());
}
@@ -133,7 +133,7 @@ rjson::value copy(const rjson::value& value) {
}
rjson::value parse(std::string_view str) {
guarded_yieldable_json_handler<document, false> d(39);
guarded_yieldable_json_handler<document, false> d(78);
d.Parse(str.data(), str.size());
if (d.HasParseError()) {
throw rjson::error(format("Parsing JSON failed: {}", GetParseError_En(d.GetParseError())));
@@ -143,7 +143,7 @@ rjson::value parse(std::string_view str) {
}
rjson::value parse_yieldable(std::string_view str) {
guarded_yieldable_json_handler<document, true> d(39);
guarded_yieldable_json_handler<document, true> d(78);
d.Parse(str.data(), str.size());
if (d.HasParseError()) {
throw rjson::error(format("Parsing JSON failed: {}", GetParseError_En(d.GetParseError())));

View File

@@ -63,6 +63,10 @@ public:
static write_isolation get_write_isolation_for_schema(schema_ptr schema);
static write_isolation default_write_isolation;
public:
static void set_default_write_isolation(std::string_view mode);
protected:
// The full request JSON
rjson::value _request;
@@ -83,7 +87,11 @@ protected:
// When _returnvalues != NONE, apply() should store here, in JSON form,
// the values which are to be returned in the "Attributes" field.
// The default null JSON means do not return an Attributes field at all.
rjson::value _return_attributes;
// This field is marked "mutable" so that the const apply() can modify
// it (see explanation below), but note that because apply() may be
// called more than once, if apply() will sometimes set this field it
// must set it (even if just to the default empty value) every time.
mutable rjson::value _return_attributes;
public:
// The constructor of a rmw_operation subclass should parse the request
// and try to discover as many input errors as it can before really
@@ -96,9 +104,14 @@ public:
// conditional expression, apply() should return an empty optional.
// apply() may throw if it encounters input errors not discovered during
// the constructor.
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) = 0;
// apply() may be called more than once in case of contention, so it must
// not change the state saved in the object (issue #7218 was caused by
// violating this). We mark apply() "const" to let the compiler validate
// this for us. The output-only field _return_attributes is marked
// "mutable" above so that apply() can still write to it.
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const = 0;
// Convert the above apply() into the signature needed by cas_request:
virtual std::optional<mutation> apply(query::result& qr, const query::partition_slice& slice, api::timestamp_type ts) override;
virtual std::optional<mutation> apply(foreign_ptr<lw_shared_ptr<query::result>> qr, const query::partition_slice& slice, api::timestamp_type ts) override;
virtual ~rmw_operation() = default;
schema_ptr schema() const { return _schema; }
const rjson::value& request() const { return _request; }

View File

@@ -121,7 +121,7 @@ struct to_json_visitor {
}
// default
void operator()(const abstract_type& t) const {
rjson::set_with_string_name(deserialized, type_ident, rjson::parse(t.to_string(bytes(bv))));
rjson::set_with_string_name(deserialized, type_ident, rjson::parse(to_json_string(t, bytes(bv))));
}
};
@@ -153,7 +153,9 @@ std::string type_to_string(data_type type) {
};
auto it = types.find(type);
if (it == types.end()) {
throw std::runtime_error(format("Unknown type {}", type->name()));
// fall back to string, in order to be able to present
// internal Scylla types in a human-readable way
return "S";
}
return it->second;
}
@@ -205,8 +207,11 @@ rjson::value json_key_column_value(bytes_view cell, const column_definition& col
auto s = to_json_string(*decimal_type, bytes(cell));
return rjson::from_string(s);
} else {
// We shouldn't get here, we shouldn't see such key columns.
throw std::runtime_error(format("Unexpected key type: {}", column.type->name()));
// Support for arbitrary key types is useful for parsing values of virtual tables,
// which can involve any type supported by Scylla.
// In order to guarantee that the returned type is parsable by alternator clients,
// they are represented simply as strings.
return rjson::from_string(column.type->to_string(bytes(cell)));
}
}

View File

@@ -69,7 +69,7 @@ class api_handler : public handler_base {
public:
api_handler(const std::function<future<executor::request_return_type>(std::unique_ptr<request> req)>& _handle) : _f_handle(
[this, _handle](std::unique_ptr<request> req, std::unique_ptr<reply> rep) {
return seastar::futurize_apply(_handle, std::move(req)).then_wrapped([this, rep = std::move(rep)](future<executor::request_return_type> resf) mutable {
return seastar::futurize_invoke(_handle, std::move(req)).then_wrapped([this, rep = std::move(rep)](future<executor::request_return_type> resf) mutable {
if (resf.failed()) {
// Exceptions of type api_error are wrapped as JSON and
// returned to the client as expected. Other types of
@@ -409,15 +409,19 @@ future<> server::init(net::inet_address addr, std::optional<uint16_t> port, std:
_http_server.set_content_length_limit(server::content_length_limit);
_http_server.listen(socket_address{addr, *port}).get();
_enabled_servers.push_back(std::ref(_http_server));
slogger.info("Alternator HTTP server listening on {} port {}", addr, *port);
}
if (https_port) {
set_routes(_https_server._routes);
_https_server.set_content_length_limit(server::content_length_limit);
_https_server.set_tls_credentials(creds->build_server_credentials());
_https_server.set_tls_credentials(creds->build_reloadable_server_credentials([](const std::unordered_set<sstring>& files, std::exception_ptr ep) {
if (ep) {
slogger.warn("Exception loading {}: {}", files, ep);
} else {
slogger.info("Reloaded {}", files);
}
}).get0());
_https_server.listen(socket_address{addr, *https_port}).get();
_enabled_servers.push_back(std::ref(_https_server));
slogger.info("Alternator HTTPS server listening on {} port {}", addr, *https_port);
}
} catch (...) {
slogger.error("Failed to set up Alternator HTTP server on {} port {}, TLS port {}: {}",

View File

@@ -380,16 +380,54 @@
"operations":[
{
"method":"GET",
"summary":"check if the auto compaction disabled",
"summary":"check if the auto_compaction property is enabled for a given table",
"type":"boolean",
"nickname":"is_auto_compaction_disabled",
"nickname":"get_auto_compaction",
"produces":[
"application/json"
],
"parameters":[
{
"name":"name",
"description":"The column family name in keyspace:name format",
"description":"The table name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"path"
}
]
},
{
"method":"POST",
"summary":"Enable table auto compaction",
"type":"void",
"nickname":"enable_auto_compaction",
"produces":[
"application/json"
],
"parameters":[
{
"name":"name",
"description":"The table name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",
"paramType":"path"
}
]
},
{
"method":"DELETE",
"summary":"Disable table auto compaction",
"type":"void",
"nickname":"disable_auto_compaction",
"produces":[
"application/json"
],
"parameters":[
{
"name":"name",
"description":"The table name in keyspace:name format",
"required":true,
"allowMultiple":false,
"type":"string",

View File

@@ -208,9 +208,11 @@ void set_cache_service(http_context& ctx, routes& r) {
});
cs::get_row_capacity.set(r, [&ctx] (std::unique_ptr<request> req) {
return map_reduce_cf(ctx, uint64_t(0), [](const column_family& cf) {
return cf.get_row_cache().get_cache_tracker().region().occupancy().used_space();
}, std::plus<uint64_t>());
return ctx.db.map_reduce0([](database& db) -> uint64_t {
return db.row_cache_tracker().region().occupancy().used_space();
}, uint64_t(0), std::plus<uint64_t>()).then([](const int64_t& res) {
return make_ready_future<json::json_return_type>(res);
});
});
cs::get_row_hits.set(r, [&ctx] (std::unique_ptr<request> req) {
@@ -251,15 +253,19 @@ void set_cache_service(http_context& ctx, routes& r) {
cs::get_row_size.set(r, [&ctx] (std::unique_ptr<request> req) {
// In origin row size is the weighted size.
// We currently do not support weights, so we use num entries instead
return map_reduce_cf(ctx, 0, [](const column_family& cf) {
return cf.get_row_cache().partitions();
}, std::plus<uint64_t>());
return ctx.db.map_reduce0([](database& db) -> uint64_t {
return db.row_cache_tracker().partitions();
}, uint64_t(0), std::plus<uint64_t>()).then([](const int64_t& res) {
return make_ready_future<json::json_return_type>(res);
});
});
cs::get_row_entries.set(r, [&ctx] (std::unique_ptr<request> req) {
return map_reduce_cf(ctx, 0, [](const column_family& cf) {
return cf.get_row_cache().partitions();
}, std::plus<uint64_t>());
return ctx.db.map_reduce0([](database& db) -> uint64_t {
return db.row_cache_tracker().partitions();
}, uint64_t(0), std::plus<uint64_t>()).then([](const int64_t& res) {
return make_ready_future<json::json_return_type>(res);
});
});
cs::get_counter_capacity.set(r, [] (std::unique_ptr<request> req) {

View File

@@ -804,14 +804,14 @@ void set_column_family(http_context& ctx, routes& r) {
cf::get_cas_propose.set(r, [&ctx] (std::unique_ptr<request> req) {
return map_reduce_cf(ctx, req->param["name"], utils::estimated_histogram(0), [](column_family& cf) {
return cf.get_stats().estimated_cas_propose;
return cf.get_stats().estimated_cas_accept;
},
utils::estimated_histogram_merge, utils_json::estimated_histogram());
});
cf::get_cas_commit.set(r, [&ctx] (std::unique_ptr<request> req) {
return map_reduce_cf(ctx, req->param["name"], utils::estimated_histogram(0), [](column_family& cf) {
return cf.get_stats().estimated_cas_commit;
return cf.get_stats().estimated_cas_learn;
},
utils::estimated_histogram_merge, utils_json::estimated_histogram());
});
@@ -839,11 +839,26 @@ void set_column_family(http_context& ctx, routes& r) {
return make_ready_future<json::json_return_type>(res);
});
cf::is_auto_compaction_disabled.set(r, [] (const_req req) {
// FIXME
// currently auto compaction is disable
// it should be changed when it would have an API
return true;
cf::get_auto_compaction.set(r, [&ctx] (const_req req) {
const utils::UUID& uuid = get_uuid(req.param["name"], ctx.db.local());
column_family& cf = ctx.db.local().find_column_family(uuid);
return !cf.is_auto_compaction_disabled_by_user();
});
cf::enable_auto_compaction.set(r, [&ctx](std::unique_ptr<request> req) {
return foreach_column_family(ctx, req->param["name"], [](column_family &cf) {
cf.enable_auto_compaction();
}).then([] {
return make_ready_future<json::json_return_type>(json_void());
});
});
cf::disable_auto_compaction.set(r, [&ctx](std::unique_ptr<request> req) {
return foreach_column_family(ctx, req->param["name"], [](column_family &cf) {
cf.disable_auto_compaction();
}).then([] {
return make_ready_future<json::json_return_type>(json_void());
});
});
cf::get_built_indexes.set(r, [&ctx](std::unique_ptr<request> req) {

View File

@@ -37,8 +37,9 @@ void set_error_injection(http_context& ctx, routes& r) {
sstring injection = req->param["injection"];
bool one_shot = req->get_query_param("one_shot") == "True";
auto& errinj = utils::get_local_injector();
errinj.enable_on_all(injection, one_shot);
return make_ready_future<json::json_return_type>(json::json_void());
return errinj.enable_on_all(injection, one_shot).then([] {
return make_ready_future<json::json_return_type>(json::json_void());
});
});
hf::get_enabled_injections_on_all.set(r, [](std::unique_ptr<request> req) {
@@ -51,14 +52,16 @@ void set_error_injection(http_context& ctx, routes& r) {
sstring injection = req->param["injection"];
auto& errinj = utils::get_local_injector();
errinj.disable_on_all(injection);
return make_ready_future<json::json_return_type>(json::json_void());
return errinj.disable_on_all(injection).then([] {
return make_ready_future<json::json_return_type>(json::json_void());
});
});
hf::disable_on_all.set(r, [](std::unique_ptr<request> req) {
auto& errinj = utils::get_local_injector();
errinj.disable_on_all();
return make_ready_future<json::json_return_type>(json::json_void());
return errinj.disable_on_all().then([] {
return make_ready_future<json::json_return_type>(json::json_void());
});
});
}

View File

@@ -54,26 +54,22 @@ static sstring validate_keyspace(http_context& ctx, const parameters& param) {
throw bad_param_exception("Keyspace " + param["keyspace"] + " Does not exist");
}
static std::vector<ss::token_range> describe_ring(const sstring& keyspace) {
std::vector<ss::token_range> res;
for (auto d : service::get_local_storage_service().describe_ring(keyspace)) {
ss::token_range r;
r.start_token = d._start_token;
r.end_token = d._end_token;
r.endpoints = d._endpoints;
r.rpc_endpoints = d._rpc_endpoints;
for (auto det : d._endpoint_details) {
ss::endpoint_detail ed;
ed.host = det._host;
ed.datacenter = det._datacenter;
if (det._rack != "") {
ed.rack = det._rack;
}
r.endpoint_details.push(ed);
static ss::token_range token_range_endpoints_to_json(const dht::token_range_endpoints& d) {
ss::token_range r;
r.start_token = d._start_token;
r.end_token = d._end_token;
r.endpoints = d._endpoints;
r.rpc_endpoints = d._rpc_endpoints;
for (auto det : d._endpoint_details) {
ss::endpoint_detail ed;
ed.host = det._host;
ed.datacenter = det._datacenter;
if (det._rack != "") {
ed.rack = det._rack;
}
res.push_back(r);
r.endpoint_details.push(ed);
}
return res;
return r;
}
using ks_cf_func = std::function<future<json::json_return_type>(http_context&, std::unique_ptr<request>, sstring, std::vector<sstring>)>;
@@ -89,6 +85,23 @@ static auto wrap_ks_cf(http_context &ctx, ks_cf_func f) {
};
}
// Turns auto compaction on (`enabled == true`) or off for the listed tables of
// `keyspace`, applying the change through ctx.db.invoke_on_all().
// When `tables` is empty, every table found in the keyspace's metadata is used.
future<> set_tables_autocompaction(http_context& ctx, const sstring &keyspace, std::vector<sstring> tables, bool enabled) {
    if (tables.empty()) {
        // No explicit selection: fall back to all tables of the keyspace.
        auto& ks = ctx.db.local().find_keyspace(keyspace);
        tables = map_keys(ks.metadata().get()->cf_meta_data());
    }
    // Everything is captured by value so each invocation owns its own copies.
    auto apply_on_db = [keyspace, tables, enabled] (database& db) {
        auto apply_on_table = [&db, keyspace, enabled](const sstring& table) mutable {
            column_family& cf = db.find_column_family(keyspace, table);
            if (!enabled) {
                cf.disable_auto_compaction();
            } else {
                cf.enable_auto_compaction();
            }
            return make_ready_future<>();
        };
        return parallel_for_each(tables, std::move(apply_on_table));
    };
    return ctx.db.invoke_on_all(std::move(apply_on_db));
}
void set_storage_service(http_context& ctx, routes& r) {
ss::local_hostid.set(r, [](std::unique_ptr<request> req) {
return db::system_keyspace::get_local_host_id().then([](const utils::UUID& id) {
@@ -175,13 +188,13 @@ void set_storage_service(http_context& ctx, routes& r) {
return make_ready_future<json::json_return_type>(res);
});
ss::describe_any_ring.set(r, [&ctx](const_req req) {
return describe_ring("");
ss::describe_any_ring.set(r, [&ctx](std::unique_ptr<request> req) {
return make_ready_future<json::json_return_type>(stream_range_as_array(service::get_local_storage_service().describe_ring(""), token_range_endpoints_to_json));
});
ss::describe_ring.set(r, [&ctx](const_req req) {
auto keyspace = validate_keyspace(ctx, req.param);
return describe_ring(keyspace);
ss::describe_ring.set(r, [&ctx](std::unique_ptr<request> req) {
auto keyspace = validate_keyspace(ctx, req->param);
return make_ready_future<json::json_return_type>(stream_range_as_array(service::get_local_storage_service().describe_ring(keyspace), token_range_endpoints_to_json));
});
ss::get_host_id_map.set(r, [&ctx](const_req req) {
@@ -256,8 +269,8 @@ void set_storage_service(http_context& ctx, routes& r) {
for (auto cf : column_families) {
column_families_vec.push_back(&db.find_column_family(keyspace, cf));
}
return parallel_for_each(column_families_vec, [&cm] (column_family* cf) {
return cm.perform_cleanup(cf);
return parallel_for_each(column_families_vec, [&cm, &db] (column_family* cf) {
return cm.perform_cleanup(db, cf);
});
}).then([]{
return make_ready_future<json::json_return_type>(0);
@@ -648,7 +661,7 @@ void set_storage_service(http_context& ctx, routes& r) {
ss::set_trace_probability.set(r, [](std::unique_ptr<request> req) {
auto probability = req->get_query_param("probability");
return futurize<json::json_return_type>::apply([probability] {
return futurize_invoke([probability] {
double real_prob = std::stod(probability.c_str());
return tracing::tracing::tracing_instance().invoke_on_all([real_prob] (auto& local_tracing) {
local_tracing.set_trace_probability(real_prob);
@@ -703,19 +716,19 @@ void set_storage_service(http_context& ctx, routes& r) {
});
ss::enable_auto_compaction.set(r, [&ctx](std::unique_ptr<request> req) {
//TBD
unimplemented();
auto keyspace = validate_keyspace(ctx, req->param);
auto column_family = req->get_query_param("cf");
return make_ready_future<json::json_return_type>(json_void());
auto tables = split_cf(req->get_query_param("cf"));
return set_tables_autocompaction(ctx, keyspace, tables, true).then([]{
return make_ready_future<json::json_return_type>(json_void());
});
});
ss::disable_auto_compaction.set(r, [&ctx](std::unique_ptr<request> req) {
//TBD
unimplemented();
auto keyspace = validate_keyspace(ctx, req->param);
auto column_family = req->get_query_param("cf");
return make_ready_future<json::json_return_type>(json_void());
auto tables = split_cf(req->get_query_param("cf"));
return set_tables_autocompaction(ctx, keyspace, tables, false).then([]{
return make_ready_future<json::json_return_type>(json_void());
});
});
ss::deliver_hints.set(r, [](std::unique_ptr<request> req) {
@@ -1000,6 +1013,9 @@ void set_snapshot(http_context& ctx, routes& r) {
if (column_family.empty()) {
resp = service::get_local_storage_service().take_snapshot(tag, keynames);
} else {
if (keynames.empty()) {
throw httpd::bad_param_exception("The keyspace of column families must be specified");
}
if (keynames.size() > 1) {
throw httpd::bad_param_exception("Only one keyspace allowed when specifying a column family");
}

View File

@@ -22,6 +22,7 @@
#include "api/api-doc/system.json.hh"
#include "api/api.hh"
#include <seastar/core/reactor.hh>
#include <seastar/http/exception.hh>
#include "log.hh"

View File

@@ -65,16 +65,16 @@ static future<> create_metadata_table_if_missing_impl(
std::string_view cql,
::service::migration_manager& mm) {
static auto ignore_existing = [] (seastar::noncopyable_function<future<>()> func) {
return futurize_apply(std::move(func)).handle_exception_type([] (exceptions::already_exists_exception& ignored) { });
return futurize_invoke(std::move(func)).handle_exception_type([] (exceptions::already_exists_exception& ignored) { });
};
auto& db = qp.db();
auto parsed_statement = static_pointer_cast<cql3::statements::raw::cf_statement>(
cql3::query_processor::parse_statement(cql));
auto parsed_statement = cql3::query_processor::parse_statement(cql);
auto& parsed_cf_statement = static_cast<cql3::statements::raw::cf_statement&>(*parsed_statement);
parsed_statement->prepare_keyspace(meta::AUTH_KS);
parsed_cf_statement.prepare_keyspace(meta::AUTH_KS);
auto statement = static_pointer_cast<cql3::statements::create_table_statement>(
parsed_statement->prepare(db, qp.get_cql_stats())->statement);
parsed_cf_statement.prepare(db, qp.get_cql_stats())->statement);
const auto schema = statement->get_cf_meta_data(qp.db());
const auto uuid = generate_legacy_id(schema->ks_name(), schema->cf_name());
@@ -92,7 +92,7 @@ future<> create_metadata_table_if_missing(
cql3::query_processor& qp,
std::string_view cql,
::service::migration_manager& mm) noexcept {
return futurize_apply(create_metadata_table_if_missing_impl, table_name, qp, cql, mm);
return futurize_invoke(create_metadata_table_if_missing_impl, table_name, qp, cql, mm);
}
future<> wait_for_schema_agreement(::service::migration_manager& mm, const database& db, seastar::abort_source& as) {

View File

@@ -27,9 +27,10 @@
#include <seastar/core/future.hh>
#include <seastar/core/abort_source.hh>
#include <seastar/util/noncopyable_function.hh>
#include <seastar/core/reactor.hh>
#include <seastar/core/seastar.hh>
#include <seastar/core/resource.hh>
#include <seastar/core/sstring.hh>
#include <seastar/core/smp.hh>
#include "log.hh"
#include "seastarx.hh"
@@ -61,7 +62,7 @@ extern const sstring AUTH_PACKAGE_NAME;
template <class Task>
future<> once_among_shards(Task&& f) {
if (engine().cpu_id() == 0u) {
if (this_shard_id() == 0u) {
return f();
}

View File

@@ -51,7 +51,7 @@ extern "C" {
#include <boost/algorithm/string/join.hpp>
#include <boost/range.hpp>
#include <seastar/core/reactor.hh>
#include <seastar/core/seastar.hh>
#include "auth/authenticated_user.hh"
#include "auth/common.hh"

View File

@@ -48,7 +48,7 @@
#include <optional>
#include <boost/algorithm/cxx11/all_of.hpp>
#include <seastar/core/reactor.hh>
#include <seastar/core/seastar.hh>
#include "auth/authenticated_user.hh"
#include "auth/common.hh"
@@ -230,7 +230,7 @@ future<authenticated_user> password_authenticator::authenticate(
// obsolete prepared statements pretty quickly.
// Rely on query processing caching statements instead, and lets assume
// that a map lookup string->statement is not gonna kill us much.
return futurize_apply([this, username, password] {
return futurize_invoke([this, username, password] {
static const sstring query = format("SELECT {} FROM {} WHERE {} = ?",
SALTED_HASH,
meta::roles_table::qualified_name(),

View File

@@ -33,6 +33,7 @@
#include "auth/resource.hh"
#include "seastarx.hh"
#include "exceptions/exceptions.hh"
namespace auth {
@@ -52,9 +53,9 @@ struct role_config_update final {
///
/// A logical argument error for a role-management operation.
///
class roles_argument_exception : public std::invalid_argument {
class roles_argument_exception : public exceptions::invalid_request_exception {
public:
using std::invalid_argument::invalid_argument;
using exceptions::invalid_request_exception::invalid_request_exception;
};
class role_already_exists : public roles_argument_exception {

View File

@@ -419,7 +419,7 @@ future<> create_role(
return make_ready_future<>();
}
return futurize_apply(
return futurize_invoke(
&validate_authentication_options_are_supported,
options,
ser.underlying_authenticator().supported_options()).then([&ser, name, &options] {
@@ -443,7 +443,7 @@ future<> alter_role(
return make_ready_future<>();
}
return futurize_apply(
return futurize_invoke(
&validate_authentication_options_are_supported,
options,
ser.underlying_authenticator().supported_options()).then([&ser, name, &options] {

View File

@@ -158,7 +158,7 @@ public:
}
virtual future<authenticated_user> get_authenticated_user() const {
return futurize_apply([this] {
return futurize_invoke([this] {
return _sasl->get_authenticated_user().handle_exception([](auto ep) {
try {
std::rethrow_exception(ep);

View File

@@ -176,7 +176,7 @@ public:
return make_ready_future<>();
}
virtual future<> fast_forward_to(position_range pr, db::timeout_clock::time_point timeout) override {
throw std::bad_function_call();
return make_exception_future<>(make_backtraced_exception_ptr<std::bad_function_call>());
}
};

65
cdc/cdc_partitioner.cc Normal file
View File

@@ -0,0 +1,65 @@
/*
* Copyright (C) 2020 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#include "cdc_partitioner.hh"
#include "dht/token.hh"
#include "schema.hh"
#include "sstables/key.hh"
#include "utils/class_registrator.hh"
#include "cdc/generation.hh"
#include "keys.hh"
static const sstring cdc_partitioner_name = "com.scylladb.dht.CDCPartitioner";
namespace cdc {
// Returns this partitioner's name: the file-level cdc_partitioner_name constant.
const sstring cdc_partitioner::name() const {
return cdc_partitioner_name;
}
// Wraps a raw 64-bit value in a dht::token of kind `key`.
static dht::token to_token(int64_t value) {
return dht::token(dht::token::kind::key, value);
}
// Maps a serialized key to its token.
// The key is expected to be 16 bytes long (two int64 values); the token is
// derived from the first 8 bytes via stream_id::token_from_bytes().
// A key of any other size maps to dht::minimum_token().
static dht::token to_token(bytes_view key) {
    constexpr auto expected_key_size = 2 * sizeof(int64_t);
    if (key.size() == expected_key_size) {
        return to_token(stream_id::token_from_bytes(key));
    }
    return dht::minimum_token();
}
// Computes the token for an sstable key by converting it to a bytes_view and
// delegating to to_token(bytes_view).
dht::token
cdc_partitioner::get_token(const sstables::key_view& key) const {
    const bytes_view raw_key(key);
    return to_token(raw_key);
}
// Computes the token for a partition key: the key is exploded against schema
// `s` and only its first component is fed to to_token(bytes_view).
// NOTE(review): assumes the exploded key has at least one component - confirm
// against callers.
dht::token
cdc_partitioner::get_token(const schema& s, partition_key_view key) const {
    const auto components = key.explode(s);
    const bytes_view first_component = components[0];
    return to_token(first_component);
}
// Register cdc_partitioner as a dht::i_partitioner implementation under two
// names: the fully-qualified cdc_partitioner_name and the short "CDCPartitioner".
using registry = class_registrator<dht::i_partitioner, cdc_partitioner>;
static registry registrator(cdc_partitioner_name);
static registry registrator_short_name("CDCPartitioner");
}

48
cdc/cdc_partitioner.hh Normal file
View File

@@ -0,0 +1,48 @@
/*
* Copyright (C) 2020 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include <seastar/core/sstring.hh>
#include "bytes.hh"
#include "dht/i_partitioner.hh"
class schema;
class partition_key_view;
namespace sstables {
class key_view;
}
namespace cdc {
// dht::i_partitioner implementation for CDC.
// Only declarations live here; all three overrides are defined out of line.
struct cdc_partitioner final : public dht::i_partitioner {
cdc_partitioner() = default;
// Name of this partitioner.
virtual const sstring name() const override;
// Token for a partition key, interpreted against schema `s`.
virtual dht::token get_token(const schema& s, partition_key_view key) const override;
// Token for an sstable key view.
virtual dht::token get_token(const sstables::key_view& key) const override;
};
}

View File

@@ -80,7 +80,7 @@ bool stream_id::operator<(const stream_id& o) const {
return _value < o._value;
}
static int64_t bytes_to_int64(const bytes& b, size_t offset) {
static int64_t bytes_to_int64(bytes_view b, size_t offset) {
assert(b.size() >= offset + sizeof(int64_t));
int64_t res;
std::copy_n(b.begin() + offset, sizeof(int64_t), reinterpret_cast<int8_t *>(&res));
@@ -88,13 +88,17 @@ static int64_t bytes_to_int64(const bytes& b, size_t offset) {
}
int64_t stream_id::first() const {
return bytes_to_int64(_value, 0);
return token_from_bytes(_value);
}
int64_t stream_id::second() const {
return bytes_to_int64(_value, sizeof(int64_t));
}
// Decodes the int64 stored at offset 0 of `b` using bytes_to_int64(), which
// asserts that `b` holds at least sizeof(int64_t) bytes from that offset.
int64_t stream_id::token_from_bytes(bytes_view b) {
return bytes_to_int64(b, 0);
}
const bytes& stream_id::to_bytes() const {
return _value;
}
@@ -119,176 +123,110 @@ const std::vector<token_range_description>& topology_description::entries() cons
return _entries;
}
static stream_id make_random_stream_id() {
static stream_id create_stream_id(dht::token t) {
static thread_local std::mt19937_64 rand_gen(std::random_device().operator()());
static thread_local std::uniform_int_distribution<int64_t> rand_dist(std::numeric_limits<int64_t>::min());
return {rand_dist(rand_gen), rand_dist(rand_gen)};
return {dht::token::to_int64(t), rand_dist(rand_gen)};
}
/* Given:
* 1. a set of tokens which split the token ring into token ranges (vnodes),
* 2. information on how each token range is distributed among its owning node's shards
* this function tries to generate a set of CDC stream identifiers such that for each
* shard and vnode pair there exists a stream whose token falls into this
* vnode and is owned by this shard.
*
* It then builds a cdc::topology_description which maps tokens to these
* found stream identifiers, such that if token T is owned by shard S in vnode V,
* it gets mapped to the stream identifier generated for (S, V).
*/
// Run in seastar::async context.
topology_description generate_topology_description(
const db::config& cfg,
const std::unordered_set<dht::token>& bootstrap_tokens,
const locator::token_metadata& token_metadata,
const gms::gossiper& gossiper) {
if (bootstrap_tokens.empty()) {
throw std::runtime_error(
"cdc: bootstrap tokens is empty in generate_topology_description");
class topology_description_generator final {
const db::config& _cfg;
const std::unordered_set<dht::token>& _bootstrap_tokens;
const locator::token_metadata& _token_metadata;
const gms::gossiper& _gossiper;
// Compute a set of tokens that split the token ring into vnodes
auto get_tokens() const {
    // Start from a copy of the token metadata's sorted tokens.
    auto ring_tokens = _token_metadata.sorted_tokens();
    // Append the bootstrap tokens; `appended` marks where they begin.
    const auto appended = ring_tokens.insert(
            ring_tokens.end(), _bootstrap_tokens.begin(), _bootstrap_tokens.end());
    // Sort only the appended tail, then merge the two sorted runs in place.
    std::sort(appended, ring_tokens.end());
    std::inplace_merge(ring_tokens.begin(), appended, ring_tokens.end());
    // Drop tokens present in both sets.
    const auto duplicates = std::unique(ring_tokens.begin(), ring_tokens.end());
    ring_tokens.erase(duplicates, ring_tokens.end());
    return ring_tokens;
}
auto tokens = token_metadata.sorted_tokens();
tokens.insert(tokens.end(), bootstrap_tokens.begin(), bootstrap_tokens.end());
std::sort(tokens.begin(), tokens.end());
tokens.erase(std::unique(tokens.begin(), tokens.end()), tokens.end());
std::vector<token_range_description> entries(tokens.size());
int spots_to_fill = 0;
for (size_t i = 0; i < tokens.size(); ++i) {
auto& entry = entries[i];
entry.token_range_end = tokens[i];
if (bootstrap_tokens.count(entry.token_range_end) > 0) {
entry.streams.resize(smp::count);
entry.sharding_ignore_msb = cfg.murmur3_partitioner_ignore_msb_bits();
// Fetch sharding parameters for a node that owns vnode ending with this.end
// Returns <shard_count, ignore_msb> pair.
std::pair<size_t, uint8_t> get_sharding_info(dht::token end) const {
if (_bootstrap_tokens.count(end) > 0) {
return {smp::count, _cfg.murmur3_partitioner_ignore_msb_bits()};
} else {
auto endpoint = token_metadata.get_endpoint(entry.token_range_end);
auto endpoint = _token_metadata.get_endpoint(end);
if (!endpoint) {
throw std::runtime_error(format("Can't find endpoint for token {}", entry.token_range_end));
}
auto sc = get_shard_count(*endpoint, gossiper);
entry.streams.resize(sc > 0 ? sc : 1);
entry.sharding_ignore_msb = get_sharding_ignore_msb(*endpoint, gossiper);
}
spots_to_fill += entry.streams.size();
}
auto schema = schema_builder("fake_ks", "fake_table")
.with_column("stream_id", bytes_type, column_kind::partition_key)
.build();
auto quota = std::chrono::seconds(spots_to_fill / 2000 + 1);
auto start_time = std::chrono::system_clock::now();
// For each pair (i, j), 0 <= i < streams.size(), 0 <= j < streams[i].size(),
// try to find a stream (stream[i][j]) such that the token of this stream will get mapped to this stream
// (refer to the comments above topology_description's definition to understand how it describes the mapping).
// We find the streams by randomly generating them and checking into which pairs they get mapped.
// NOTE: this algorithm is temporary and will be replaced after per-table-partitioner feature gets merged in.
repeat([&] {
for (int i = 0; i < 500; ++i) {
auto stream_id = make_random_stream_id();
auto token = dht::get_token(*schema, stream_id.to_partition_key(*schema));
// Find the token range into which our stream_id's token landed.
auto it = std::lower_bound(tokens.begin(), tokens.end(), token);
auto& entry = entries[it != tokens.end() ? std::distance(tokens.begin(), it) : 0];
auto shard_id = dht::shard_of(entry.streams.size(), entry.sharding_ignore_msb, token);
assert(shard_id < entry.streams.size());
if (!entry.streams[shard_id].is_set()) {
--spots_to_fill;
entry.streams[shard_id] = stream_id;
}
}
if (!spots_to_fill) {
return stop_iteration::yes;
}
auto now = std::chrono::system_clock::now();
auto passed = std::chrono::duration_cast<std::chrono::seconds>(now - start_time);
if (passed > quota) {
return stop_iteration::yes;
}
return stop_iteration::no;
}).get();
if (spots_to_fill) {
// We were not able to generate stream ids for each (token range, shard) pair.
// For each range that has a stream, for each shard for this range that doesn't have a stream,
// use the stream id of the next shard for this range.
// For each range that doesn't have any stream,
// use streams of the first range to the left which does have a stream.
cdc_log.warn("Generation of CDC streams failed to create streams for some (vnode, shard) pair."
" This can lead to worse performance.");
stream_id some_stream;
size_t idx = 0;
for (; idx < entries.size(); ++idx) {
for (auto s: entries[idx].streams) {
if (s.is_set()) {
some_stream = s;
break;
}
}
if (some_stream.is_set()) {
break;
}
}
assert(idx != entries.size() && some_stream.is_set());
// Iterate over all ranges in the clockwise direction, starting with the one we found a stream for.
for (size_t off = 0; off < entries.size(); ++off) {
auto& ss = entries[(idx + off) % entries.size()].streams;
int last_set_stream_idx = ss.size() - 1;
while (last_set_stream_idx > -1 && !ss[last_set_stream_idx].is_set()) {
--last_set_stream_idx;
}
if (last_set_stream_idx == -1) {
cdc_log.warn(
"CDC wasn't able to generate any stream for vnode ({}, {}]. We'll use another vnode's streams"
" instead. This might lead to inconsistencies.",
tokens[(idx + off + entries.size() - 1) % entries.size()], tokens[(idx + off) % entries.size()]);
ss[0] = some_stream;
last_set_stream_idx = 0;
}
some_stream = ss[last_set_stream_idx];
// Replace 'unset' stream ids with indexes below last_set_stream_idx
for (int s_idx = last_set_stream_idx - 1; s_idx > -1; --s_idx) {
if (ss[s_idx].is_set()) {
some_stream = ss[s_idx];
} else {
ss[s_idx] = some_stream;
}
}
// Replace 'unset' stream ids with indexes above last_set_stream_idx
for (int s_idx = ss.size() - 1; s_idx > last_set_stream_idx; --s_idx) {
if (ss[s_idx].is_set()) {
some_stream = ss[s_idx];
} else {
ss[s_idx] = some_stream;
}
throw std::runtime_error(
format("Can't find endpoint for token {}", end));
}
auto sc = get_shard_count(*endpoint, _gossiper);
return {sc > 0 ? sc : 1, get_sharding_ignore_msb(*endpoint, _gossiper)};
}
}
return {std::move(entries)};
}
// Builds the description of the vnode delimited by `start` and `end`
// (recorded as token_range_end = end): one stream id per shard of the owning
// node, each created from the token that dht::find_first_token_for_shard()
// returns for that shard.
token_range_description create_description(dht::token start, dht::token end) const {
    auto [shard_count, ignore_msb] = get_sharding_info(end);

    token_range_description desc;
    desc.token_range_end = end;
    desc.sharding_ignore_msb = ignore_msb;
    desc.streams.reserve(shard_count);

    dht::sharder sharder(shard_count, ignore_msb);
    for (size_t shard = 0; shard != shard_count; ++shard) {
        auto shard_token = dht::find_first_token_for_shard(sharder, start, end, shard);
        desc.streams.push_back(create_stream_id(shard_token));
    }
    return desc;
}
public:
// Binds the generator to its inputs. All four arguments are stored as
// references, so they must outlive the generator.
// Throws std::runtime_error if `bootstrap_tokens` is empty.
topology_description_generator(
const db::config& cfg,
const std::unordered_set<dht::token>& bootstrap_tokens,
const locator::token_metadata& token_metadata,
const gms::gossiper& gossiper)
: _cfg(cfg)
, _bootstrap_tokens(bootstrap_tokens)
, _token_metadata(token_metadata)
, _gossiper(gossiper)
{
// Reject an empty bootstrap token set up front.
if (_bootstrap_tokens.empty()) {
throw std::runtime_error(
"cdc: bootstrap tokens is empty in generate_topology_description");
}
}
/*
* Generate a set of CDC stream identifiers such that for each shard
* and vnode pair there exists a stream whose token falls into this vnode
* and is owned by this shard. It is sometimes not possible to generate
* a CDC stream identifier for some (vnode, shard) pair because not all
* shards have to own tokens in a vnode. A small vnode can be totally owned
* by a single shard. In such a case, a stream identifier that maps to
* the end of the vnode is generated.
*
* Then build a cdc::topology_description which maps tokens to generated
* stream identifiers, such that if token T is owned by shard S in vnode V,
* it gets mapped to the stream identifier generated for (S, V).
*/
// Run in seastar::async context.
topology_description generate() const {
    const auto tokens = get_tokens();

    std::vector<token_range_description> vnode_descriptions;
    vnode_descriptions.reserve(tokens.size());

    // Each vnode ends at tokens[idx] and starts at the preceding token; the
    // first vnode wraps around and starts at the last token of the ring.
    // The constructor guarantees a non-empty bootstrap token set, so `tokens`
    // is never empty here.
    for (size_t idx = 0; idx < tokens.size(); ++idx) {
        const auto& range_start = (idx == 0) ? tokens.back() : tokens[idx - 1];
        vnode_descriptions.push_back(create_description(range_start, tokens[idx]));
    }
    return {std::move(vnode_descriptions)};
}
};
bool should_propose_first_generation(const gms::inet_address& me, const gms::gossiper& g) {
auto my_host_id = g.get_host_id(me);
@@ -321,7 +259,7 @@ db_clock::time_point make_new_cdc_generation(
bool for_testing) {
assert(!bootstrap_tokens.empty());
auto gen = generate_topology_description(cfg, bootstrap_tokens, tm, g);
auto gen = topology_description_generator(cfg, bootstrap_tokens, tm, g).generate();
// Begin the race.
auto ts = db_clock::now() + (
@@ -335,12 +273,7 @@ db_clock::time_point make_new_cdc_generation(
std::optional<db_clock::time_point> get_streams_timestamp_for(const gms::inet_address& endpoint, const gms::gossiper& g) {
auto streams_ts_string = g.get_application_state_value(endpoint, gms::application_state::CDC_STREAMS_TIMESTAMP);
cdc_log.trace("endpoint={}, streams_ts_string={}", endpoint, streams_ts_string);
if (streams_ts_string.empty()) {
return {};
}
return db_clock::time_point(db_clock::duration(std::stoll(streams_ts_string)));
return gms::versioned_value::cdc_streams_timestamp_from_string(streams_ts_string);
}
// Run inside seastar::async context.

View File

@@ -77,6 +77,7 @@ public:
const bytes& to_bytes() const;
partition_key to_partition_key(const schema& log_schema) const;
static int64_t token_from_bytes(bytes_view);
};
/* Describes a mapping of tokens to CDC streams in a token range.
@@ -129,7 +130,7 @@ bool should_propose_first_generation(const gms::inet_address& me, const gms::gos
*/
future<db_clock::time_point> get_local_streams_timestamp();
/* Generate a new set of CDC streams and insert it into the distributed cdc_topology_description table.
/* Generate a new set of CDC streams and insert it into the distributed cdc_generations table.
* Returns the timestamp of this new generation.
*
* Should be called when starting the node for the first time (i.e., joining the ring).
@@ -158,9 +159,9 @@ db_clock::time_point make_new_cdc_generation(
std::optional<db_clock::time_point> get_streams_timestamp_for(const gms::inet_address& endpoint, const gms::gossiper&);
/* Inform CDC users about a generation of streams (identified by the given timestamp)
* by inserting it into the cdc_description table.
* by inserting it into the cdc_streams table.
*
* Assumes that the cdc_topology_description table contains this generation.
* Assumes that the cdc_generations table contains this generation.
*
* Returning from this function does not mean that the table update was successful: the function
* might run an asynchronous task in the background.

View File

@@ -239,7 +239,8 @@ public:
future<std::tuple<std::vector<mutation>, lw_shared_ptr<cdc::operation_result_tracker>>> augment_mutation_call(
lowres_clock::time_point timeout,
std::vector<mutation>&& mutations,
tracing::trace_state_ptr tr_state
tracing::trace_state_ptr tr_state,
db::consistency_level write_cl
);
template<typename Iter>
@@ -390,6 +391,7 @@ bytes log_data_column_deleted_elements_name_bytes(const bytes& column_name) {
static schema_ptr create_log_schema(const schema& s, std::optional<utils::UUID> uuid) {
schema_builder b(s.ks_name(), log_name(s.cf_name()));
b.with_partitioner("com.scylladb.dht.CDCPartitioner");
b.set_comment(sprint("CDC log for %s.%s", s.ks_name(), s.cf_name()));
b.with_column(log_meta_column_name_bytes("stream_id"), bytes_type, column_kind::partition_key);
b.with_column(log_meta_column_name_bytes("time"), timeuuid_type, column_kind::clustering_key);
@@ -399,9 +401,9 @@ static schema_ptr create_log_schema(const schema& s, std::optional<utils::UUID>
auto add_columns = [&] (const schema::const_iterator_range_type& columns, bool is_data_col = false) {
for (const auto& column : columns) {
auto type = column.type;
if (is_data_col) {
if (is_data_col && type->is_multi_cell()) {
type = visit(*type, make_visitor(
// lists are represented as map<timeuuid, value_type>. Otherwise we cannot express delta
// non-frozen lists are represented as map<timeuuid, value_type>. Otherwise we cannot express delta
[] (const list_type_impl& type) -> data_type {
return map_type_impl::get_instance(type.name_comparator(), type.value_comparator(), false);
},
@@ -410,7 +412,6 @@ static schema_ptr create_log_schema(const schema& s, std::optional<utils::UUID>
return type.freeze();
}
));
type = type->freeze();
}
b.with_column(log_data_column_name_bytes(column.name()), type);
if (is_data_col) {
@@ -715,6 +716,19 @@ private:
const column_definition& _op_col;
const column_definition& _ttl_col;
ttl_opt _cdc_ttl_opt;
/**
* #6070
* When mutation splitting was added, non-atomic column assignments were broken
* into two invocations of transform. This means the second one (the actual data
* assignment) does not know about the tombstone in the first one -> the postimage
* is created as if we were _adding_ to the collection, not replacing it.
*
* Not pretty, but to handle this we use the knowledge that we always get
* invoked in timestamp order -> tombstone first, then assign.
* So we simply keep track of non-atomic columns deleted across calls
* and filter out preimage data after this.
*/
std::unordered_set<const column_definition*> _non_atomic_column_deletes;
clustering_key set_pk_columns(const partition_key& pk, api::timestamp_type ts, bytes decomposed_tuuid, int batch_no, mutation& m) const {
const auto log_ck = clustering_key::from_exploded(
@@ -816,18 +830,18 @@ public:
// TODO: is pre-image data based on query enough. We only have actual column data. Do we need
// more details like tombstones/ttl? Probably not but keep in mind.
std::tuple<mutation, stats::part_type_set> transform(const mutation& m, const cql3::untyped_result_set* rs, api::timestamp_type ts, bytes tuuid, int& batch_no) const {
std::tuple<mutation, stats::part_type_set> transform(const mutation& m, const cql3::untyped_result_set* rs, api::timestamp_type ts, bytes tuuid, int& batch_no) {
auto stream_id = _ctx._cdc_metadata.get_stream(ts, m.token());
mutation res(_log_schema, stream_id.to_partition_key(*_log_schema));
const auto preimage = _schema->cdc_options().preimage();
const auto postimage = _schema->cdc_options().postimage();
stats::part_type_set touched_parts;
auto& p = m.partition();
if (p.partition_tombstone()) {
// Partition deletion
touched_parts.set<stats::part_type::PARTITION_DELETE>();
auto log_ck = set_pk_columns(m.key(), ts, tuuid, 0, res);
auto log_ck = set_pk_columns(m.key(), ts, tuuid, batch_no++, res);
set_operation(log_ck, ts, operation::partition_delete, res);
++batch_no;
} else if (!p.row_tombstones().empty()) {
// range deletion
touched_parts.set<stats::part_type::RANGE_TOMBSTONE>();
@@ -849,37 +863,30 @@ public:
}
};
{
auto log_ck = set_pk_columns(m.key(), ts, tuuid, batch_no, res);
auto log_ck = set_pk_columns(m.key(), ts, tuuid, batch_no++, res);
set_bound(log_ck, rt.start);
const auto start_operation = rt.start_kind == bound_kind::incl_start
? operation::range_delete_start_inclusive
: operation::range_delete_start_exclusive;
set_operation(log_ck, ts, start_operation, res);
++batch_no;
}
{
auto log_ck = set_pk_columns(m.key(), ts, tuuid, batch_no, res);
auto log_ck = set_pk_columns(m.key(), ts, tuuid, batch_no++, res);
set_bound(log_ck, rt.end);
const auto end_operation = rt.end_kind == bound_kind::incl_end
? operation::range_delete_end_inclusive
: operation::range_delete_end_exclusive;
set_operation(log_ck, ts, end_operation, res);
++batch_no;
}
}
} else {
// should be insert, update or deletion
auto process_cells = [&](const row& r, column_kind ckind, const clustering_key& log_ck, std::optional<clustering_key> pikey, const cql3::untyped_result_set_row* pirow, std::optional<clustering_key> poikey) -> std::optional<gc_clock::duration> {
if (postimage && !poikey) {
poikey = set_pk_columns(m.key(), ts, tuuid, ++batch_no, res);
set_operation(*poikey, ts, operation::post_image, res);
}
std::optional<gc_clock::duration> ttl;
std::unordered_set<column_id> columns_assigned;
r.for_each_cell([&](column_id id, const atomic_cell_or_collection& cell) {
auto& cdef = _schema->column_at(ckind, id);
auto* dst = _log_schema->get_column_definition(log_data_column_name_bytes(cdef.name()));
auto has_pirow = pirow && pirow->has(cdef.name_as_text());
bool is_column_delete = true;
bytes_opt value;
bytes_opt deleted_elements = std::nullopt;
@@ -1000,29 +1007,30 @@ public:
}
}
if (is_column_delete) {
res.set_cell(log_ck, log_data_column_deleted_name_bytes(cdef.name()), data_value(true), ts, _cdc_ttl_opt);
}
if (value) {
res.set_cell(log_ck, *dst, atomic_cell::make_live(*dst->type, ts, *value, _cdc_ttl_opt));
}
bytes_opt prev = get_preimage_col_value(cdef, pirow);
bytes_opt prev;
if (has_pirow) {
prev = get_preimage_col_value(cdef, pirow);
if (prev && pikey) {
assert(std::addressof(res.partition().clustered_row(*_log_schema, *pikey)) != std::addressof(res.partition().clustered_row(*_log_schema, log_ck)));
assert(pikey->explode() != log_ck.explode());
res.set_cell(*pikey, *dst, atomic_cell::make_live(*dst->type, ts, *prev, _cdc_ttl_opt));
}
if (postimage) {
if (is_column_delete) {
res.set_cell(log_ck, log_data_column_deleted_name_bytes(cdef.name()), data_value(true), ts, _cdc_ttl_opt);
if (!cdef.is_atomic()) {
_non_atomic_column_deletes.insert(&cdef);
}
// don't merge with pre-image iff column delete
prev = std::nullopt;
}
if (value) {
res.set_cell(log_ck, *dst, atomic_cell::make_live(*dst->type, ts, *value, _cdc_ttl_opt));
}
if (poikey) {
// keep track of actually assigning this already
columns_assigned.emplace(id);
// don't merge with pre-image iff column delete
if (is_column_delete) {
prev = std::nullopt;
}
if (cdef.is_atomic() && !is_column_delete && value) {
res.set_cell(*poikey, *dst, atomic_cell::make_live(*dst->type, ts, *value, _cdc_ttl_opt));
} else if (!cdef.is_atomic() && (value || (deleted_elements && prev))) {
@@ -1035,10 +1043,10 @@ public:
});
// fill in all columns not already processed. Note that column nulls are also marked.
if (postimage && pirow) {
if (poikey && pirow) {
for (auto& cdef : _schema->columns(ckind)) {
if (!columns_assigned.count(cdef.id)) {
auto v = pirow->get_view_opt(cdef.name_as_text());
auto v = get_preimage_col_value(cdef, pirow);
if (v) {
auto dst = _log_schema->get_column_definition(log_data_column_name_bytes(cdef.name()));
res.set_cell(*poikey, *dst, atomic_cell::make_live(*dst->type, ts, *v, _cdc_ttl_opt));
@@ -1057,16 +1065,18 @@ public:
if (rs && !rs->empty()) {
// For static rows, only one row from the result set is needed
pikey = set_pk_columns(m.key(), ts, tuuid, batch_no, res);
set_operation(*pikey, ts, operation::pre_image, res);
pirow = &rs->front();
++batch_no;
}
auto log_ck = set_pk_columns(m.key(), ts, tuuid, batch_no, res);
if (preimage && pirow) {
pikey = set_pk_columns(m.key(), ts, tuuid, batch_no++, res);
set_operation(*pikey, ts, operation::pre_image, res);
}
auto log_ck = set_pk_columns(m.key(), ts, tuuid, batch_no++, res);
if (postimage) {
poikey = set_pk_columns(m.key(), ts, tuuid, ++batch_no, res);
poikey = set_pk_columns(m.key(), ts, tuuid, batch_no++, res);
set_operation(*poikey, ts, operation::post_image, res);
}
@@ -1077,7 +1087,6 @@ public:
if (ttl) {
set_ttl(log_ck, ts, *ttl, res);
}
++batch_no;
} else {
touched_parts.set_if<stats::part_type::CLUSTERING_ROW>(!p.clustered_rows().empty());
for (const rows_entry& r : p.clustered_rows()) {
@@ -1098,19 +1107,21 @@ public:
}
}
if (match) {
pikey = set_pk_columns(m.key(), ts, tuuid, batch_no, res);
set_operation(*pikey, ts, operation::pre_image, res);
pirow = &utr;
++batch_no;
break;
}
}
}
auto log_ck = set_pk_columns(m.key(), ts, tuuid, batch_no, res);
if (preimage && pirow) {
pikey = set_pk_columns(m.key(), ts, tuuid, batch_no++, res);
set_operation(*pikey, ts, operation::pre_image, res);
}
auto log_ck = set_pk_columns(m.key(), ts, tuuid, batch_no++, res);
if (postimage) {
poikey = set_pk_columns(m.key(), ts, tuuid, ++batch_no, res);
poikey = set_pk_columns(m.key(), ts, tuuid, batch_no++, res);
set_operation(*poikey, ts, operation::post_image, res);
}
@@ -1120,7 +1131,7 @@ public:
auto cdef = _log_schema->get_column_definition(log_data_column_name_bytes(column.name()));
res.set_cell(log_ck, *cdef, atomic_cell::make_live(*column.type, ts, bytes_view(ck_value[pos]), _cdc_ttl_opt));
if (pirow) {
if (pikey) {
assert(pirow->has(column.name_as_text()));
res.set_cell(*pikey, *cdef, atomic_cell::make_live(*column.type, ts, bytes_view(ck_value[pos]), _cdc_ttl_opt));
}
@@ -1135,12 +1146,12 @@ public:
if (r.row().deleted_at()) {
touched_parts.set<stats::part_type::ROW_DELETE>();
cdc_op = operation::row_delete;
if (pirow) {
if (pirow && pikey) {
for (const column_definition& column: _schema->regular_columns()) {
assert(pirow->has(column.name_as_text()));
auto& cdef = *_log_schema->get_column_definition(log_data_column_name_bytes(column.name()));
auto value = get_preimage_col_value(column, pirow);
res.set_cell(*pikey, cdef, atomic_cell::make_live(*column.type, ts, bytes_view(value), _cdc_ttl_opt));
auto value = get_preimage_col_value(column, pirow);
res.set_cell(*pikey, cdef, atomic_cell::make_live(*column.type, ts, bytes_view(*value), _cdc_ttl_opt));
}
}
} else {
@@ -1157,7 +1168,6 @@ public:
}
}
set_operation(log_ck, ts, cdc_op, res);
++batch_no;
}
}
}
@@ -1165,7 +1175,13 @@ public:
return std::make_tuple(std::move(res), touched_parts);
}
static bytes get_preimage_col_value(const column_definition& cdef, const cql3::untyped_result_set_row *pirow) {
bytes_opt get_preimage_col_value(const column_definition& cdef, const cql3::untyped_result_set_row *pirow) {
/**
* #6070 - see comment for _non_atomic_column_deletes
*/
if (!pirow || !pirow->has(cdef.name_as_text()) || _non_atomic_column_deletes.count(&cdef)) {
return std::nullopt;
}
return cdef.is_atomic()
? pirow->get_blob(cdef.name_as_text())
: visit(*cdef.type, make_visitor(
@@ -1194,7 +1210,7 @@ public:
future<lw_shared_ptr<cql3::untyped_result_set>> pre_image_select(
service::client_state& client_state,
db::consistency_level cl,
db::consistency_level write_cl,
const mutation& m)
{
auto& p = m.partition();
@@ -1275,7 +1291,10 @@ public:
auto partition_slice = query::partition_slice(std::move(bounds), std::move(static_columns), std::move(regular_columns), std::move(opts));
auto command = ::make_lw_shared<query::read_command>(_schema->id(), _schema->version(), partition_slice, row_limit);
return _ctx._proxy.query(_schema, std::move(command), std::move(partition_ranges), cl, service::storage_proxy::coordinator_query_options(default_timeout(), empty_service_permit(), client_state)).then(
const auto select_cl = adjust_cl(write_cl);
try {
return _ctx._proxy.query(_schema, std::move(command), std::move(partition_ranges), select_cl, service::storage_proxy::coordinator_query_options(default_timeout(), empty_service_permit(), client_state)).then(
[s = _schema, partition_slice = std::move(partition_slice), selection = std::move(selection)] (service::storage_proxy::coordinator_query_result qr) -> lw_shared_ptr<cql3::untyped_result_set> {
cql3::selection::result_set_builder builder(*selection, gc_clock::now(), cql_serialization_format::latest());
query::result_view::consume(*qr.query_result, partition_slice, cql3::selection::result_set_builder::visitor(builder, *s, *selection));
@@ -1285,6 +1304,25 @@ public:
}
return make_lw_shared<cql3::untyped_result_set>(*result_set);
});
} catch (exceptions::unavailable_exception& e) {
// `query` can throw `unavailable_exception`, which is seen by clients as ~ "NoHostAvailable".
// So, we'll translate it to a `read_failure_exception` with custom message.
cdc_log.debug("Preimage: translating a (read) `unavailable_exception` to `request_execution_exception` - {}", e);
throw exceptions::read_failure_exception("CDC preimage query could not achieve the CL.",
e.consistency, e.alive, 0, e.required, false);
}
}
/**
 * Derives the consistency level of the preimage query from the CL of the base write.
 *
 * The write CL is used unchanged, except for:
 *   ANY          -> ONE
 *   ALL, SERIAL  -> QUORUM
 *   LOCAL_SERIAL -> LOCAL_QUORUM
 */
static db::consistency_level adjust_cl(db::consistency_level write_cl) {
    using cl = db::consistency_level;

    if (write_cl == cl::ANY) {
        return cl::ONE;
    }

    const bool maps_to_quorum = write_cl == cl::ALL || write_cl == cl::SERIAL;
    if (maps_to_quorum) {
        return cl::QUORUM;
    }

    // Everything else passes through untouched, apart from LOCAL_SERIAL.
    return write_cl == cl::LOCAL_SERIAL ? cl::LOCAL_QUORUM : write_cl;
}
};
@@ -1300,7 +1338,7 @@ transform_mutations(std::vector<mutation>& muts, decltype(muts.size()) batch_siz
} // namespace cdc
future<std::tuple<std::vector<mutation>, lw_shared_ptr<cdc::operation_result_tracker>>>
cdc::cdc_service::impl::augment_mutation_call(lowres_clock::time_point timeout, std::vector<mutation>&& mutations, tracing::trace_state_ptr tr_state) {
cdc::cdc_service::impl::augment_mutation_call(lowres_clock::time_point timeout, std::vector<mutation>&& mutations, tracing::trace_state_ptr tr_state, db::consistency_level write_cl) {
// we do all this because in the case of batches, we can have mixed schemas.
auto e = mutations.end();
auto i = std::find_if(mutations.begin(), e, [](const mutation& m) {
@@ -1315,8 +1353,8 @@ cdc::cdc_service::impl::augment_mutation_call(lowres_clock::time_point timeout,
mutations.reserve(2 * mutations.size());
return do_with(std::move(mutations), service::query_state(service::client_state::for_internal_calls(), empty_service_permit()), operation_details{},
[this, timeout, i, tr_state = std::move(tr_state)] (std::vector<mutation>& mutations, service::query_state& qs, operation_details& details) {
return transform_mutations(mutations, 1, [this, &mutations, timeout, &qs, tr_state = tr_state, &details] (int idx) mutable {
[this, timeout, i, tr_state = std::move(tr_state), write_cl] (std::vector<mutation>& mutations, service::query_state& qs, operation_details& details) {
return transform_mutations(mutations, 1, [this, &mutations, timeout, &qs, tr_state = tr_state, &details, write_cl] (int idx) mutable {
auto& m = mutations[idx];
auto s = m.schema();
@@ -1332,7 +1370,7 @@ cdc::cdc_service::impl::augment_mutation_call(lowres_clock::time_point timeout,
// iff a batch contains several modifications to the same table. Otoh, batch is rare(?)
// so this is premature.
tracing::trace(tr_state, "CDC: Selecting preimage for {}", m.decorated_key());
f = trans.pre_image_select(qs.get_client_state(), db::consistency_level::LOCAL_QUORUM, m).then_wrapped([this] (future<lw_shared_ptr<cql3::untyped_result_set>> f) {
f = trans.pre_image_select(qs.get_client_state(), write_cl, m).then_wrapped([this] (future<lw_shared_ptr<cql3::untyped_result_set>> f) {
auto& cdc_stats = _ctxt._proxy.get_cdc_stats();
cdc_stats.counters_total.preimage_selects++;
if (f.failed()) {
@@ -1344,7 +1382,7 @@ cdc::cdc_service::impl::augment_mutation_call(lowres_clock::time_point timeout,
tracing::trace(tr_state, "CDC: Preimage not enabled for the table, not querying current value of {}", m.decorated_key());
}
return f.then([trans = std::move(trans), &mutations, idx, tr_state = std::move(tr_state), &details] (lw_shared_ptr<cql3::untyped_result_set> rs) {
return f.then([trans = std::move(trans), &mutations, idx, tr_state = std::move(tr_state), &details] (lw_shared_ptr<cql3::untyped_result_set> rs) mutable {
auto& m = mutations[idx];
auto& s = m.schema();
details.had_preimage |= s->cdc_options().preimage();
@@ -1389,6 +1427,6 @@ bool cdc::cdc_service::needs_cdc_augmentation(const std::vector<mutation>& mutat
}
future<std::tuple<std::vector<mutation>, lw_shared_ptr<cdc::operation_result_tracker>>>
cdc::cdc_service::augment_mutation_call(lowres_clock::time_point timeout, std::vector<mutation>&& mutations, tracing::trace_state_ptr tr_state) {
return _impl->augment_mutation_call(timeout, std::move(mutations), std::move(tr_state));
cdc::cdc_service::augment_mutation_call(lowres_clock::time_point timeout, std::vector<mutation>&& mutations, tracing::trace_state_ptr tr_state, db::consistency_level write_cl) {
return _impl->augment_mutation_call(timeout, std::move(mutations), std::move(tr_state), write_cl);
}

View File

@@ -91,7 +91,8 @@ public:
future<std::tuple<std::vector<mutation>, lw_shared_ptr<operation_result_tracker>>> augment_mutation_call(
lowres_clock::time_point timeout,
std::vector<mutation>&& mutations,
tracing::trace_state_ptr tr_state
tracing::trace_state_ptr tr_state,
db::consistency_level write_cl
);
bool needs_cdc_augmentation(const std::vector<mutation>&) const;
};

View File

@@ -22,7 +22,7 @@
#pragma once
#include "seastar/core/file.hh"
#include "seastar/core/reactor.hh"
#include "seastar/core/seastar.hh"
#include "utils/disk-error-handler.hh"
#include "seastarx.hh"
@@ -147,7 +147,7 @@ inline open_checked_directory(const io_error_handler& error_handler,
sstring name)
{
return do_io_check(error_handler, [&] {
return engine().open_directory(name).then([&] (file f) {
return open_directory(name).then([&] (file f) {
return make_ready_future<file>(make_checked_file(error_handler, f));
});
});

View File

@@ -30,10 +30,12 @@ std::atomic<int64_t> clocks_offset;
std::ostream& operator<<(std::ostream& os, db_clock::time_point tp) {
auto t = db_clock::to_time_t(tp);
return os << std::put_time(std::gmtime(&t), "%Y/%m/%d %T");
::tm t_buf;
return os << std::put_time(::gmtime_r(&t, &t_buf), "%Y/%m/%d %T");
}
std::string format_timestamp(api::timestamp_type ts) {
auto t = std::time_t(std::chrono::duration_cast<std::chrono::seconds>(api::timestamp_clock::duration(ts)).count());
return format("{}", std::put_time(std::gmtime(&t), "%Y/%m/%d %T"));
::tm t_buf;
return format("{}", std::put_time(::gmtime_r(&t, &t_buf), "%Y/%m/%d %T"));
}

View File

@@ -140,6 +140,9 @@ public:
uint64_t adjust_partition_estimate(const mutation_source_metadata& ms_meta, uint64_t partition_estimate);
reader_consumer make_interposer_consumer(const mutation_source_metadata& ms_meta, reader_consumer end_consumer);
// Returns whether or not interposer consumer is used by a given strategy.
bool use_interposer_consumer() const;
};
// Creates a compaction_strategy object from one of the strategies available.

View File

@@ -27,6 +27,9 @@
#include "schema.hh"
#include "sstables/version.hh"
//FIXME: de-inline methods and define this as static in a .cc file.
extern logging::logger compound_logger;
//
// This header provides adaptors between the representation used by our compound_type<>
// and representation used by Origin.
@@ -337,8 +340,9 @@ public:
class iterator : public std::iterator<std::input_iterator_tag, const component_view> {
bytes_view _v;
component_view _current;
bool _strict_mode = true;
private:
void read_current() {
void do_read_current() {
size_type len;
{
if (_v.empty()) {
@@ -354,11 +358,23 @@ public:
_v.remove_prefix(len);
_current = component_view(std::move(value), to_eoc(read_simple<eoc_type>(_v)));
}
public:
// Reads the next component through do_read_current().
// A marshal_exception raised while doing so is handed to on_internal_error()
// when the iterator is in strict mode; otherwise it is re-thrown to the caller
// unchanged.
void read_current() {
    try {
        do_read_current();
    } catch (marshal_exception&) {
        if (!_strict_mode) {
            throw;
        }
        on_internal_error(compound_logger, std::current_exception());
    }
}
struct end_iterator_tag {};
iterator(const bytes_view& v, bool is_compound, bool is_static)
: _v(v) {
// In strict-mode de-serialization errors will invoke `on_internal_error()`.
iterator(const bytes_view& v, bool is_compound, bool is_static, bool strict_mode = true)
: _v(v), _strict_mode(strict_mode) {
if (is_static) {
_v.remove_prefix(2);
}
@@ -372,6 +388,7 @@ public:
iterator(end_iterator_tag) : _v(nullptr, 0) {}
public:
iterator& operator++() {
read_current();
return *this;
@@ -387,6 +404,9 @@ public:
const value_type* operator->() const { return &_current; }
bool operator!=(const iterator& i) const { return _v.begin() != i._v.begin(); }
bool operator==(const iterator& i) const { return _v.begin() == i._v.begin(); }
friend class composite;
friend class composite_view;
};
iterator begin() const {
@@ -555,6 +575,21 @@ public:
return composite::is_static(_bytes, _is_compound);
}
// Walks the serialized components with a non-strict iterator and reports
// whether the sizes of all components (value bytes plus the size and
// end-of-component fields) add up to exactly the size of the underlying bytes.
// A marshal_exception thrown while iterating means the data is not valid.
bool is_valid() const {
    try {
        const auto last = composite::iterator(composite::iterator::end_iterator_tag());
        size_t consumed = 0;
        // strict_mode = false: de-serialization errors surface as marshal_exception.
        for (auto cur = composite::iterator(_bytes, _is_compound, is_static(), false); cur != last; ++cur) {
            const auto& component = *cur;
            consumed += component.first.size() + sizeof(composite::size_type) + sizeof(composite::eoc_type);
        }
        return consumed == _bytes.size();
    } catch (marshal_exception&) {
        return false;
    }
}
explicit operator bytes_view() const {
return _bytes;
}

View File

@@ -253,11 +253,11 @@ modes = {
},
'release': {
'cxxflags': '',
'cxx_ld_flags': '-O3 -Wstack-usage=%s' % (1024*29),
'cxx_ld_flags': '-O3 -Wstack-usage=%s' % (1024*13),
},
'dev': {
'cxxflags': '-DSEASTAR_ENABLE_ALLOC_FAILURE_INJECTION -DSCYLLA_ENABLE_ERROR_INJECTION',
'cxx_ld_flags': '-O1 -Wstack-usage=%s' % (1024*29),
'cxx_ld_flags': '-O1 -Wstack-usage=%s' % (1024*21),
},
'sanitize': {
'cxxflags': '-DDEBUG -DDEBUG_LSA_SANITIZER -DSCYLLA_ENABLE_ERROR_INJECTION',
@@ -381,6 +381,7 @@ scylla_tests = set([
'test/boost/view_schema_ckey_test',
'test/boost/vint_serialization_test',
'test/boost/virtual_reader_test',
'test/boost/stall_free_test',
'test/manual/ec2_snitch_test',
'test/manual/gce_snitch_test',
'test/manual/gossip',
@@ -418,6 +419,7 @@ perf_tests = set([
apps = set([
'scylla',
'test/tools/cql_repl',
'tools/scylla_types',
])
tests = scylla_tests | perf_tests
@@ -438,6 +440,7 @@ arg_parser.add_argument('--so', dest='so', action='store_true',
help='Build shared object (SO) instead of executable')
arg_parser.add_argument('--mode', action='append', choices=list(modes.keys()), dest='selected_modes')
arg_parser.add_argument('--with', dest='artifacts', action='append', choices=all_artifacts, default=[])
arg_parser.add_argument('--with-seastar', action='store', dest='seastar_path', default='seastar', help='Path to Seastar sources')
arg_parser.add_argument('--cflags', action='store', dest='user_cflags', default='',
help='Extra flags for the C++ compiler')
arg_parser.add_argument('--ldflags', action='store', dest='user_ldflags', default='',
@@ -468,8 +471,6 @@ arg_parser.add_argument('--tests-debuginfo', action='store', dest='tests_debugin
help='Enable(1)/disable(0)compiler debug information generation for tests')
arg_parser.add_argument('--python', action='store', dest='python', default='python3',
help='Python3 path')
add_tristate(arg_parser, name='hwloc', dest='hwloc', help='hwloc support')
add_tristate(arg_parser, name='xen', dest='xen', help='Xen support')
arg_parser.add_argument('--split-dwarf', dest='split_dwarf', action='store_true', default=False,
help='use of split dwarf (https://gcc.gnu.org/wiki/DebugFission) to speed up linking')
arg_parser.add_argument('--enable-gcc6-concepts', dest='gcc6_concepts', action='store_true', default=False,
@@ -540,6 +541,7 @@ scylla_core = (['database.cc',
'sstables/compaction_strategy.cc',
'sstables/size_tiered_compaction_strategy.cc',
'sstables/leveled_compaction_strategy.cc',
'sstables/time_window_compaction_strategy.cc',
'sstables/compaction_manager.cc',
'sstables/integrity_checked_file_impl.cc',
'sstables/prepended_input_stream.cc',
@@ -548,6 +550,7 @@ scylla_core = (['database.cc',
'transport/event_notifier.cc',
'transport/server.cc',
'transport/messages/result_message.cc',
'cdc/cdc_partitioner.cc',
'cdc/log.cc',
'cdc/split.cc',
'cdc/generation.cc',
@@ -786,6 +789,7 @@ scylla_core = (['database.cc',
'utils/like_matcher.cc',
'utils/error_injection.cc',
'mutation_writer/timestamp_based_splitting_writer.cc',
'mutation_writer/shard_based_splitting_writer.cc',
'lua.cc',
] + [Antlr3Grammar('cql3/Cql.g')] + [Thrift('interface/cassandra.thrift', 'Cassandra')]
)
@@ -897,6 +901,7 @@ scylla_tests_dependencies = scylla_core + idls + scylla_tests_generic_dependenci
'test/lib/cql_assertions.cc',
'test/lib/result_set_assertions.cc',
'test/lib/mutation_source_test.cc',
'test/lib/sstable_utils.cc',
'test/lib/data_model.cc',
'test/lib/exception_utils.cc',
'test/lib/random_schema.cc',
@@ -905,6 +910,8 @@ scylla_tests_dependencies = scylla_core + idls + scylla_tests_generic_dependenci
deps = {
'scylla': idls + ['main.cc', 'release.cc', 'build_id.cc'] + scylla_core + api + alternator + redis,
'test/tools/cql_repl': idls + ['test/tools/cql_repl.cc'] + scylla_core + scylla_tests_generic_dependencies,
#FIXME: we don't need all of scylla_core here, only the types module, need to modularize scylla_core.
'tools/scylla_types': idls + ['tools/scylla_types.cc'] + scylla_core,
}
pure_boost_tests = set([
@@ -952,11 +959,9 @@ tests_not_using_seastar_test_framework = set([
'test/perf/perf_hash',
'test/perf/perf_mutation',
'test/perf/perf_row_cache_update',
'test/perf/perf_sstable',
'test/unit/lsa_async_eviction_test',
'test/unit/lsa_sync_eviction_test',
'test/unit/row_cache_alloc_stress_test',
'test/unit/row_cache_stress_test',
'test/manual/sstable_scan_footprint_test',
]) | pure_boost_tests
@@ -978,13 +983,10 @@ perf_tests_seastar_deps = [
for t in perf_tests:
deps[t] = [t + '.cc'] + scylla_tests_dependencies + perf_tests_seastar_deps
deps['test/boost/sstable_test'] += ['test/lib/sstable_utils.cc', 'test/lib/normalizing_reader.cc']
deps['test/boost/sstable_datafile_test'] += ['test/lib/sstable_utils.cc', 'test/lib/normalizing_reader.cc']
deps['test/boost/sstable_resharding_test'] += ['test/lib/sstable_utils.cc' ]
deps['test/boost/mutation_reader_test'] += ['test/lib/sstable_utils.cc', 'test/lib/dummy_partitioner.cc' ]
deps['test/boost/multishard_combining_reader_as_mutation_source_test'] += ['test/lib/sstable_utils.cc', 'test/lib/dummy_partitioner.cc' ]
deps['test/boost/sstable_mutation_test'] += ['test/lib/sstable_utils.cc']
deps['test/boost/sstable_conforms_to_mutation_source_test'] += ['test/lib/sstable_utils.cc']
deps['test/boost/sstable_test'] += ['test/lib/normalizing_reader.cc']
deps['test/boost/sstable_datafile_test'] += ['test/lib/normalizing_reader.cc']
deps['test/boost/mutation_reader_test'] += ['test/lib/dummy_sharder.cc' ]
deps['test/boost/multishard_combining_reader_as_mutation_source_test'] += ['test/lib/dummy_sharder.cc' ]
deps['test/boost/bytes_ostream_test'] = [
"test/boost/bytes_ostream_test.cc",
@@ -1234,11 +1236,11 @@ def configure_seastar(build_dir, mode):
if args.alloc_failure_injector:
seastar_cmake_args += ['-DSeastar_ALLOC_FAILURE_INJECTION=ON']
seastar_cmd = ['cmake', '-G', 'Ninja', os.path.relpath('seastar', seastar_build_dir)] + seastar_cmake_args
seastar_cmd = ['cmake', '-G', 'Ninja', os.path.relpath(args.seastar_path, seastar_build_dir)] + seastar_cmake_args
cmake_dir = seastar_build_dir
if args.dpdk:
# need to cook first
cmake_dir = 'seastar' # required by cooking.sh
cmake_dir = args.seastar_path # required by cooking.sh
relative_seastar_build_dir = os.path.join('..', seastar_build_dir) # relative to seastar/
seastar_cmd = ['./cooking.sh', '-i', 'dpdk', '-d', relative_seastar_build_dir, '--'] + seastar_cmd[4:]
@@ -1265,9 +1267,9 @@ def query_seastar_flags(pc_file, link_static_cxx=False):
return cflags, libs
for mode in build_modes:
seastar_cflags, seastar_libs = query_seastar_flags(pc[mode], link_static_cxx=args.staticcxx)
modes[mode]['seastar_cflags'] = seastar_cflags
modes[mode]['seastar_libs'] = seastar_libs
seastar_pc_cflags, seastar_pc_libs = query_seastar_flags(pc[mode], link_static_cxx=args.staticcxx)
modes[mode]['seastar_cflags'] = seastar_pc_cflags
modes[mode]['seastar_libs'] = seastar_pc_libs
# We need to use experimental features of the zstd library (to use our own allocators for the (de)compression context),
# which are available only when the library is linked statically.
@@ -1288,16 +1290,58 @@ def configure_zstd(build_dir, mode):
os.makedirs(zstd_build_dir, exist_ok=True)
subprocess.check_call(zstd_cmd, shell=False, cwd=zstd_build_dir)
def configure_abseil(build_dir, mode):
    """Run the CMake configure step for abseil for one build mode.

    A Ninja build tree is generated in ``<build_dir>/<mode>/abseil`` (created
    if missing). The compilers come from ``args.cc`` / ``args.cxx``; the
    per-configuration C++ flags are ``seastar_cflags`` followed by the mode's
    ``cxx_ld_flags``. Raises ``subprocess.CalledProcessError`` if cmake fails.
    """
    out_dir = os.path.join(build_dir, mode, 'abseil')
    cxx_flags = seastar_cflags + ' ' + modes[mode]['cxx_ld_flags']
    cmake_build_type = MODE_TO_CMAKE_BUILD_TYPE[mode]
    install_prefix = build_dir + '/inst'  # just to avoid a warning from absl

    cmake_defines = [
        f'-DCMAKE_BUILD_TYPE={cmake_build_type}',
        f'-DCMAKE_INSTALL_PREFIX={install_prefix}',
        f'-DCMAKE_C_COMPILER={args.cc}',
        f'-DCMAKE_CXX_COMPILER={args.cxx}',
        f'-DCMAKE_CXX_FLAGS_{cmake_build_type.upper()}={cxx_flags}',
    ]

    # The source directory is passed relative to the build directory because
    # cmake is run with the build directory as its working directory.
    source_dir = os.path.relpath('abseil', out_dir)
    command = ['cmake', '-G', 'Ninja', source_dir] + cmake_defines

    os.makedirs(out_dir, exist_ok=True)
    subprocess.check_call(command, shell=False, cwd=out_dir)
# Paths of the abseil static archives, each prefixed with 'absl/'.
# The build-file generation further down prepends the per-mode abseil build
# directory to every entry and emits a ninja target for each one.
abseil_libs = ['absl/' + lib for lib in [
'container/libabsl_hashtablez_sampler.a',
'container/libabsl_raw_hash_set.a',
'synchronization/libabsl_synchronization.a',
'synchronization/libabsl_graphcycles_internal.a',
'debugging/libabsl_stacktrace.a',
'debugging/libabsl_symbolize.a',
'debugging/libabsl_debugging_internal.a',
'debugging/libabsl_demangle_internal.a',
'time/libabsl_time.a',
'time/libabsl_time_zone.a',
'numeric/libabsl_int128.a',
'hash/libabsl_city.a',
'hash/libabsl_hash.a',
'base/libabsl_malloc_internal.a',
'base/libabsl_spinlock_wait.a',
'base/libabsl_base.a',
'base/libabsl_dynamic_annotations.a',
'base/libabsl_raw_logging_internal.a',
'base/libabsl_exponential_biased.a',
'base/libabsl_throw_delegate.a']]
args.user_cflags += " " + pkg_config('jsoncpp', '--cflags')
args.user_cflags += ' -march=' + args.target
libs = ' '.join([maybe_static(args.staticyamlcpp, '-lyaml-cpp'), '-latomic', '-llz4', '-lz', '-lsnappy', pkg_config('jsoncpp', '--libs'),
' -lstdc++fs', ' -lcrypt', ' -lcryptopp', ' -lpthread',
maybe_static(args.staticboost, '-lboost_date_time -lboost_regex -licuuc'), ])
xxhash_dir = 'xxHash'
pkgconfig_libs = [
'libxxhash',
]
if not os.path.exists(xxhash_dir) or not os.listdir(xxhash_dir):
raise Exception(xxhash_dir + ' is empty. Run "git submodule update --init".')
args.user_cflags += ' ' + ' '.join([pkg_config(lib, '--cflags') for lib in pkgconfig_libs])
libs += ' ' + ' '.join([pkg_config(lib, '--libs') for lib in pkgconfig_libs])
if not args.staticboost:
args.user_cflags += ' -DBOOST_TEST_DYN_LINK'
@@ -1316,10 +1360,11 @@ if any(filter(thrift_version.startswith, thrift_boost_versions)):
for pkg in pkgs:
args.user_cflags += ' ' + pkg_config(pkg, '--cflags')
libs += ' ' + pkg_config(pkg, '--libs')
# Add the abseil source tree to the include path. The leading space is required:
# without it the flag is glued onto whatever was appended to user_cflags last
# (e.g. a pkg-config --cflags result), corrupting both flags.
args.user_cflags += ' -I abseil'
user_cflags = args.user_cflags + ' -fvisibility=hidden'
user_ldflags = args.user_ldflags + ' -fvisibility=hidden'
if args.staticcxx:
user_ldflags += " -static-libgcc -static-libstdc++"
user_ldflags += " -static-libstdc++"
if args.staticthrift:
thrift_libs = "-Wl,-Bstatic -lthrift -Wl,-Bdynamic"
else:
@@ -1346,6 +1391,9 @@ else:
for mode in build_modes:
configure_zstd(outdir, mode)
for mode in build_modes:
configure_abseil(outdir, mode)
# configure.py may run automatically from an already-existing build.ninja.
# If the user interrupts configure.py in the middle, we need build.ninja
# to remain in a valid state. So we write our output to a temporary
@@ -1369,7 +1417,7 @@ with open(buildfile_tmp, 'w') as f:
command = echo -e $text > $out
description = GEN $out
rule swagger
command = seastar/scripts/seastar-json2code.py -f $in -o $out
command = {args.seastar_path}/scripts/seastar-json2code.py -f $in -o $out
description = SWAGGER $out
rule serializer
command = {python} ./idl-compiler.py --ns ser -f $in -o $out
@@ -1441,9 +1489,12 @@ with open(buildfile_tmp, 'w') as f:
build/{mode}/gen/${{stem}}Parser.cpp
description = ANTLR3 $in
rule checkhh.{mode}
command = $cxx -MD -MT $out -MF $out.d {seastar_cflags} $cxxflags $cxxflags_{mode} $obj_cxxflags -x c++ --include=$in -c -o $out /dev/null
command = $cxx -MD -MT $out -MF $out.d {seastar_cflags} $cxxflags $cxxflags_{mode} $obj_cxxflags --include $in -c -o $out build/{mode}/gen/empty.cc
description = CHECKHH $in
depfile = $out.d
rule test.{mode}
command = ./test.py --mode={mode}
description = TEST {mode}
''').format(mode=mode, antlr3_exec=antlr3_exec, fmt_lib=fmt_lib, **modeval))
f.write(
'build {mode}: phony {artifacts}\n'.format(
@@ -1480,6 +1531,8 @@ with open(buildfile_tmp, 'w') as f:
objs.extend(['$builddir/' + mode + '/' + artifact for artifact in [
'libdeflate/libdeflate.a',
'zstd/lib/libzstd.a',
] + [
'abseil/' + x for x in abseil_libs
]])
objs.append('$builddir/' + mode + '/gen/utils/gz/crc_combine_table.o')
if binary in tests:
@@ -1543,6 +1596,17 @@ with open(buildfile_tmp, 'w') as f:
)
)
f.write(
'build {mode}-test: test.{mode} {test_executables} $builddir/{mode}/test/tools/cql_repl\n'.format(
mode=mode,
test_executables=' '.join(['$builddir/{}/{}'.format(mode, binary) for binary in tests]),
)
)
f.write(
'build {mode}-check: phony {mode}-headers {mode}-test\n'.format(
mode=mode,
)
)
gen_headers = []
for th in thrifts:
@@ -1561,7 +1625,7 @@ with open(buildfile_tmp, 'w') as f:
f.write(' cxxflags = {seastar_cflags} $cxxflags $cxxflags_{mode} {extra_cxxflags}\n'.format(mode=mode, extra_cxxflags=extra_cxxflags[src], **modeval))
for hh in swaggers:
src = swaggers[hh]
f.write('build {}: swagger {} | seastar/scripts/seastar-json2code.py\n'.format(hh, src))
f.write('build {}: swagger {} | {}/scripts/seastar-json2code.py\n'.format(hh, src, args.seastar_path))
for hh in serializers:
src = serializers[hh]
f.write('build {}: serializer {} | idl-compiler.py\n'.format(hh, src))
@@ -1587,8 +1651,9 @@ with open(buildfile_tmp, 'w') as f:
if has_sanitize_address_use_after_scope:
flags += ' -fno-sanitize-address-use-after-scope'
f.write(' obj_cxxflags = %s\n' % flags)
f.write(f'build build/{mode}/gen/empty.cc: gen\n')
for hh in headers:
f.write('build $builddir/{mode}/{hh}.o: checkhh.{mode} {hh} || {gen_headers_dep}\n'.format(
f.write('build $builddir/{mode}/{hh}.o: checkhh.{mode} {hh} | build/{mode}/gen/empty.cc || {gen_headers_dep}\n'.format(
mode=mode, hh=hh, gen_headers_dep=gen_headers_dep))
f.write('build build/{mode}/seastar/libseastar.a: ninja | always\n'
@@ -1621,14 +1686,27 @@ with open(buildfile_tmp, 'w') as f:
f.write(' subdir = build/{mode}/zstd\n'.format(**locals()))
f.write(' target = libzstd.a\n'.format(**locals()))
for lib in abseil_libs:
f.write('build build/{mode}/abseil/{lib}: ninja\n'.format(**locals()))
f.write(' pool = submodule_pool\n')
f.write(' subdir = build/{mode}/abseil\n'.format(**locals()))
f.write(' target = {lib}\n'.format(**locals()))
mode = 'dev' if 'dev' in modes else modes[0]
f.write('build checkheaders: phony || {}\n'.format(' '.join(['$builddir/{}/{}.o'.format(mode, hh) for hh in headers])))
f.write(
'build test: phony {}\n'.format(' '.join(['{mode}-test'.format(mode=mode) for mode in modes]))
)
f.write(
'build check: phony {}\n'.format(' '.join(['{mode}-check'.format(mode=mode) for mode in modes]))
)
f.write(textwrap.dedent('''\
rule configure
command = {python} configure.py $configure_args
generator = 1
build build.ninja: configure | configure.py SCYLLA-VERSION-GEN seastar/CMakeLists.txt
build build.ninja: configure | configure.py SCYLLA-VERSION-GEN {args.seastar_path}/CMakeLists.txt
rule cscope
command = find -name '*.[chS]' -o -name "*.cc" -o -name "*.hh" | cscope -bq -i-
description = CSCOPE

View File

@@ -105,7 +105,7 @@ options {
using namespace cql3::statements;
using namespace cql3::selection;
using cql3::cql3_type;
using conditions_type = std::vector<std::pair<::shared_ptr<cql3::column_identifier::raw>,::shared_ptr<cql3::column_condition::raw>>>;
using conditions_type = std::vector<std::pair<::shared_ptr<cql3::column_identifier::raw>,lw_shared_ptr<cql3::column_condition::raw>>>;
using operations_type = std::vector<std::pair<::shared_ptr<cql3::column_identifier::raw>,::shared_ptr<cql3::operation::raw_update>>>;
// ANTLR forces us to define a default-initialized return value
@@ -319,63 +319,63 @@ struct uninitialized {
/** STATEMENTS **/
query returns [shared_ptr<raw::parsed_statement> stmnt]
: st=cqlStatement (';')* EOF { $stmnt = st; }
query returns [std::unique_ptr<raw::parsed_statement> stmnt]
: st=cqlStatement (';')* EOF { $stmnt = std::move(st); }
;
cqlStatement returns [shared_ptr<raw::parsed_statement> stmt]
cqlStatement returns [std::unique_ptr<raw::parsed_statement> stmt]
@after{ if (stmt) { stmt->set_bound_variables(_bind_variables); } }
: st1= selectStatement { $stmt = st1; }
| st2= insertStatement { $stmt = st2; }
| st3= updateStatement { $stmt = st3; }
| st4= batchStatement { $stmt = st4; }
| st5= deleteStatement { $stmt = st5; }
| st6= useStatement { $stmt = st6; }
| st7= truncateStatement { $stmt = st7; }
| st8= createKeyspaceStatement { $stmt = st8; }
| st9= createTableStatement { $stmt = st9; }
| st10=createIndexStatement { $stmt = st10; }
| st11=dropKeyspaceStatement { $stmt = st11; }
| st12=dropTableStatement { $stmt = st12; }
| st13=dropIndexStatement { $stmt = st13; }
| st14=alterTableStatement { $stmt = st14; }
| st15=alterKeyspaceStatement { $stmt = st15; }
| st16=grantStatement { $stmt = st16; }
| st17=revokeStatement { $stmt = st17; }
| st18=listPermissionsStatement { $stmt = st18; }
| st19=createUserStatement { $stmt = st19; }
| st20=alterUserStatement { $stmt = st20; }
| st21=dropUserStatement { $stmt = st21; }
| st22=listUsersStatement { $stmt = st22; }
: st1= selectStatement { $stmt = std::move(st1); }
| st2= insertStatement { $stmt = std::move(st2); }
| st3= updateStatement { $stmt = std::move(st3); }
| st4= batchStatement { $stmt = std::move(st4); }
| st5= deleteStatement { $stmt = std::move(st5); }
| st6= useStatement { $stmt = std::move(st6); }
| st7= truncateStatement { $stmt = std::move(st7); }
| st8= createKeyspaceStatement { $stmt = std::move(st8); }
| st9= createTableStatement { $stmt = std::move(st9); }
| st10=createIndexStatement { $stmt = std::move(st10); }
| st11=dropKeyspaceStatement { $stmt = std::move(st11); }
| st12=dropTableStatement { $stmt = std::move(st12); }
| st13=dropIndexStatement { $stmt = std::move(st13); }
| st14=alterTableStatement { $stmt = std::move(st14); }
| st15=alterKeyspaceStatement { $stmt = std::move(st15); }
| st16=grantStatement { $stmt = std::move(st16); }
| st17=revokeStatement { $stmt = std::move(st17); }
| st18=listPermissionsStatement { $stmt = std::move(st18); }
| st19=createUserStatement { $stmt = std::move(st19); }
| st20=alterUserStatement { $stmt = std::move(st20); }
| st21=dropUserStatement { $stmt = std::move(st21); }
| st22=listUsersStatement { $stmt = std::move(st22); }
#if 0
| st23=createTriggerStatement { $stmt = st23; }
| st24=dropTriggerStatement { $stmt = st24; }
#endif
| st25=createTypeStatement { $stmt = st25; }
| st26=alterTypeStatement { $stmt = st26; }
| st27=dropTypeStatement { $stmt = st27; }
| st28=createFunctionStatement { $stmt = st28; }
| st29=dropFunctionStatement { $stmt = st29; }
| st25=createTypeStatement { $stmt = std::move(st25); }
| st26=alterTypeStatement { $stmt = std::move(st26); }
| st27=dropTypeStatement { $stmt = std::move(st27); }
| st28=createFunctionStatement { $stmt = std::move(st28); }
| st29=dropFunctionStatement { $stmt = std::move(st29); }
#if 0
| st30=createAggregateStatement { $stmt = st30; }
| st31=dropAggregateStatement { $stmt = st31; }
#endif
| st32=createViewStatement { $stmt = st32; }
| st33=alterViewStatement { $stmt = st33; }
| st34=dropViewStatement { $stmt = st34; }
| st35=listRolesStatement { $stmt = st35; }
| st36=grantRoleStatement { $stmt = st36; }
| st37=revokeRoleStatement { $stmt = st37; }
| st38=dropRoleStatement { $stmt = st38; }
| st39=createRoleStatement { $stmt = st39; }
| st40=alterRoleStatement { $stmt = st40; }
| st32=createViewStatement { $stmt = std::move(st32); }
| st33=alterViewStatement { $stmt = std::move(st33); }
| st34=dropViewStatement { $stmt = std::move(st34); }
| st35=listRolesStatement { $stmt = std::move(st35); }
| st36=grantRoleStatement { $stmt = std::move(st36); }
| st37=revokeRoleStatement { $stmt = std::move(st37); }
| st38=dropRoleStatement { $stmt = std::move(st38); }
| st39=createRoleStatement { $stmt = std::move(st39); }
| st40=alterRoleStatement { $stmt = std::move(st40); }
;
/*
* USE <KEYSPACE>;
*/
useStatement returns [::shared_ptr<raw::use_statement> stmt]
: K_USE ks=keyspaceName { $stmt = ::make_shared<raw::use_statement>(ks); }
useStatement returns [std::unique_ptr<raw::use_statement> stmt]
: K_USE ks=keyspaceName { $stmt = std::make_unique<raw::use_statement>(ks); }
;
/**
@@ -384,7 +384,7 @@ useStatement returns [::shared_ptr<raw::use_statement> stmt]
* WHERE KEY = "key1" AND COL > 1 AND COL < 100
* LIMIT <NUMBER>;
*/
selectStatement returns [shared_ptr<raw::select_statement> expr]
selectStatement returns [std::unique_ptr<raw::select_statement> expr]
@init {
bool is_distinct = false;
::shared_ptr<cql3::term::raw> limit;
@@ -409,7 +409,7 @@ selectStatement returns [shared_ptr<raw::select_statement> expr]
( K_BYPASS K_CACHE { bypass_cache = true; })?
{
auto params = make_lw_shared<raw::select_statement::parameters>(std::move(orderings), is_distinct, allow_filtering, is_json, bypass_cache);
$expr = ::make_shared<raw::select_statement>(std::move(cf), std::move(params),
$expr = std::make_unique<raw::select_statement>(std::move(cf), std::move(params),
std::move(sclause), std::move(wclause), std::move(limit), std::move(per_partition_limit),
std::move(gbcolumns));
}
@@ -476,7 +476,7 @@ jsonValue returns [::shared_ptr<cql3::term::raw> value]
* USING TIMESTAMP <long>;
*
*/
insertStatement returns [::shared_ptr<raw::modification_statement> expr]
insertStatement returns [std::unique_ptr<raw::modification_statement> expr]
@init {
auto attrs = std::make_unique<cql3::attributes::raw>();
std::vector<::shared_ptr<cql3::column_identifier::raw>> column_names;
@@ -492,7 +492,7 @@ insertStatement returns [::shared_ptr<raw::modification_statement> expr]
( K_IF K_NOT K_EXISTS { if_not_exists = true; } )?
( usingClause[attrs] )?
{
$expr = ::make_shared<raw::insert_statement>(std::move(cf),
$expr = std::make_unique<raw::insert_statement>(std::move(cf),
std::move(attrs),
std::move(column_names),
std::move(values),
@@ -504,7 +504,7 @@ insertStatement returns [::shared_ptr<raw::modification_statement> expr]
( K_IF K_NOT K_EXISTS { if_not_exists = true; } )?
( usingClause[attrs] )?
{
$expr = ::make_shared<raw::insert_json_statement>(std::move(cf),
$expr = std::make_unique<raw::insert_json_statement>(std::move(cf),
std::move(attrs),
std::move(json_value),
if_not_exists,
@@ -528,7 +528,7 @@ usingClauseObjective[std::unique_ptr<cql3::attributes::raw>& attrs]
* SET name1 = value1, name2 = value2
* WHERE key = value;
*/
updateStatement returns [::shared_ptr<raw::update_statement> expr]
updateStatement returns [std::unique_ptr<raw::update_statement> expr]
@init {
bool if_exists = false;
auto attrs = std::make_unique<cql3::attributes::raw>();
@@ -540,7 +540,7 @@ updateStatement returns [::shared_ptr<raw::update_statement> expr]
K_WHERE wclause=whereClause
( K_IF (K_EXISTS{ if_exists = true; } | conditions=updateConditions) )?
{
return ::make_shared<raw::update_statement>(std::move(cf),
return std::make_unique<raw::update_statement>(std::move(cf),
std::move(attrs),
std::move(operations),
std::move(wclause),
@@ -560,7 +560,7 @@ updateConditions returns [conditions_type conditions]
* WHERE KEY = keyname
[IF (EXISTS | name = value, ...)];
*/
deleteStatement returns [::shared_ptr<raw::delete_statement> expr]
deleteStatement returns [std::unique_ptr<raw::delete_statement> expr]
@init {
auto attrs = std::make_unique<cql3::attributes::raw>();
std::vector<::shared_ptr<cql3::operation::raw_deletion>> column_deletions;
@@ -572,7 +572,7 @@ deleteStatement returns [::shared_ptr<raw::delete_statement> expr]
K_WHERE wclause=whereClause
( K_IF ( K_EXISTS { if_exists = true; } | conditions=updateConditions ))?
{
return ::make_shared<raw::delete_statement>(cf,
return std::make_unique<raw::delete_statement>(cf,
std::move(attrs),
std::move(column_deletions),
std::move(wclause),
@@ -620,11 +620,11 @@ usingClauseDelete[std::unique_ptr<cql3::attributes::raw>& attrs]
* ...
* APPLY BATCH
*/
batchStatement returns [shared_ptr<cql3::statements::raw::batch_statement> expr]
batchStatement returns [std::unique_ptr<cql3::statements::raw::batch_statement> expr]
@init {
using btype = cql3::statements::raw::batch_statement::type;
btype type = btype::LOGGED;
std::vector<shared_ptr<cql3::statements::raw::modification_statement>> statements;
std::vector<std::unique_ptr<cql3::statements::raw::modification_statement>> statements;
auto attrs = std::make_unique<cql3::attributes::raw>();
}
: K_BEGIN
@@ -633,14 +633,14 @@ batchStatement returns [shared_ptr<cql3::statements::raw::batch_statement> expr]
( s=batchStatementObjective ';'? { statements.push_back(std::move(s)); } )*
K_APPLY K_BATCH
{
$expr = ::make_shared<cql3::statements::raw::batch_statement>(type, std::move(attrs), std::move(statements));
$expr = std::make_unique<cql3::statements::raw::batch_statement>(type, std::move(attrs), std::move(statements));
}
;
batchStatementObjective returns [shared_ptr<cql3::statements::raw::modification_statement> statement]
: i=insertStatement { $statement = i; }
| u=updateStatement { $statement = u; }
| d=deleteStatement { $statement = d; }
batchStatementObjective returns [std::unique_ptr<cql3::statements::raw::modification_statement> statement]
: i=insertStatement { $statement = std::move(i); }
| u=updateStatement { $statement = std::move(u); }
| d=deleteStatement { $statement = std::move(d); }
;
#if 0
@@ -694,7 +694,7 @@ dropAggregateStatement returns [DropAggregateStatement expr]
;
#endif
createFunctionStatement returns [shared_ptr<cql3::statements::create_function_statement> expr]
createFunctionStatement returns [std::unique_ptr<cql3::statements::create_function_statement> expr]
@init {
bool or_replace = false;
bool if_not_exists = false;
@@ -719,10 +719,10 @@ createFunctionStatement returns [shared_ptr<cql3::statements::create_function_st
K_RETURNS rt = comparatorType
K_LANGUAGE language = IDENT
K_AS body = STRING_LITERAL
{ $expr = ::make_shared<cql3::statements::create_function_statement>(std::move(fn), to_lower($language.text), $body.text, std::move(arg_names), std::move(arg_types), std::move(rt), called_on_null_input, or_replace, if_not_exists); }
{ $expr = std::make_unique<cql3::statements::create_function_statement>(std::move(fn), to_lower($language.text), $body.text, std::move(arg_names), std::move(arg_types), std::move(rt), called_on_null_input, or_replace, if_not_exists); }
;
dropFunctionStatement returns [shared_ptr<cql3::statements::drop_function_statement> expr]
dropFunctionStatement returns [std::unique_ptr<cql3::statements::drop_function_statement> expr]
@init {
bool if_exists = false;
std::vector<shared_ptr<cql3_type::raw>> arg_types;
@@ -740,19 +740,19 @@ dropFunctionStatement returns [shared_ptr<cql3::statements::drop_function_statem
')'
{ args_present = true; }
)?
{ $expr = ::make_shared<cql3::statements::drop_function_statement>(std::move(fn), std::move(arg_types), args_present, if_exists); }
{ $expr = std::make_unique<cql3::statements::drop_function_statement>(std::move(fn), std::move(arg_types), args_present, if_exists); }
;
/**
* CREATE KEYSPACE [IF NOT EXISTS] <KEYSPACE> WITH attr1 = value1 AND attr2 = value2;
*/
createKeyspaceStatement returns [shared_ptr<cql3::statements::create_keyspace_statement> expr]
createKeyspaceStatement returns [std::unique_ptr<cql3::statements::create_keyspace_statement> expr]
@init {
auto attrs = make_shared<cql3::statements::ks_prop_defs>();
bool if_not_exists = false;
}
: K_CREATE K_KEYSPACE (K_IF K_NOT K_EXISTS { if_not_exists = true; } )? ks=keyspaceName
K_WITH properties[attrs] { $expr = ::make_shared<cql3::statements::create_keyspace_statement>(ks, attrs, if_not_exists); }
K_WITH properties[*attrs] { $expr = std::make_unique<cql3::statements::create_keyspace_statement>(ks, attrs, if_not_exists); }
;
/**
@@ -762,33 +762,33 @@ createKeyspaceStatement returns [shared_ptr<cql3::statements::create_keyspace_st
* <name3> <type>
* ) WITH <property> = <value> AND ...;
*/
createTableStatement returns [shared_ptr<cql3::statements::create_table_statement::raw_statement> expr]
createTableStatement returns [std::unique_ptr<cql3::statements::create_table_statement::raw_statement> expr]
@init { bool if_not_exists = false; }
: K_CREATE K_COLUMNFAMILY (K_IF K_NOT K_EXISTS { if_not_exists = true; } )?
cf=columnFamilyName { $expr = make_shared<cql3::statements::create_table_statement::raw_statement>(cf, if_not_exists); }
cfamDefinition[expr]
cf=columnFamilyName { $expr = std::make_unique<cql3::statements::create_table_statement::raw_statement>(cf, if_not_exists); }
cfamDefinition[*expr]
;
cfamDefinition[shared_ptr<cql3::statements::create_table_statement::raw_statement> expr]
cfamDefinition[cql3::statements::create_table_statement::raw_statement& expr]
: '(' cfamColumns[expr] ( ',' cfamColumns[expr]? )* ')'
( K_WITH cfamProperty[$expr->properties()] ( K_AND cfamProperty[$expr->properties()] )*)?
( K_WITH cfamProperty[$expr.properties()] ( K_AND cfamProperty[$expr.properties()] )*)?
;
cfamColumns[shared_ptr<cql3::statements::create_table_statement::raw_statement> expr]
cfamColumns[cql3::statements::create_table_statement::raw_statement& expr]
@init { bool is_static=false; }
: k=ident v=comparatorType (K_STATIC {is_static = true;})? { $expr->add_definition(k, v, is_static); }
(K_PRIMARY K_KEY { $expr->add_key_aliases(std::vector<shared_ptr<cql3::column_identifier>>{k}); })?
| K_PRIMARY K_KEY '(' pkDef[expr] (',' c=ident { $expr->add_column_alias(c); } )* ')'
: k=ident v=comparatorType (K_STATIC {is_static = true;})? { $expr.add_definition(k, v, is_static); }
(K_PRIMARY K_KEY { $expr.add_key_aliases(std::vector<shared_ptr<cql3::column_identifier>>{k}); })?
| K_PRIMARY K_KEY '(' pkDef[expr] (',' c=ident { $expr.add_column_alias(c); } )* ')'
;
pkDef[shared_ptr<cql3::statements::create_table_statement::raw_statement> expr]
pkDef[cql3::statements::create_table_statement::raw_statement& expr]
@init { std::vector<shared_ptr<cql3::column_identifier>> l; }
: k=ident { $expr->add_key_aliases(std::vector<shared_ptr<cql3::column_identifier>>{k}); }
| '(' k1=ident { l.push_back(k1); } ( ',' kn=ident { l.push_back(kn); } )* ')' { $expr->add_key_aliases(l); }
: k=ident { $expr.add_key_aliases(std::vector<shared_ptr<cql3::column_identifier>>{k}); }
| '(' k1=ident { l.push_back(k1); } ( ',' kn=ident { l.push_back(kn); } )* ')' { $expr.add_key_aliases(l); }
;
cfamProperty[cql3::statements::cf_properties& expr]
: property[$expr.properties()]
: property[*$expr.properties()]
| K_COMPACT K_STORAGE { $expr.set_compact_storage(); }
| K_CLUSTERING K_ORDER K_BY '(' cfamOrdering[expr] (',' cfamOrdering[expr])* ')'
;
@@ -806,15 +806,15 @@ cfamOrdering[cql3::statements::cf_properties& expr]
* ....
* )
*/
createTypeStatement returns [::shared_ptr<create_type_statement> expr]
createTypeStatement returns [std::unique_ptr<create_type_statement> expr]
@init { bool if_not_exists = false; }
: K_CREATE K_TYPE (K_IF K_NOT K_EXISTS { if_not_exists = true; } )?
tn=userTypeName { $expr = ::make_shared<create_type_statement>(tn, if_not_exists); }
'(' typeColumns[expr] ( ',' typeColumns[expr]? )* ')'
tn=userTypeName { $expr = std::make_unique<create_type_statement>(tn, if_not_exists); }
'(' typeColumns[*expr] ( ',' typeColumns[*expr]? )* ')'
;
typeColumns[::shared_ptr<create_type_statement> expr]
: k=ident v=comparatorType { $expr->add_definition(k, v); }
typeColumns[create_type_statement& expr]
: k=ident v=comparatorType { $expr.add_definition(k, v); }
;
@@ -822,7 +822,7 @@ typeColumns[::shared_ptr<create_type_statement> expr]
* CREATE INDEX [IF NOT EXISTS] [indexName] ON <columnFamily> (<columnName>);
* CREATE CUSTOM INDEX [IF NOT EXISTS] [indexName] ON <columnFamily> (<columnName>) USING <indexClass>;
*/
createIndexStatement returns [::shared_ptr<create_index_statement> expr]
createIndexStatement returns [std::unique_ptr<create_index_statement> expr]
@init {
auto props = make_shared<index_prop_defs>();
bool if_not_exists = false;
@@ -830,10 +830,10 @@ createIndexStatement returns [::shared_ptr<create_index_statement> expr]
std::vector<::shared_ptr<index_target::raw>> targets;
}
: K_CREATE (K_CUSTOM { props->is_custom = true; })? K_INDEX (K_IF K_NOT K_EXISTS { if_not_exists = true; } )?
(idxName[name])? K_ON cf=columnFamilyName '(' (target1=indexIdent { targets.emplace_back(target1); } (',' target2=indexIdent { targets.emplace_back(target2); } )*)? ')'
(idxName[*name])? K_ON cf=columnFamilyName '(' (target1=indexIdent { targets.emplace_back(target1); } (',' target2=indexIdent { targets.emplace_back(target2); } )*)? ')'
(K_USING cls=STRING_LITERAL { props->custom_class = sstring{$cls.text}; })?
(K_WITH properties[props])?
{ $expr = ::make_shared<create_index_statement>(cf, name, targets, props, if_not_exists); }
(K_WITH properties[*props])?
{ $expr = std::make_unique<create_index_statement>(cf, name, targets, props, if_not_exists); }
;
indexIdent returns [::shared_ptr<index_target::raw> id]
@@ -856,7 +856,7 @@ indexIdent returns [::shared_ptr<index_target::raw> id]
* PRIMARY KEY (<pkColumns>)
* WITH <property> = <value> AND ...;
*/
createViewStatement returns [::shared_ptr<create_view_statement> expr]
createViewStatement returns [std::unique_ptr<create_view_statement> expr]
@init {
bool if_not_exists = false;
std::vector<::shared_ptr<cql3::column_identifier::raw>> partition_keys;
@@ -870,7 +870,7 @@ createViewStatement returns [::shared_ptr<create_view_statement> expr]
| '(' k1=cident { partition_keys.push_back(k1); } ( ',' cn=cident { composite_keys.push_back(cn); } )* ')'
)
{
$expr = ::make_shared<create_view_statement>(
$expr = std::make_unique<create_view_statement>(
std::move(cf),
std::move(basecf),
std::move(sclause),
@@ -909,12 +909,12 @@ dropTriggerStatement returns [DropTriggerStatement expr]
/**
* ALTER KEYSPACE <KS> WITH <property> = <value>;
*/
alterKeyspaceStatement returns [shared_ptr<cql3::statements::alter_keyspace_statement> expr]
alterKeyspaceStatement returns [std::unique_ptr<cql3::statements::alter_keyspace_statement> expr]
@init {
auto attrs = make_shared<cql3::statements::ks_prop_defs>();
}
: K_ALTER K_KEYSPACE ks=keyspaceName
K_WITH properties[attrs] { $expr = ::make_shared<cql3::statements::alter_keyspace_statement>(ks, attrs); }
K_WITH properties[*attrs] { $expr = std::make_unique<cql3::statements::alter_keyspace_statement>(ks, attrs); }
;
/**
@@ -924,7 +924,7 @@ alterKeyspaceStatement returns [shared_ptr<cql3::statements::alter_keyspace_stat
* ALTER COLUMN FAMILY <CF> WITH <property> = <value>;
* ALTER COLUMN FAMILY <CF> RENAME <column> TO <column>;
*/
alterTableStatement returns [shared_ptr<alter_table_statement> expr]
alterTableStatement returns [std::unique_ptr<alter_table_statement> expr]
@init {
alter_table_statement::type type;
auto props = make_shared<cql3::statements::cf_prop_defs>();
@@ -943,13 +943,13 @@ alterTableStatement returns [shared_ptr<alter_table_statement> expr]
| '(' id1=cident { column_changes.emplace_back(alter_table_statement::column_change{id1}); }
(',' idn=cident { column_changes.emplace_back(alter_table_statement::column_change{idn}); } )* ')'
)
| K_WITH properties[props] { type = alter_table_statement::type::opts; }
| K_WITH properties[*props] { type = alter_table_statement::type::opts; }
| K_RENAME { type = alter_table_statement::type::rename; }
id1=cident K_TO toId1=cident { renames.emplace_back(id1, toId1); }
( K_AND idn=cident K_TO toIdn=cident { renames.emplace_back(idn, toIdn); } )*
)
{
$expr = ::make_shared<alter_table_statement>(std::move(cf), type, std::move(column_changes), std::move(props), std::move(renames));
$expr = std::make_unique<alter_table_statement>(std::move(cf), type, std::move(column_changes), std::move(props), std::move(renames));
}
;
@@ -968,126 +968,126 @@ cfisStatic returns [bool isStaticColumn]
* ALTER TYPE <name> ADD <field> <newtype>;
* ALTER TYPE <name> RENAME <field> TO <newtype> AND ...;
*/
alterTypeStatement returns [::shared_ptr<alter_type_statement> expr]
alterTypeStatement returns [std::unique_ptr<alter_type_statement> expr]
: K_ALTER K_TYPE name=userTypeName
( K_ALTER f=ident K_TYPE v=comparatorType { $expr = ::make_shared<alter_type_statement::add_or_alter>(name, false, f, v); }
| K_ADD f=ident v=comparatorType { $expr = ::make_shared<alter_type_statement::add_or_alter>(name, true, f, v); }
( K_ALTER f=ident K_TYPE v=comparatorType { $expr = std::make_unique<alter_type_statement::add_or_alter>(name, false, f, v); }
| K_ADD f=ident v=comparatorType { $expr = std::make_unique<alter_type_statement::add_or_alter>(name, true, f, v); }
| K_RENAME
{ $expr = ::make_shared<alter_type_statement::renames>(name); }
renames[{ static_pointer_cast<alter_type_statement::renames>($expr) }]
{ $expr = std::make_unique<alter_type_statement::renames>(name); }
renames[{ static_cast<alter_type_statement::renames&>(*$expr) }]
)
;
/**
* ALTER MATERIALIZED VIEW <CF> WITH <property> = <value>;
*/
alterViewStatement returns [::shared_ptr<alter_view_statement> expr]
alterViewStatement returns [std::unique_ptr<alter_view_statement> expr]
@init {
auto props = make_shared<cql3::statements::cf_prop_defs>();
}
: K_ALTER K_MATERIALIZED K_VIEW cf=columnFamilyName K_WITH properties[props]
: K_ALTER K_MATERIALIZED K_VIEW cf=columnFamilyName K_WITH properties[*props]
{
$expr = ::make_shared<alter_view_statement>(std::move(cf), std::move(props));
$expr = std::make_unique<alter_view_statement>(std::move(cf), std::move(props));
}
;
renames[::shared_ptr<alter_type_statement::renames> expr]
: fromId=ident K_TO toId=ident { $expr->add_rename(fromId, toId); }
renames[alter_type_statement::renames& expr]
: fromId=ident K_TO toId=ident { $expr.add_rename(fromId, toId); }
( K_AND renames[$expr] )?
;
/**
* DROP KEYSPACE [IF EXISTS] <KSP>;
*/
dropKeyspaceStatement returns [::shared_ptr<drop_keyspace_statement> ksp]
dropKeyspaceStatement returns [std::unique_ptr<drop_keyspace_statement> ksp]
@init { bool if_exists = false; }
: K_DROP K_KEYSPACE (K_IF K_EXISTS { if_exists = true; } )? ks=keyspaceName { $ksp = ::make_shared<drop_keyspace_statement>(ks, if_exists); }
: K_DROP K_KEYSPACE (K_IF K_EXISTS { if_exists = true; } )? ks=keyspaceName { $ksp = std::make_unique<drop_keyspace_statement>(ks, if_exists); }
;
/**
* DROP COLUMNFAMILY [IF EXISTS] <CF>;
*/
dropTableStatement returns [::shared_ptr<drop_table_statement> stmt]
dropTableStatement returns [std::unique_ptr<drop_table_statement> stmt]
@init { bool if_exists = false; }
: K_DROP K_COLUMNFAMILY (K_IF K_EXISTS { if_exists = true; } )? cf=columnFamilyName { $stmt = ::make_shared<drop_table_statement>(cf, if_exists); }
: K_DROP K_COLUMNFAMILY (K_IF K_EXISTS { if_exists = true; } )? cf=columnFamilyName { $stmt = std::make_unique<drop_table_statement>(cf, if_exists); }
;
/**
* DROP TYPE <name>;
*/
dropTypeStatement returns [::shared_ptr<drop_type_statement> stmt]
dropTypeStatement returns [std::unique_ptr<drop_type_statement> stmt]
@init { bool if_exists = false; }
: K_DROP K_TYPE (K_IF K_EXISTS { if_exists = true; } )? name=userTypeName { $stmt = ::make_shared<drop_type_statement>(name, if_exists); }
: K_DROP K_TYPE (K_IF K_EXISTS { if_exists = true; } )? name=userTypeName { $stmt = std::make_unique<drop_type_statement>(name, if_exists); }
;
/**
* DROP MATERIALIZED VIEW [IF EXISTS] <view_name>
*/
dropViewStatement returns [::shared_ptr<drop_view_statement> stmt]
dropViewStatement returns [std::unique_ptr<drop_view_statement> stmt]
@init { bool if_exists = false; }
: K_DROP K_MATERIALIZED K_VIEW (K_IF K_EXISTS { if_exists = true; } )? cf=columnFamilyName
{ $stmt = ::make_shared<drop_view_statement>(cf, if_exists); }
{ $stmt = std::make_unique<drop_view_statement>(cf, if_exists); }
;
/**
* DROP INDEX [IF EXISTS] <INDEX_NAME>
*/
dropIndexStatement returns [::shared_ptr<drop_index_statement> expr]
dropIndexStatement returns [std::unique_ptr<drop_index_statement> expr]
@init { bool if_exists = false; }
: K_DROP K_INDEX (K_IF K_EXISTS { if_exists = true; } )? index=indexName
{ $expr = ::make_shared<drop_index_statement>(index, if_exists); }
{ $expr = std::make_unique<drop_index_statement>(index, if_exists); }
;
/**
* TRUNCATE <CF>;
*/
truncateStatement returns [::shared_ptr<truncate_statement> stmt]
: K_TRUNCATE (K_COLUMNFAMILY)? cf=columnFamilyName { $stmt = ::make_shared<truncate_statement>(cf); }
truncateStatement returns [std::unique_ptr<truncate_statement> stmt]
: K_TRUNCATE (K_COLUMNFAMILY)? cf=columnFamilyName { $stmt = std::make_unique<truncate_statement>(cf); }
;
/**
* GRANT <permission> ON <resource> TO <grantee>
*/
grantStatement returns [::shared_ptr<grant_statement> stmt]
grantStatement returns [std::unique_ptr<grant_statement> stmt]
: K_GRANT
permissionOrAll
K_ON
resource
K_TO
grantee=userOrRoleName
{ $stmt = ::make_shared<grant_statement>($permissionOrAll.perms, $resource.res, std::move(grantee)); }
{ $stmt = std::make_unique<grant_statement>($permissionOrAll.perms, $resource.res, std::move(grantee)); }
;
/**
* REVOKE <permission> ON <resource> FROM <revokee>
*/
revokeStatement returns [::shared_ptr<revoke_statement> stmt]
revokeStatement returns [std::unique_ptr<revoke_statement> stmt]
: K_REVOKE
permissionOrAll
K_ON
resource
K_FROM
revokee=userOrRoleName
{ $stmt = ::make_shared<revoke_statement>($permissionOrAll.perms, $resource.res, std::move(revokee)); }
{ $stmt = std::make_unique<revoke_statement>($permissionOrAll.perms, $resource.res, std::move(revokee)); }
;
/**
* GRANT <rolename> to <grantee>
*/
grantRoleStatement returns [::shared_ptr<grant_role_statement> stmt]
grantRoleStatement returns [std::unique_ptr<grant_role_statement> stmt]
: K_GRANT role=userOrRoleName K_TO grantee=userOrRoleName
{ $stmt = ::make_shared<grant_role_statement>(std::move(role), std::move(grantee)); }
{ $stmt = std::make_unique<grant_role_statement>(std::move(role), std::move(grantee)); }
;
/**
* REVOKE <rolename> FROM <revokee>
*/
revokeRoleStatement returns [::shared_ptr<revoke_role_statement> stmt]
revokeRoleStatement returns [std::unique_ptr<revoke_role_statement> stmt]
: K_REVOKE role=userOrRoleName K_FROM revokee=userOrRoleName
{ $stmt = ::make_shared<revoke_role_statement>(std::move(role), std::move(revokee)); }
{ $stmt = std::make_unique<revoke_role_statement>(std::move(role), std::move(revokee)); }
;
listPermissionsStatement returns [::shared_ptr<list_permissions_statement> stmt]
listPermissionsStatement returns [std::unique_ptr<list_permissions_statement> stmt]
@init {
std::optional<auth::resource> r;
std::optional<sstring> role;
@@ -1098,7 +1098,7 @@ listPermissionsStatement returns [::shared_ptr<list_permissions_statement> stmt]
( K_ON resource { r = $resource.res; } )?
( K_OF rn=userOrRoleName { role = sstring(static_cast<cql3::role_name>(rn).to_string()); } )?
( K_NORECURSIVE { recursive = false; } )?
{ $stmt = ::make_shared<list_permissions_statement>($permissionOrAll.perms, std::move(r), std::move(role), recursive); }
{ $stmt = std::make_unique<list_permissions_statement>($permissionOrAll.perms, std::move(r), std::move(role), recursive); }
;
permission returns [auth::permission perm]
@@ -1131,7 +1131,7 @@ roleResource returns [uninitialized<auth::resource> res]
/**
* CREATE USER [IF NOT EXISTS] <username> [WITH PASSWORD <password>] [SUPERUSER|NOSUPERUSER]
*/
createUserStatement returns [::shared_ptr<create_role_statement> stmt]
createUserStatement returns [std::unique_ptr<create_role_statement> stmt]
@init {
cql3::role_options opts;
opts.is_superuser = false;
@@ -1142,42 +1142,42 @@ createUserStatement returns [::shared_ptr<create_role_statement> stmt]
: K_CREATE K_USER (K_IF K_NOT K_EXISTS { ifNotExists = true; })? u=username
( K_WITH K_PASSWORD v=STRING_LITERAL { opts.password = $v.text; })?
( K_SUPERUSER { opts.is_superuser = true; } | K_NOSUPERUSER { opts.is_superuser = false; } )?
{ $stmt = ::make_shared<create_role_statement>(cql3::role_name(u, cql3::preserve_role_case::yes), std::move(opts), ifNotExists); }
{ $stmt = std::make_unique<create_role_statement>(cql3::role_name(u, cql3::preserve_role_case::yes), std::move(opts), ifNotExists); }
;
/**
* ALTER USER <username> [WITH PASSWORD <password>] [SUPERUSER|NOSUPERUSER]
*/
alterUserStatement returns [::shared_ptr<alter_role_statement> stmt]
alterUserStatement returns [std::unique_ptr<alter_role_statement> stmt]
@init {
cql3::role_options opts;
}
: K_ALTER K_USER u=username
( K_WITH K_PASSWORD v=STRING_LITERAL { opts.password = $v.text; })?
( K_SUPERUSER { opts.is_superuser = true; } | K_NOSUPERUSER { opts.is_superuser = false; } )?
{ $stmt = ::make_shared<alter_role_statement>(cql3::role_name(u, cql3::preserve_role_case::yes), std::move(opts)); }
{ $stmt = std::make_unique<alter_role_statement>(cql3::role_name(u, cql3::preserve_role_case::yes), std::move(opts)); }
;
/**
* DROP USER [IF EXISTS] <username>
*/
dropUserStatement returns [::shared_ptr<drop_role_statement> stmt]
dropUserStatement returns [std::unique_ptr<drop_role_statement> stmt]
@init { bool ifExists = false; }
: K_DROP K_USER (K_IF K_EXISTS { ifExists = true; })? u=username
{ $stmt = ::make_shared<drop_role_statement>(cql3::role_name(u, cql3::preserve_role_case::yes), ifExists); }
{ $stmt = std::make_unique<drop_role_statement>(cql3::role_name(u, cql3::preserve_role_case::yes), ifExists); }
;
/**
* LIST USERS
*/
listUsersStatement returns [::shared_ptr<list_users_statement> stmt]
: K_LIST K_USERS { $stmt = ::make_shared<list_users_statement>(); }
listUsersStatement returns [std::unique_ptr<list_users_statement> stmt]
: K_LIST K_USERS { $stmt = std::make_unique<list_users_statement>(); }
;
/**
* CREATE ROLE [IF NOT EXISTS] <role_name> [WITH <roleOption> [AND <roleOption>]*]
*/
createRoleStatement returns [::shared_ptr<create_role_statement> stmt]
createRoleStatement returns [std::unique_ptr<create_role_statement> stmt]
@init {
cql3::role_options opts;
opts.is_superuser = false;
@@ -1186,36 +1186,36 @@ createRoleStatement returns [::shared_ptr<create_role_statement> stmt]
}
: K_CREATE K_ROLE (K_IF K_NOT K_EXISTS { if_not_exists = true; })? name=userOrRoleName
(K_WITH roleOptions[opts])?
{ $stmt = ::make_shared<create_role_statement>(name, std::move(opts), if_not_exists); }
{ $stmt = std::make_unique<create_role_statement>(name, std::move(opts), if_not_exists); }
;
/**
* ALTER ROLE <rolename> [WITH <roleOption> [AND <roleOption>]*]
*/
alterRoleStatement returns [::shared_ptr<alter_role_statement> stmt]
alterRoleStatement returns [std::unique_ptr<alter_role_statement> stmt]
@init {
cql3::role_options opts;
}
: K_ALTER K_ROLE name=userOrRoleName
(K_WITH roleOptions[opts])?
{ $stmt = ::make_shared<alter_role_statement>(name, std::move(opts)); }
{ $stmt = std::make_unique<alter_role_statement>(name, std::move(opts)); }
;
/**
* DROP ROLE [IF EXISTS] <rolename>
*/
dropRoleStatement returns [::shared_ptr<drop_role_statement> stmt]
dropRoleStatement returns [std::unique_ptr<drop_role_statement> stmt]
@init {
bool if_exists = false;
}
: K_DROP K_ROLE (K_IF K_EXISTS { if_exists = true; })? name=userOrRoleName
{ $stmt = ::make_shared<drop_role_statement>(name, if_exists); }
{ $stmt = std::make_unique<drop_role_statement>(name, if_exists); }
;
/**
* LIST ROLES [OF <rolename>] [NORECURSIVE]
*/
listRolesStatement returns [::shared_ptr<list_roles_statement> stmt]
listRolesStatement returns [std::unique_ptr<list_roles_statement> stmt]
@init {
bool recursive = true;
std::optional<cql3::role_name> grantee;
@@ -1223,7 +1223,7 @@ listRolesStatement returns [::shared_ptr<list_roles_statement> stmt]
: K_LIST K_ROLES
(K_OF g=userOrRoleName { grantee = std::move(g); })?
(K_NORECURSIVE { recursive = false; })?
{ $stmt = ::make_shared<list_roles_statement>(grantee, recursive); }
{ $stmt = std::make_unique<list_roles_statement>(grantee, recursive); }
;
roleOptions[cql3::role_options& opts]
@@ -1258,17 +1258,17 @@ ident returns [shared_ptr<cql3::column_identifier> id]
// Keyspace & Column family names
keyspaceName returns [sstring id]
@init { auto name = make_shared<cql3::cf_name>(); }
: ksName[name] { $id = name->get_keyspace(); }
: ksName[*name] { $id = name->get_keyspace(); }
;
indexName returns [::shared_ptr<cql3::index_name> name]
@init { $name = ::make_shared<cql3::index_name>(); }
: (ksName[name] '.')? idxName[name]
: (ksName[*name] '.')? idxName[*name]
;
columnFamilyName returns [::shared_ptr<cql3::cf_name> name]
@init { $name = ::make_shared<cql3::cf_name>(); }
: (ksName[name] '.')? cfName[name]
: (ksName[*name] '.')? cfName[*name]
;
userTypeName returns [uninitialized<cql3::ut_name> name]
@@ -1283,24 +1283,24 @@ userOrRoleName returns [uninitialized<cql3::role_name> name]
| QMARK {add_recognition_error("Bind variables cannot be used for role names");}
;
ksName[::shared_ptr<cql3::keyspace_element_name> name]
: t=IDENT { $name->set_keyspace($t.text, false);}
| t=QUOTED_NAME { $name->set_keyspace($t.text, true);}
| k=unreserved_keyword { $name->set_keyspace(k, false);}
ksName[cql3::keyspace_element_name& name]
: t=IDENT { $name.set_keyspace($t.text, false);}
| t=QUOTED_NAME { $name.set_keyspace($t.text, true);}
| k=unreserved_keyword { $name.set_keyspace(k, false);}
| QMARK {add_recognition_error("Bind variables cannot be used for keyspace names");}
;
cfName[::shared_ptr<cql3::cf_name> name]
: t=IDENT { $name->set_column_family($t.text, false); }
| t=QUOTED_NAME { $name->set_column_family($t.text, true); }
| k=unreserved_keyword { $name->set_column_family(k, false); }
cfName[cql3::cf_name& name]
: t=IDENT { $name.set_column_family($t.text, false); }
| t=QUOTED_NAME { $name.set_column_family($t.text, true); }
| k=unreserved_keyword { $name.set_column_family(k, false); }
| QMARK {add_recognition_error("Bind variables cannot be used for table names");}
;
idxName[::shared_ptr<cql3::index_name> name]
: t=IDENT { $name->set_index($t.text, false); }
| t=QUOTED_NAME { $name->set_index($t.text, true);}
| k=unreserved_keyword { $name->set_index(k, false); }
idxName[cql3::index_name& name]
: t=IDENT { $name.set_index($t.text, false); }
| t=QUOTED_NAME { $name.set_index($t.text, true);}
| k=unreserved_keyword { $name.set_index(k, false); }
| QMARK {add_recognition_error("Bind variables cannot be used for index names");}
;
@@ -1489,13 +1489,13 @@ columnCondition[conditions_type& conditions]
)
;
properties[::shared_ptr<cql3::statements::property_definitions> props]
properties[cql3::statements::property_definitions& props]
: property[props] (K_AND property[props])*
;
property[::shared_ptr<cql3::statements::property_definitions> props]
: k=ident '=' simple=propertyValue { try { $props->add_property(k->to_string(), simple); } catch (exceptions::syntax_exception e) { add_recognition_error(e.what()); } }
| k=ident '=' map=mapLiteral { try { $props->add_property(k->to_string(), convert_property_map(map)); } catch (exceptions::syntax_exception e) { add_recognition_error(e.what()); } }
property[cql3::statements::property_definitions& props]
: k=ident '=' simple=propertyValue { try { $props.add_property(k->to_string(), simple); } catch (exceptions::syntax_exception e) { add_recognition_error(e.what()); } }
| k=ident '=' map=mapLiteral { try { $props.add_property(k->to_string(), convert_property_map(map)); } catch (exceptions::syntax_exception e) { add_recognition_error(e.what()); } }
;
propertyValue returns [sstring str]

View File

@@ -50,7 +50,7 @@
namespace cql3 {
abstract_marker::abstract_marker(int32_t bind_index, ::shared_ptr<column_specification>&& receiver)
abstract_marker::abstract_marker(int32_t bind_index, lw_shared_ptr<column_specification>&& receiver)
: _bind_index{bind_index}
, _receiver{std::move(receiver)}
{ }
@@ -67,7 +67,7 @@ abstract_marker::raw::raw(int32_t bind_index)
: _bind_index{bind_index}
{ }
::shared_ptr<term> abstract_marker::raw::prepare(database& db, const sstring& keyspace, ::shared_ptr<column_specification> receiver) const
::shared_ptr<term> abstract_marker::raw::prepare(database& db, const sstring& keyspace, lw_shared_ptr<column_specification> receiver) const
{
if (receiver->type->is_collection()) {
if (receiver->type->get_kind() == abstract_type::kind::list) {
@@ -87,7 +87,7 @@ abstract_marker::raw::raw(int32_t bind_index)
return ::make_shared<constants::marker>(_bind_index, receiver);
}
assignment_testable::test_result abstract_marker::raw::test_assignment(database& db, const sstring& keyspace, ::shared_ptr<column_specification> receiver) const {
assignment_testable::test_result abstract_marker::raw::test_assignment(database& db, const sstring& keyspace, lw_shared_ptr<column_specification> receiver) const {
return assignment_testable::test_result::WEAKLY_ASSIGNABLE;
}
@@ -99,13 +99,13 @@ abstract_marker::in_raw::in_raw(int32_t bind_index)
: raw{bind_index}
{ }
::shared_ptr<column_specification> abstract_marker::in_raw::make_in_receiver(::shared_ptr<column_specification> receiver) {
auto in_name = ::make_shared<column_identifier>(sstring("in(") + receiver->name->to_string() + sstring(")"), true);
return ::make_shared<column_specification>(receiver->ks_name, receiver->cf_name, in_name, list_type_impl::get_instance(receiver->type, false));
lw_shared_ptr<column_specification> abstract_marker::in_raw::make_in_receiver(const column_specification& receiver) {
auto in_name = ::make_shared<column_identifier>(sstring("in(") + receiver.name->to_string() + sstring(")"), true);
return make_lw_shared<column_specification>(receiver.ks_name, receiver.cf_name, in_name, list_type_impl::get_instance(receiver.type, false));
}
::shared_ptr<term> abstract_marker::in_raw::prepare(database& db, const sstring& keyspace, ::shared_ptr<column_specification> receiver) const {
return ::make_shared<lists::marker>(_bind_index, make_in_receiver(receiver));
::shared_ptr<term> abstract_marker::in_raw::prepare(database& db, const sstring& keyspace, lw_shared_ptr<column_specification> receiver) const {
return ::make_shared<lists::marker>(_bind_index, make_in_receiver(*receiver));
}
}

View File

@@ -53,9 +53,9 @@ namespace cql3 {
class abstract_marker : public non_terminal {
protected:
const int32_t _bind_index;
const ::shared_ptr<column_specification> _receiver;
const lw_shared_ptr<column_specification> _receiver;
public:
abstract_marker(int32_t bind_index, ::shared_ptr<column_specification>&& receiver);
abstract_marker(int32_t bind_index, lw_shared_ptr<column_specification>&& receiver);
virtual void collect_marker_specification(variable_specifications& bound_names) const override;
@@ -70,9 +70,9 @@ public:
public:
raw(int32_t bind_index);
virtual ::shared_ptr<term> prepare(database& db, const sstring& keyspace, ::shared_ptr<column_specification> receiver) const override;
virtual ::shared_ptr<term> prepare(database& db, const sstring& keyspace, lw_shared_ptr<column_specification> receiver) const override;
virtual assignment_testable::test_result test_assignment(database& db, const sstring& keyspace, ::shared_ptr<column_specification> receiver) const override;
virtual assignment_testable::test_result test_assignment(database& db, const sstring& keyspace, lw_shared_ptr<column_specification> receiver) const override;
virtual sstring to_string() const override;
};
@@ -87,9 +87,9 @@ public:
public:
in_raw(int32_t bind_index);
private:
static ::shared_ptr<column_specification> make_in_receiver(::shared_ptr<column_specification> receiver);
static lw_shared_ptr<column_specification> make_in_receiver(const column_specification& receiver);
public:
virtual ::shared_ptr<term> prepare(database& db, const sstring& keyspace, ::shared_ptr<column_specification> receiver) const override;
virtual ::shared_ptr<term> prepare(database& db, const sstring& keyspace, lw_shared_ptr<column_specification> receiver) const override;
};
};

View File

@@ -70,7 +70,7 @@ public:
// Test all elements of to_test for assignment. If all are an exact match, return exact match. If any is not assignable,
// return not assignable. Otherwise, return weakly assignable.
template <typename AssignmentTestablePtrRange>
static test_result test_all(database& db, const sstring& keyspace, ::shared_ptr<column_specification> receiver,
static test_result test_all(database& db, const sstring& keyspace, lw_shared_ptr<column_specification> receiver,
AssignmentTestablePtrRange&& to_test) {
test_result res = test_result::EXACT_MATCH;
for (auto&& rt : to_test) {
@@ -99,7 +99,7 @@ public:
* Most callers should just pass the result to is_assignable(), though functions have a use for
* testing "strong" equality to decide the most precise overload to pick when multiple could match.
*/
virtual test_result test_assignment(database& db, const sstring& keyspace, ::shared_ptr<column_specification> receiver) const = 0;
virtual test_result test_assignment(database& db, const sstring& keyspace, lw_shared_ptr<column_specification> receiver) const = 0;
// for error reporting
virtual sstring assignment_testable_source_context() const = 0;

View File

@@ -135,12 +135,12 @@ std::unique_ptr<attributes> attributes::raw::prepare(database& db, const sstring
return std::unique_ptr<attributes>{new attributes{std::move(ts), std::move(ttl)}};
}
::shared_ptr<column_specification> attributes::raw::timestamp_receiver(const sstring& ks_name, const sstring& cf_name) const {
return ::make_shared<column_specification>(ks_name, cf_name, ::make_shared<column_identifier>("[timestamp]", true), data_type_for<int64_t>());
lw_shared_ptr<column_specification> attributes::raw::timestamp_receiver(const sstring& ks_name, const sstring& cf_name) const {
return make_lw_shared<column_specification>(ks_name, cf_name, ::make_shared<column_identifier>("[timestamp]", true), data_type_for<int64_t>());
}
::shared_ptr<column_specification> attributes::raw::time_to_live_receiver(const sstring& ks_name, const sstring& cf_name) const {
return ::make_shared<column_specification>(ks_name, cf_name, ::make_shared<column_identifier>("[ttl]", true), data_type_for<int32_t>());
lw_shared_ptr<column_specification> attributes::raw::time_to_live_receiver(const sstring& ks_name, const sstring& cf_name) const {
return make_lw_shared<column_specification>(ks_name, cf_name, ::make_shared<column_identifier>("[ttl]", true), data_type_for<int32_t>());
}
}

View File

@@ -78,9 +78,9 @@ public:
std::unique_ptr<attributes> prepare(database& db, const sstring& ks_name, const sstring& cf_name) const;
private:
::shared_ptr<column_specification> timestamp_receiver(const sstring& ks_name, const sstring& cf_name) const;
lw_shared_ptr<column_specification> timestamp_receiver(const sstring& ks_name, const sstring& cf_name) const;
::shared_ptr<column_specification> time_to_live_receiver(const sstring& ks_name, const sstring& cf_name) const;
lw_shared_ptr<column_specification> time_to_live_receiver(const sstring& ks_name, const sstring& cf_name) const;
};
};

View File

@@ -291,13 +291,13 @@ bool column_condition::applies_to(const data_value* cell_value, const query_opti
}
}
::shared_ptr<column_condition>
lw_shared_ptr<column_condition>
column_condition::raw::prepare(database& db, const sstring& keyspace, const column_definition& receiver) const {
if (receiver.type->is_counter()) {
throw exceptions::invalid_request_exception("Conditions on counters are not supported");
}
shared_ptr<term> collection_element_term;
shared_ptr<column_specification> value_spec = receiver.column_specification;
lw_shared_ptr<column_specification> value_spec = receiver.column_specification;
if (_collection_element) {
if (!receiver.type->is_collection()) {
@@ -306,7 +306,7 @@ column_condition::raw::prepare(database& db, const sstring& keyspace, const colu
}
// Pass a correct type specification to the collection_element->prepare(), so that it can
// later be used to validate the parameter type is compatible with receiver type.
shared_ptr<column_specification> element_spec;
lw_shared_ptr<column_specification> element_spec;
auto ctype = static_cast<const collection_type_impl*>(receiver.type.get());
const column_specification& recv_column_spec = *receiver.column_specification;
if (ctype->get_kind() == abstract_type::kind::list) {

View File

@@ -104,16 +104,16 @@ public:
* "IF col = 'foo'"
* "IF col LIKE <pattern>"
*/
static ::shared_ptr<column_condition> condition(const column_definition& def, ::shared_ptr<term> collection_element,
static lw_shared_ptr<column_condition> condition(const column_definition& def, ::shared_ptr<term> collection_element,
::shared_ptr<term> value, std::unique_ptr<like_matcher> matcher, const operator_type& op) {
return ::make_shared<column_condition>(def, std::move(collection_element), std::move(value),
return make_lw_shared<column_condition>(def, std::move(collection_element), std::move(value),
std::vector<::shared_ptr<term>>{}, std::move(matcher), op);
}
// Helper constructor wrapper for "IF col IN ..." and "IF col['key'] IN ..."
static ::shared_ptr<column_condition> in_condition(const column_definition& def, ::shared_ptr<term> collection_element,
static lw_shared_ptr<column_condition> in_condition(const column_definition& def, ::shared_ptr<term> collection_element,
::shared_ptr<term> in_marker, std::vector<::shared_ptr<term>> in_values) {
return ::make_shared<column_condition>(def, std::move(collection_element), std::move(in_marker),
return make_lw_shared<column_condition>(def, std::move(collection_element), std::move(in_marker),
std::move(in_values), nullptr, operator_type::IN);
}
@@ -146,9 +146,9 @@ public:
* "IF col = 'foo'"
* "IF col LIKE 'foo%'"
*/
static ::shared_ptr<raw> simple_condition(::shared_ptr<term::raw> value, ::shared_ptr<term::raw> collection_element,
static lw_shared_ptr<raw> simple_condition(::shared_ptr<term::raw> value, ::shared_ptr<term::raw> collection_element,
const operator_type& op) {
return ::make_shared<raw>(std::move(value), std::vector<::shared_ptr<term::raw>>{},
return make_lw_shared<raw>(std::move(value), std::vector<::shared_ptr<term::raw>>{},
::shared_ptr<abstract_marker::in_raw>{}, std::move(collection_element), op);
}
@@ -160,13 +160,13 @@ public:
* "IF col['key'] IN ('foo', 'bar', ...)"
* "IF col['key'] IN ?"
*/
static ::shared_ptr<raw> in_condition(::shared_ptr<term::raw> collection_element,
static lw_shared_ptr<raw> in_condition(::shared_ptr<term::raw> collection_element,
::shared_ptr<abstract_marker::in_raw> in_marker, std::vector<::shared_ptr<term::raw>> in_values) {
return ::make_shared<raw>(::shared_ptr<term::raw>{}, std::move(in_values), std::move(in_marker),
return make_lw_shared<raw>(::shared_ptr<term::raw>{}, std::move(in_values), std::move(in_marker),
std::move(collection_element), operator_type::IN);
}
::shared_ptr<column_condition> prepare(database& db, const sstring& keyspace, const column_definition& receiver) const;
lw_shared_ptr<column_condition> prepare(database& db, const sstring& keyspace, const column_definition& receiver) const;
};
};

View File

@@ -51,7 +51,7 @@ column_specification::column_specification(std::string_view ks_name_, std::strin
{ }
bool column_specification::all_in_same_table(const std::vector<::shared_ptr<column_specification>>& names)
bool column_specification::all_in_same_table(const std::vector<lw_shared_ptr<column_specification>>& names)
{
assert(!names.empty());

View File

@@ -45,7 +45,6 @@
namespace cql3 {
class column_specification;
class column_identifier;
class column_specification final {
@@ -63,15 +62,15 @@ public:
* @param alias the column alias
* @return a new <code>ColumnSpecification</code> for the same column but with the specified alias.
*/
::shared_ptr<column_specification> with_alias(::shared_ptr<column_identifier> alias) {
return ::make_shared<column_specification>(ks_name, cf_name, alias, type);
lw_shared_ptr<column_specification> with_alias(::shared_ptr<column_identifier> alias) {
return make_lw_shared<column_specification>(ks_name, cf_name, alias, type);
}
// Reports whether this specification's `type` can be dynamically cast to
// reversed_type_impl; returns false when the cast yields a null pointer.
bool is_reversed_type() const {
return ::dynamic_pointer_cast<const reversed_type_impl>(type) != nullptr;
}
static bool all_in_same_table(const std::vector<::shared_ptr<column_specification>>& names);
static bool all_in_same_table(const std::vector<lw_shared_ptr<column_specification>>& names);
};
}

View File

@@ -82,7 +82,7 @@ constants::literal::parsed_value(data_type validator) const
}
assignment_testable::test_result
constants::literal::test_assignment(database& db, const sstring& keyspace, ::shared_ptr<column_specification> receiver) const
constants::literal::test_assignment(database& db, const sstring& keyspace, lw_shared_ptr<column_specification> receiver) const
{
auto receiver_type = receiver->type->as_cql3_type();
if (receiver_type.is_collection() || receiver_type.is_user_type()) {
@@ -155,7 +155,7 @@ constants::literal::test_assignment(database& db, const sstring& keyspace, ::sha
}
::shared_ptr<term>
constants::literal::prepare(database& db, const sstring& keyspace, ::shared_ptr<column_specification> receiver) const
constants::literal::prepare(database& db, const sstring& keyspace, lw_shared_ptr<column_specification> receiver) const
{
if (!is_assignable(test_assignment(db, keyspace, receiver))) {
throw exceptions::invalid_request_exception(format("Invalid {} constant ({}) for \"{}\" of type {}",

View File

@@ -87,7 +87,7 @@ public:
};
public:
static thread_local const ::shared_ptr<terminal> NULL_VALUE;
virtual ::shared_ptr<term> prepare(database& db, const sstring& keyspace, ::shared_ptr<column_specification> receiver) const override {
virtual ::shared_ptr<term> prepare(database& db, const sstring& keyspace, lw_shared_ptr<column_specification> receiver) const override {
if (!is_assignable(test_assignment(db, keyspace, receiver))) {
throw exceptions::invalid_request_exception("Invalid null value for counter increment/decrement");
}
@@ -96,7 +96,7 @@ public:
virtual assignment_testable::test_result test_assignment(database& db,
const sstring& keyspace,
::shared_ptr<column_specification> receiver) const override {
lw_shared_ptr<column_specification> receiver) const override {
return receiver->type->is_counter()
? assignment_testable::test_result::NOT_ASSIGNABLE
: assignment_testable::test_result::WEAKLY_ASSIGNABLE;
@@ -153,7 +153,7 @@ public:
return ::make_shared<literal>(type::DURATION, text);
}
virtual ::shared_ptr<term> prepare(database& db, const sstring& keyspace, ::shared_ptr<column_specification> receiver) const override;
virtual ::shared_ptr<term> prepare(database& db, const sstring& keyspace, lw_shared_ptr<column_specification> receiver) const override;
private:
bytes parsed_value(data_type validator) const;
public:
@@ -161,7 +161,7 @@ public:
return _text;
}
virtual assignment_testable::test_result test_assignment(database& db, const sstring& keyspace, ::shared_ptr<column_specification> receiver) const;
virtual assignment_testable::test_result test_assignment(database& db, const sstring& keyspace, lw_shared_ptr<column_specification> receiver) const;
virtual sstring to_string() const override {
return _type == type::STRING ? sstring(format("'{}'", _text)) : _text;
@@ -170,7 +170,7 @@ public:
class marker : public abstract_marker {
public:
marker(int32_t bind_index, ::shared_ptr<column_specification> receiver)
marker(int32_t bind_index, lw_shared_ptr<column_specification> receiver)
: abstract_marker{bind_index, std::move(receiver)}
{
assert(!_receiver->type->is_collection() && !_receiver->type->is_user_type());

View File

@@ -267,10 +267,13 @@ public:
}
};
/// The same as `impl_max_function_for' but without knowledge of `Type'.
/// The same as `impl_max_function_for' but without compile-time dependency on `Type'.
class impl_max_dynamic_function final : public aggregate_function::aggregate {
data_type _io_type;
opt_bytes _max;
public:
impl_max_dynamic_function(data_type io_type) : _io_type(std::move(io_type)) {}
// Forgets any maximum accumulated so far by putting _max back into its
// empty state, so the next non-null input becomes the new maximum.
virtual void reset() override {
_max = {};
}
@@ -278,12 +281,11 @@ public:
return _max.value_or(bytes{});
}
virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
if (!values[0]) {
if (values.empty() || !values[0]) {
return;
}
const auto val = *values[0];
if (!_max || *_max < val) {
_max = val;
if (!_max || _io_type->less(*_max, *values[0])) {
_max = values[0];
}
}
};
@@ -298,10 +300,13 @@ public:
};
class max_dynamic_function final : public native_aggregate_function {
data_type _io_type;
public:
max_dynamic_function(data_type io_type) : native_aggregate_function("max", io_type, { io_type }) {}
max_dynamic_function(data_type io_type)
: native_aggregate_function("max", io_type, { io_type })
, _io_type(std::move(io_type)) {}
virtual std::unique_ptr<aggregate> new_aggregate() override {
return std::make_unique<impl_max_dynamic_function>();
return std::make_unique<impl_max_dynamic_function>(_io_type);
}
};
@@ -358,10 +363,13 @@ public:
}
};
/// The same as `impl_min_function_for' but without knowledge of `Type'.
/// The same as `impl_min_function_for' but without compile-time dependency on `Type'.
class impl_min_dynamic_function final : public aggregate_function::aggregate {
data_type _io_type;
opt_bytes _min;
public:
impl_min_dynamic_function(data_type io_type) : _io_type(std::move(io_type)) {}
// Forgets any minimum accumulated so far by putting _min back into its
// empty state, so the next non-null input becomes the new minimum.
virtual void reset() override {
_min = {};
}
@@ -369,12 +377,11 @@ public:
return _min.value_or(bytes{});
}
virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
if (!values[0]) {
if (values.empty() || !values[0]) {
return;
}
const auto val = *values[0];
if (!_min || val < *_min) {
_min = val;
if (!_min || _io_type->less(*values[0], *_min)) {
_min = values[0];
}
}
};
@@ -389,10 +396,13 @@ public:
};
class min_dynamic_function final : public native_aggregate_function {
data_type _io_type;
public:
min_dynamic_function(data_type io_type) : native_aggregate_function("min", io_type, { io_type }) {}
min_dynamic_function(data_type io_type)
: native_aggregate_function("min", io_type, { io_type })
, _io_type(std::move(io_type)) {}
virtual std::unique_ptr<aggregate> new_aggregate() override {
return std::make_unique<impl_min_dynamic_function>();
return std::make_unique<impl_min_dynamic_function>(_io_type);
}
};

View File

@@ -63,7 +63,7 @@ public:
};
shared_ptr<function> make_castas_function(data_type to_type, data_type from_type, castas_fctn func) {
return ::make_shared<castas_function_for>(std::move(to_type), std::move(from_type), std::move(func));
return ::make_shared<castas_function_for>(std::move(to_type), std::move(from_type), func);
}
} /* Anonymous Namespace */
@@ -73,88 +73,69 @@ shared_ptr<function> make_castas_function(data_type to_type, data_type from_type
*/
namespace {
// Cast function that performs no conversion: it hands back the value it was given.
static data_value identity_castas_fctn(data_value val) {
return val;
}
using bytes_opt = std::optional<bytes>;
template<typename ToType, typename FromType>
std::function<data_value(data_value)> make_castas_fctn_simple() {
return [](data_value from) -> data_value {
auto val_from = value_cast<FromType>(from);
return static_cast<ToType>(val_from);
};
static data_value castas_fctn_simple(data_value from) {
auto val_from = value_cast<FromType>(from);
return static_cast<ToType>(val_from);
}
template<typename ToType>
std::function<data_value(data_value)> make_castas_fctn_from_decimal_to_float() {
return [](data_value from) -> data_value {
auto val_from = value_cast<big_decimal>(from);
boost::multiprecision::cpp_int ten(10);
boost::multiprecision::cpp_rational r = val_from.unscaled_value();
r /= boost::multiprecision::pow(ten, val_from.scale());
return static_cast<ToType>(r);
};
static data_value castas_fctn_from_decimal_to_float(data_value from) {
auto val_from = value_cast<big_decimal>(from);
return static_cast<ToType>(val_from.as_rational());
}
static utils::multiprecision_int from_decimal_to_cppint(const data_value& from) {
const auto& val_from = value_cast<big_decimal>(from);
boost::multiprecision::cpp_int ten(10);
return boost::multiprecision::cpp_int(val_from.unscaled_value() / boost::multiprecision::pow(ten, val_from.scale()));
auto r = val_from.as_rational();
return utils::multiprecision_int(numerator(r)/denominator(r));
}
template<typename ToType>
std::function<data_value(data_value)> make_castas_fctn_from_varint_to_integer() {
return [](data_value from) -> data_value {
const auto& varint = value_cast<utils::multiprecision_int>(from);
return static_cast<ToType>(from_varint_to_integer(varint));
};
static data_value castas_fctn_from_varint_to_integer(data_value from) {
const auto& varint = value_cast<utils::multiprecision_int>(from);
return static_cast<ToType>(from_varint_to_integer(varint));
}
template<typename ToType>
std::function<data_value(data_value)> make_castas_fctn_from_decimal_to_integer() {
return [](data_value from) -> data_value {
auto varint = from_decimal_to_cppint(from);
return static_cast<ToType>(from_varint_to_integer(varint));
};
static data_value castas_fctn_from_decimal_to_integer(data_value from) {
auto varint = from_decimal_to_cppint(from);
return static_cast<ToType>(from_varint_to_integer(varint));
}
std::function<data_value(data_value)> make_castas_fctn_from_decimal_to_varint() {
return [](data_value from) -> data_value {
return from_decimal_to_cppint(from);
};
static data_value castas_fctn_from_decimal_to_varint(data_value from) {
return from_decimal_to_cppint(from);
}
template<typename FromType>
std::function<data_value(data_value)> make_castas_fctn_from_integer_to_decimal() {
return [](data_value from) -> data_value {
auto val_from = value_cast<FromType>(from);
return big_decimal(1, 10*static_cast<boost::multiprecision::cpp_int>(val_from));
};
static data_value castas_fctn_from_integer_to_decimal(data_value from) {
auto val_from = value_cast<FromType>(from);
return big_decimal(1, 10*static_cast<boost::multiprecision::cpp_int>(val_from));
}
template<typename FromType>
std::function<data_value(data_value)> make_castas_fctn_from_float_to_decimal() {
return [](data_value from) -> data_value {
auto val_from = value_cast<FromType>(from);
return big_decimal(boost::lexical_cast<std::string>(val_from));
};
static data_value castas_fctn_from_float_to_decimal(data_value from) {
auto val_from = value_cast<FromType>(from);
return big_decimal(boost::lexical_cast<std::string>(val_from));
}
template<typename FromType>
std::function<data_value(data_value)> make_castas_fctn_to_string() {
return [](data_value from) -> data_value {
return to_sstring(value_cast<FromType>(from));
};
static data_value castas_fctn_to_string(data_value from) {
return to_sstring(value_cast<FromType>(from));
}
std::function<data_value(data_value)> make_castas_fctn_from_varint_to_string() {
return [](data_value from) -> data_value {
return to_sstring(value_cast<utils::multiprecision_int>(from).str());
};
static data_value castas_fctn_from_varint_to_string(data_value from) {
return to_sstring(value_cast<utils::multiprecision_int>(from).str());
}
std::function<data_value(data_value)> make_castas_fctn_from_decimal_to_string() {
return [](data_value from) -> data_value {
return value_cast<big_decimal>(from).to_string();
};
static data_value castas_fctn_from_decimal_to_string(data_value from) {
return value_cast<big_decimal>(from).to_string();
}
db_clock::time_point millis_to_time_point(const int64_t millis) {
@@ -177,178 +158,237 @@ db_clock::time_point date_to_time_point(const uint32_t date) {
return db_clock::time_point(std::chrono::duration_cast<db_clock::duration>(millis));
}
std::function<data_value(data_value)> make_castas_fctn_from_timestamp_to_date() {
return [](data_value from) -> data_value {
const auto val_from = value_cast<db_clock::time_point>(from);
return time_point_to_date(val_from);
};
static data_value castas_fctn_from_timestamp_to_date(data_value from) {
const auto val_from = value_cast<db_clock::time_point>(from);
return time_point_to_date(val_from);
}
std::function<data_value(data_value)> make_castas_fctn_from_date_to_timestamp() {
return [](data_value from) -> data_value {
const auto val_from = value_cast<uint32_t>(from);
return date_to_time_point(val_from);
};
static data_value castas_fctn_from_date_to_timestamp(data_value from) {
const auto val_from = value_cast<uint32_t>(from);
return date_to_time_point(val_from);
}
std::function<data_value(data_value)> make_castas_fctn_from_timeuuid_to_timestamp() {
return [](data_value from) -> data_value {
const auto val_from = value_cast<utils::UUID>(from);
return db_clock::time_point{db_clock::duration{utils::UUID_gen::unix_timestamp(val_from)}};
};
// Cast helper: timeuuid -> timestamp.
// Extracts the UUID's embedded unix timestamp via UUID_gen::unix_timestamp()
// and builds a db_clock::time_point from it.
// NOTE(review): this relies on unix_timestamp() being expressed in
// db_clock::duration ticks (presumably milliseconds) -- confirm.
static data_value castas_fctn_from_timeuuid_to_timestamp(data_value from) {
    const auto since_epoch = db_clock::duration{utils::UUID_gen::unix_timestamp(value_cast<utils::UUID>(from))};
    return db_clock::time_point{since_epoch};
}
std::function<data_value(data_value)> make_castas_fctn_from_timeuuid_to_date() {
return [](data_value from) -> data_value {
const auto val_from = value_cast<utils::UUID>(from);
return time_point_to_date(millis_to_time_point(utils::UUID_gen::unix_timestamp(val_from)));
};
// Cast helper: timeuuid -> date.
// Three steps: pull the unix timestamp out of the UUID, turn it into a
// db_clock time point with millis_to_time_point(), then reduce that to a
// date with time_point_to_date().
static data_value castas_fctn_from_timeuuid_to_date(data_value from) {
    const auto millis = utils::UUID_gen::unix_timestamp(value_cast<utils::UUID>(from));
    const auto when = millis_to_time_point(millis);
    return time_point_to_date(when);
}
static std::function<data_value(data_value)> make_castas_fctn_from_dv_to_string() {
return [](data_value from) -> data_value {
return from.type()->to_string_impl(from);
};
// Generic cast helper: any data_value -> string.
// Delegates the formatting to the value's own type object through
// to_string_impl(), so it needs no knowledge of the concrete native type.
static data_value castas_fctn_from_dv_to_string(data_value from) {
    const auto& source_type = from.type();
    return source_type->to_string_impl(from);
}
// FIXME: Add conversions for counters, after they are fully implemented...
// Map <ToType, FromType> -> castas_fctn
using castas_fctn_key = std::pair<data_type, data_type>;
struct castas_fctn_hash {
std::size_t operator()(const castas_fctn_key& x) const noexcept {
return boost::hash_value(x);
static constexpr unsigned next_power_of_2(unsigned val) {
unsigned ret = 1;
while (ret <= val) {
ret *= 2;
}
};
using castas_fctns_map = std::unordered_map<castas_fctn_key, castas_fctn, castas_fctn_hash>;
// List of supported castas functions...
thread_local castas_fctns_map castas_fctns {
{ {byte_type, byte_type}, make_castas_fctn_simple<int8_t, int8_t>() },
{ {byte_type, short_type}, make_castas_fctn_simple<int8_t, int16_t>() },
{ {byte_type, int32_type}, make_castas_fctn_simple<int8_t, int32_t>() },
{ {byte_type, long_type}, make_castas_fctn_simple<int8_t, int64_t>() },
{ {byte_type, float_type}, make_castas_fctn_simple<int8_t, float>() },
{ {byte_type, double_type}, make_castas_fctn_simple<int8_t, double>() },
{ {byte_type, varint_type}, make_castas_fctn_from_varint_to_integer<int8_t>() },
{ {byte_type, decimal_type}, make_castas_fctn_from_decimal_to_integer<int8_t>() },
{ {short_type, byte_type}, make_castas_fctn_simple<int16_t, int8_t>() },
{ {short_type, short_type}, make_castas_fctn_simple<int16_t, int16_t>() },
{ {short_type, int32_type}, make_castas_fctn_simple<int16_t, int32_t>() },
{ {short_type, long_type}, make_castas_fctn_simple<int16_t, int64_t>() },
{ {short_type, float_type}, make_castas_fctn_simple<int16_t, float>() },
{ {short_type, double_type}, make_castas_fctn_simple<int16_t, double>() },
{ {short_type, varint_type}, make_castas_fctn_from_varint_to_integer<int16_t>() },
{ {short_type, decimal_type}, make_castas_fctn_from_decimal_to_integer<int16_t>() },
{ {int32_type, byte_type}, make_castas_fctn_simple<int32_t, int8_t>() },
{ {int32_type, short_type}, make_castas_fctn_simple<int32_t, int16_t>() },
{ {int32_type, int32_type}, make_castas_fctn_simple<int32_t, int32_t>() },
{ {int32_type, long_type}, make_castas_fctn_simple<int32_t, int64_t>() },
{ {int32_type, float_type}, make_castas_fctn_simple<int32_t, float>() },
{ {int32_type, double_type}, make_castas_fctn_simple<int32_t, double>() },
{ {int32_type, varint_type}, make_castas_fctn_from_varint_to_integer<int32_t>() },
{ {int32_type, decimal_type}, make_castas_fctn_from_decimal_to_integer<int32_t>() },
{ {long_type, byte_type}, make_castas_fctn_simple<int64_t, int8_t>() },
{ {long_type, short_type}, make_castas_fctn_simple<int64_t, int16_t>() },
{ {long_type, int32_type}, make_castas_fctn_simple<int64_t, int32_t>() },
{ {long_type, long_type}, make_castas_fctn_simple<int64_t, int64_t>() },
{ {long_type, float_type}, make_castas_fctn_simple<int64_t, float>() },
{ {long_type, double_type}, make_castas_fctn_simple<int64_t, double>() },
{ {long_type, varint_type}, make_castas_fctn_from_varint_to_integer<int64_t>() },
{ {long_type, decimal_type}, make_castas_fctn_from_decimal_to_integer<int64_t>() },
{ {float_type, byte_type}, make_castas_fctn_simple<float, int8_t>() },
{ {float_type, short_type}, make_castas_fctn_simple<float, int16_t>() },
{ {float_type, int32_type}, make_castas_fctn_simple<float, int32_t>() },
{ {float_type, long_type}, make_castas_fctn_simple<float, int64_t>() },
{ {float_type, float_type}, make_castas_fctn_simple<float, float>() },
{ {float_type, double_type}, make_castas_fctn_simple<float, double>() },
{ {float_type, varint_type}, make_castas_fctn_simple<float, utils::multiprecision_int>() },
{ {float_type, decimal_type}, make_castas_fctn_from_decimal_to_float<float>() },
{ {double_type, byte_type}, make_castas_fctn_simple<double, int8_t>() },
{ {double_type, short_type}, make_castas_fctn_simple<double, int16_t>() },
{ {double_type, int32_type}, make_castas_fctn_simple<double, int32_t>() },
{ {double_type, long_type}, make_castas_fctn_simple<double, int64_t>() },
{ {double_type, float_type}, make_castas_fctn_simple<double, float>() },
{ {double_type, double_type}, make_castas_fctn_simple<double, double>() },
{ {double_type, varint_type}, make_castas_fctn_simple<double, utils::multiprecision_int>() },
{ {double_type, decimal_type}, make_castas_fctn_from_decimal_to_float<double>() },
{ {varint_type, byte_type}, make_castas_fctn_simple<utils::multiprecision_int, int8_t>() },
{ {varint_type, short_type}, make_castas_fctn_simple<utils::multiprecision_int, int16_t>() },
{ {varint_type, int32_type}, make_castas_fctn_simple<utils::multiprecision_int, int32_t>() },
{ {varint_type, long_type}, make_castas_fctn_simple<utils::multiprecision_int, int64_t>() },
{ {varint_type, float_type}, make_castas_fctn_simple<utils::multiprecision_int, float>() },
{ {varint_type, double_type}, make_castas_fctn_simple<utils::multiprecision_int, double>() },
{ {varint_type, varint_type}, make_castas_fctn_simple<utils::multiprecision_int, utils::multiprecision_int>() },
{ {varint_type, decimal_type}, make_castas_fctn_from_decimal_to_varint() },
{ {decimal_type, byte_type}, make_castas_fctn_from_integer_to_decimal<int8_t>() },
{ {decimal_type, short_type}, make_castas_fctn_from_integer_to_decimal<int16_t>() },
{ {decimal_type, int32_type}, make_castas_fctn_from_integer_to_decimal<int32_t>() },
{ {decimal_type, long_type}, make_castas_fctn_from_integer_to_decimal<int64_t>() },
{ {decimal_type, float_type}, make_castas_fctn_from_float_to_decimal<float>() },
{ {decimal_type, double_type}, make_castas_fctn_from_float_to_decimal<double>() },
{ {decimal_type, varint_type}, make_castas_fctn_from_integer_to_decimal<utils::multiprecision_int>() },
{ {decimal_type, decimal_type}, make_castas_fctn_simple<big_decimal, big_decimal>() },
{ {ascii_type, byte_type}, make_castas_fctn_to_string<int8_t>() },
{ {ascii_type, short_type}, make_castas_fctn_to_string<int16_t>() },
{ {ascii_type, int32_type}, make_castas_fctn_to_string<int32_t>() },
{ {ascii_type, long_type}, make_castas_fctn_to_string<int64_t>() },
{ {ascii_type, float_type}, make_castas_fctn_to_string<float>() },
{ {ascii_type, double_type}, make_castas_fctn_to_string<double>() },
{ {ascii_type, varint_type}, make_castas_fctn_from_varint_to_string() },
{ {ascii_type, decimal_type}, make_castas_fctn_from_decimal_to_string() },
{ {utf8_type, byte_type}, make_castas_fctn_to_string<int8_t>() },
{ {utf8_type, short_type}, make_castas_fctn_to_string<int16_t>() },
{ {utf8_type, int32_type}, make_castas_fctn_to_string<int32_t>() },
{ {utf8_type, long_type}, make_castas_fctn_to_string<int64_t>() },
{ {utf8_type, float_type}, make_castas_fctn_to_string<float>() },
{ {utf8_type, double_type}, make_castas_fctn_to_string<double>() },
{ {utf8_type, varint_type}, make_castas_fctn_from_varint_to_string() },
{ {utf8_type, decimal_type}, make_castas_fctn_from_decimal_to_string() },
{ {simple_date_type, timestamp_type}, make_castas_fctn_from_timestamp_to_date() },
{ {simple_date_type, timeuuid_type}, make_castas_fctn_from_timeuuid_to_date() },
{ {timestamp_type, simple_date_type}, make_castas_fctn_from_date_to_timestamp() },
{ {timestamp_type, timeuuid_type}, make_castas_fctn_from_timeuuid_to_timestamp() },
{ {ascii_type, timestamp_type}, make_castas_fctn_from_dv_to_string() },
{ {ascii_type, simple_date_type}, make_castas_fctn_from_dv_to_string() },
{ {ascii_type, time_type}, make_castas_fctn_from_dv_to_string() },
{ {ascii_type, timeuuid_type}, make_castas_fctn_from_dv_to_string() },
{ {ascii_type, uuid_type}, make_castas_fctn_from_dv_to_string() },
{ {ascii_type, boolean_type}, make_castas_fctn_from_dv_to_string() },
{ {ascii_type, inet_addr_type}, make_castas_fctn_from_dv_to_string() },
{ {ascii_type, ascii_type}, make_castas_fctn_simple<sstring, sstring>() },
{ {utf8_type, timestamp_type}, make_castas_fctn_from_dv_to_string() },
{ {utf8_type, simple_date_type}, make_castas_fctn_from_dv_to_string() },
{ {utf8_type, time_type}, make_castas_fctn_from_dv_to_string() },
{ {utf8_type, timeuuid_type}, make_castas_fctn_from_dv_to_string() },
{ {utf8_type, uuid_type}, make_castas_fctn_from_dv_to_string() },
{ {utf8_type, boolean_type}, make_castas_fctn_from_dv_to_string() },
{ {utf8_type, inet_addr_type}, make_castas_fctn_from_dv_to_string() },
{ {utf8_type, ascii_type}, make_castas_fctn_simple<sstring, sstring>() },
{ {utf8_type, utf8_type}, make_castas_fctn_simple<sstring, sstring>() },
};
return ret;
}
// Multiplier used to pack two abstract_type::kind values into one integer:
// next_power_of_2() applied to the numeric value of kind::last.
static constexpr unsigned next_kind_power_of_2 = next_power_of_2(static_cast<unsigned>(abstract_type::kind::last));

// Combines a (to, from) pair of kinds into a single unsigned value that can
// be evaluated at compile time, e.g. as a `case` label.
// A is scaled by next_kind_power_of_2 and B is added on top.
static constexpr unsigned cast_switch_case_val(abstract_type::kind A, abstract_type::kind B) {
    const unsigned scaled_to = static_cast<unsigned>(A) * next_kind_power_of_2;
    const unsigned from_part = static_cast<unsigned>(B);
    return scaled_to + from_part;
}
} /* Anonymous Namespace */
castas_fctn get_castas_fctn(data_type to_type, data_type from_type) {
auto it_candidate = castas_fctns.find(castas_fctn_key{to_type, from_type});
if (it_candidate == castas_fctns.end()) {
throw exceptions::invalid_request_exception(format("{} cannot be cast to {}", from_type->name(), to_type->name()));
if (from_type == to_type) {
// Casting any type to itself doesn't make sense, but it is
// harmless so allow it instead of reporting a confusing error
// message about TypeX not being castable to TypeX.
return identity_castas_fctn;
}
return it_candidate->second;
using kind = abstract_type::kind;
switch(cast_switch_case_val(to_type->get_kind(), from_type->get_kind())) {
case cast_switch_case_val(kind::byte, kind::short_kind):
return castas_fctn_simple<int8_t, int16_t>;
case cast_switch_case_val(kind::byte, kind::int32):
return castas_fctn_simple<int8_t, int32_t>;
case cast_switch_case_val(kind::byte, kind::long_kind):
return castas_fctn_simple<int8_t, int64_t>;
case cast_switch_case_val(kind::byte, kind::float_kind):
return castas_fctn_simple<int8_t, float>;
case cast_switch_case_val(kind::byte, kind::double_kind):
return castas_fctn_simple<int8_t, double>;
case cast_switch_case_val(kind::byte, kind::varint):
return castas_fctn_from_varint_to_integer<int8_t>;
case cast_switch_case_val(kind::byte, kind::decimal):
return castas_fctn_from_decimal_to_integer<int8_t>;
case cast_switch_case_val(kind::short_kind, kind::byte):
return castas_fctn_simple<int16_t, int8_t>;
case cast_switch_case_val(kind::short_kind, kind::int32):
return castas_fctn_simple<int16_t, int32_t>;
case cast_switch_case_val(kind::short_kind, kind::long_kind):
return castas_fctn_simple<int16_t, int64_t>;
case cast_switch_case_val(kind::short_kind, kind::float_kind):
return castas_fctn_simple<int16_t, float>;
case cast_switch_case_val(kind::short_kind, kind::double_kind):
return castas_fctn_simple<int16_t, double>;
case cast_switch_case_val(kind::short_kind, kind::varint):
return castas_fctn_from_varint_to_integer<int16_t>;
case cast_switch_case_val(kind::short_kind, kind::decimal):
return castas_fctn_from_decimal_to_integer<int16_t>;
case cast_switch_case_val(kind::int32, kind::byte):
return castas_fctn_simple<int32_t, int8_t>;
case cast_switch_case_val(kind::int32, kind::short_kind):
return castas_fctn_simple<int32_t, int16_t>;
case cast_switch_case_val(kind::int32, kind::long_kind):
return castas_fctn_simple<int32_t, int64_t>;
case cast_switch_case_val(kind::int32, kind::float_kind):
return castas_fctn_simple<int32_t, float>;
case cast_switch_case_val(kind::int32, kind::double_kind):
return castas_fctn_simple<int32_t, double>;
case cast_switch_case_val(kind::int32, kind::varint):
return castas_fctn_from_varint_to_integer<int32_t>;
case cast_switch_case_val(kind::int32, kind::decimal):
return castas_fctn_from_decimal_to_integer<int32_t>;
case cast_switch_case_val(kind::long_kind, kind::byte):
return castas_fctn_simple<int64_t, int8_t>;
case cast_switch_case_val(kind::long_kind, kind::short_kind):
return castas_fctn_simple<int64_t, int16_t>;
case cast_switch_case_val(kind::long_kind, kind::int32):
return castas_fctn_simple<int64_t, int32_t>;
case cast_switch_case_val(kind::long_kind, kind::float_kind):
return castas_fctn_simple<int64_t, float>;
case cast_switch_case_val(kind::long_kind, kind::double_kind):
return castas_fctn_simple<int64_t, double>;
case cast_switch_case_val(kind::long_kind, kind::varint):
return castas_fctn_from_varint_to_integer<int64_t>;
case cast_switch_case_val(kind::long_kind, kind::decimal):
return castas_fctn_from_decimal_to_integer<int64_t>;
case cast_switch_case_val(kind::float_kind, kind::byte):
return castas_fctn_simple<float, int8_t>;
case cast_switch_case_val(kind::float_kind, kind::short_kind):
return castas_fctn_simple<float, int16_t>;
case cast_switch_case_val(kind::float_kind, kind::int32):
return castas_fctn_simple<float, int32_t>;
case cast_switch_case_val(kind::float_kind, kind::long_kind):
return castas_fctn_simple<float, int64_t>;
case cast_switch_case_val(kind::float_kind, kind::double_kind):
return castas_fctn_simple<float, double>;
case cast_switch_case_val(kind::float_kind, kind::varint):
return castas_fctn_simple<float, utils::multiprecision_int>;
case cast_switch_case_val(kind::float_kind, kind::decimal):
return castas_fctn_from_decimal_to_float<float>;
case cast_switch_case_val(kind::double_kind, kind::byte):
return castas_fctn_simple<double, int8_t>;
case cast_switch_case_val(kind::double_kind, kind::short_kind):
return castas_fctn_simple<double, int16_t>;
case cast_switch_case_val(kind::double_kind, kind::int32):
return castas_fctn_simple<double, int32_t>;
case cast_switch_case_val(kind::double_kind, kind::long_kind):
return castas_fctn_simple<double, int64_t>;
case cast_switch_case_val(kind::double_kind, kind::float_kind):
return castas_fctn_simple<double, float>;
case cast_switch_case_val(kind::double_kind, kind::varint):
return castas_fctn_simple<double, utils::multiprecision_int>;
case cast_switch_case_val(kind::double_kind, kind::decimal):
return castas_fctn_from_decimal_to_float<double>;
case cast_switch_case_val(kind::varint, kind::byte):
return castas_fctn_simple<utils::multiprecision_int, int8_t>;
case cast_switch_case_val(kind::varint, kind::short_kind):
return castas_fctn_simple<utils::multiprecision_int, int16_t>;
case cast_switch_case_val(kind::varint, kind::int32):
return castas_fctn_simple<utils::multiprecision_int, int32_t>;
case cast_switch_case_val(kind::varint, kind::long_kind):
return castas_fctn_simple<utils::multiprecision_int, int64_t>;
case cast_switch_case_val(kind::varint, kind::float_kind):
return castas_fctn_simple<utils::multiprecision_int, float>;
case cast_switch_case_val(kind::varint, kind::double_kind):
return castas_fctn_simple<utils::multiprecision_int, double>;
case cast_switch_case_val(kind::varint, kind::decimal):
return castas_fctn_from_decimal_to_varint;
case cast_switch_case_val(kind::decimal, kind::byte):
return castas_fctn_from_integer_to_decimal<int8_t>;
case cast_switch_case_val(kind::decimal, kind::short_kind):
return castas_fctn_from_integer_to_decimal<int16_t>;
case cast_switch_case_val(kind::decimal, kind::int32):
return castas_fctn_from_integer_to_decimal<int32_t>;
case cast_switch_case_val(kind::decimal, kind::long_kind):
return castas_fctn_from_integer_to_decimal<int64_t>;
case cast_switch_case_val(kind::decimal, kind::float_kind):
return castas_fctn_from_float_to_decimal<float>;
case cast_switch_case_val(kind::decimal, kind::double_kind):
return castas_fctn_from_float_to_decimal<double>;
case cast_switch_case_val(kind::decimal, kind::varint):
return castas_fctn_from_integer_to_decimal<utils::multiprecision_int>;
case cast_switch_case_val(kind::ascii, kind::byte):
case cast_switch_case_val(kind::utf8, kind::byte):
return castas_fctn_to_string<int8_t>;
case cast_switch_case_val(kind::ascii, kind::short_kind):
case cast_switch_case_val(kind::utf8, kind::short_kind):
return castas_fctn_to_string<int16_t>;
case cast_switch_case_val(kind::ascii, kind::int32):
case cast_switch_case_val(kind::utf8, kind::int32):
return castas_fctn_to_string<int32_t>;
case cast_switch_case_val(kind::ascii, kind::long_kind):
case cast_switch_case_val(kind::utf8, kind::long_kind):
return castas_fctn_to_string<int64_t>;
case cast_switch_case_val(kind::ascii, kind::float_kind):
case cast_switch_case_val(kind::utf8, kind::float_kind):
return castas_fctn_to_string<float>;
case cast_switch_case_val(kind::ascii, kind::double_kind):
case cast_switch_case_val(kind::utf8, kind::double_kind):
return castas_fctn_to_string<double>;
case cast_switch_case_val(kind::ascii, kind::varint):
case cast_switch_case_val(kind::utf8, kind::varint):
return castas_fctn_from_varint_to_string;
case cast_switch_case_val(kind::ascii, kind::decimal):
case cast_switch_case_val(kind::utf8, kind::decimal):
return castas_fctn_from_decimal_to_string;
case cast_switch_case_val(kind::simple_date, kind::timestamp):
return castas_fctn_from_timestamp_to_date;
case cast_switch_case_val(kind::simple_date, kind::timeuuid):
return castas_fctn_from_timeuuid_to_date;
case cast_switch_case_val(kind::timestamp, kind::simple_date):
return castas_fctn_from_date_to_timestamp;
case cast_switch_case_val(kind::timestamp, kind::timeuuid):
return castas_fctn_from_timeuuid_to_timestamp;
case cast_switch_case_val(kind::ascii, kind::timestamp):
case cast_switch_case_val(kind::ascii, kind::simple_date):
case cast_switch_case_val(kind::ascii, kind::time):
case cast_switch_case_val(kind::ascii, kind::timeuuid):
case cast_switch_case_val(kind::ascii, kind::uuid):
case cast_switch_case_val(kind::ascii, kind::boolean):
case cast_switch_case_val(kind::ascii, kind::inet):
case cast_switch_case_val(kind::utf8, kind::timestamp):
case cast_switch_case_val(kind::utf8, kind::simple_date):
case cast_switch_case_val(kind::utf8, kind::time):
case cast_switch_case_val(kind::utf8, kind::timeuuid):
case cast_switch_case_val(kind::utf8, kind::uuid):
case cast_switch_case_val(kind::utf8, kind::boolean):
case cast_switch_case_val(kind::utf8, kind::inet):
return castas_fctn_from_dv_to_string;
case cast_switch_case_val(kind::utf8, kind::ascii):
return castas_fctn_simple<sstring, sstring>;
}
throw exceptions::invalid_request_exception(format("{} cannot be cast to {}", from_type->name(), to_type->name()));
}
shared_ptr<function> castas_functions::get(data_type to_type, const std::vector<shared_ptr<cql3::selection::selector>>& provided_args) {

View File

@@ -58,7 +58,7 @@ namespace functions {
* Support for CAST(. AS .) functions.
*/
using castas_fctn = std::function<data_value(data_value)>;
using castas_fctn = data_value(*)(data_value);
castas_fctn get_castas_fctn(data_type to_type, data_type from_type);

View File

@@ -74,12 +74,12 @@ public:
raw(function_name name, std::vector<shared_ptr<term::raw>> terms)
: _name(std::move(name)), _terms(std::move(terms)) {
}
virtual ::shared_ptr<term> prepare(database& db, const sstring& keyspace, ::shared_ptr<column_specification> receiver) const override;
virtual ::shared_ptr<term> prepare(database& db, const sstring& keyspace, lw_shared_ptr<column_specification> receiver) const override;
private:
// All parameters must be terminal
static bytes_opt execute(scalar_function& fun, std::vector<shared_ptr<term>> parameters);
public:
virtual assignment_testable::test_result test_assignment(database& db, const sstring& keyspace, shared_ptr<column_specification> receiver) const override;
virtual assignment_testable::test_result test_assignment(database& db, const sstring& keyspace, lw_shared_ptr<column_specification> receiver) const override;
virtual sstring to_string() const override;
};
};

View File

@@ -141,12 +141,12 @@ void functions::remove_function(const function_name& name, const std::vector<dat
with_udf_iter(name, arg_types, [] (functions::declared_t::iterator i) { _declared.erase(i); });
}
shared_ptr<column_specification>
lw_shared_ptr<column_specification>
functions::make_arg_spec(const sstring& receiver_ks, const sstring& receiver_cf,
const function& fun, size_t i) {
auto&& name = boost::lexical_cast<std::string>(fun.name());
std::transform(name.begin(), name.end(), name.begin(), ::tolower);
return ::make_shared<column_specification>(receiver_ks,
return make_lw_shared<column_specification>(receiver_ks,
receiver_cf,
::make_shared<column_identifier>(format("arg{:d}({})", i, name), true),
fun.arg_types()[i]);
@@ -187,7 +187,7 @@ functions::get(database& db,
const std::vector<shared_ptr<assignment_testable>>& provided_args,
const sstring& receiver_ks,
const sstring& receiver_cf,
shared_ptr<column_specification> receiver) {
lw_shared_ptr<column_specification> receiver) {
static const function_name TOKEN_FUNCTION_NAME = function_name::native_function("token");
static const function_name TO_JSON_FUNCTION_NAME = function_name::native_function("tojson");
@@ -507,7 +507,7 @@ function_call::make_terminal(shared_ptr<function> fun, cql3::raw_value result, c
}
::shared_ptr<term>
function_call::raw::prepare(database& db, const sstring& keyspace, ::shared_ptr<column_specification> receiver) const {
function_call::raw::prepare(database& db, const sstring& keyspace, lw_shared_ptr<column_specification> receiver) const {
std::vector<shared_ptr<assignment_testable>> args;
args.reserve(_terms.size());
std::transform(_terms.begin(), _terms.end(), std::back_inserter(args),
@@ -572,7 +572,7 @@ function_call::raw::execute(scalar_function& fun, std::vector<shared_ptr<term>>
}
assignment_testable::test_result
function_call::raw::test_assignment(database& db, const sstring& keyspace, shared_ptr<column_specification> receiver) const {
function_call::raw::test_assignment(database& db, const sstring& keyspace, lw_shared_ptr<column_specification> receiver) const {
// Note: Functions.get() will return null if the function doesn't exist, or throw if no function matching
// the arguments can be found. We may get one of those if an undefined/wrong function is used as argument
// of another, existing, function. In that case, we return true here because we'll throw a proper exception

View File

@@ -67,7 +67,7 @@ class functions {
private:
static std::unordered_multimap<function_name, shared_ptr<function>> init();
public:
static shared_ptr<column_specification> make_arg_spec(const sstring& receiver_ks, const sstring& receiver_cf,
static lw_shared_ptr<column_specification> make_arg_spec(const sstring& receiver_ks, const sstring& receiver_cf,
const function& fun, size_t i);
static int get_overload_count(const function_name& name);
public:
@@ -77,7 +77,7 @@ public:
const std::vector<shared_ptr<assignment_testable>>& provided_args,
const sstring& receiver_ks,
const sstring& receiver_cf,
::shared_ptr<column_specification> receiver = nullptr);
lw_shared_ptr<column_specification> receiver = nullptr);
template <typename AssignmentTestablePtrRange>
static shared_ptr<function> get(database& db,
const sstring& keyspace,
@@ -85,7 +85,7 @@ public:
AssignmentTestablePtrRange&& provided_args,
const sstring& receiver_ks,
const sstring& receiver_cf,
::shared_ptr<column_specification> receiver = nullptr) {
lw_shared_ptr<column_specification> receiver = nullptr) {
const std::vector<shared_ptr<assignment_testable>> args(std::begin(provided_args), std::end(provided_args));
return get(db, keyspace, name, args, receiver_ks, receiver_cf, receiver);
}

View File

@@ -30,28 +30,28 @@
namespace cql3 {
shared_ptr<column_specification>
lw_shared_ptr<column_specification>
lists::index_spec_of(const column_specification& column) {
return ::make_shared<column_specification>(column.ks_name, column.cf_name,
return make_lw_shared<column_specification>(column.ks_name, column.cf_name,
::make_shared<column_identifier>(format("idx({})", *column.name), true), int32_type);
}
shared_ptr<column_specification>
lw_shared_ptr<column_specification>
lists::value_spec_of(const column_specification& column) {
return ::make_shared<column_specification>(column.ks_name, column.cf_name,
return make_lw_shared<column_specification>(column.ks_name, column.cf_name,
::make_shared<column_identifier>(format("value({})", *column.name), true),
dynamic_pointer_cast<const list_type_impl>(column.type)->get_elements_type());
}
shared_ptr<column_specification>
lw_shared_ptr<column_specification>
lists::uuid_index_spec_of(const column_specification& column) {
return ::make_shared<column_specification>(column.ks_name, column.cf_name,
return make_lw_shared<column_specification>(column.ks_name, column.cf_name,
::make_shared<column_identifier>(format("uuid_idx({})", *column.name), true), uuid_type);
}
shared_ptr<term>
lists::literal::prepare(database& db, const sstring& keyspace, shared_ptr<column_specification> receiver) const {
lists::literal::prepare(database& db, const sstring& keyspace, lw_shared_ptr<column_specification> receiver) const {
validate_assignable_to(db, keyspace, *receiver);
// In Cassandra, an empty (unfrozen) map/set/list is equivalent to the column being null. In
@@ -101,7 +101,7 @@ lists::literal::validate_assignable_to(database& db, const sstring keyspace, con
}
assignment_testable::test_result
lists::literal::test_assignment(database& db, const sstring& keyspace, shared_ptr<column_specification> receiver) const {
lists::literal::test_assignment(database& db, const sstring& keyspace, lw_shared_ptr<column_specification> receiver) const {
if (!dynamic_pointer_cast<const list_type_impl>(receiver->type)) {
return assignment_testable::test_result::NOT_ASSIGNABLE;
}
@@ -357,7 +357,12 @@ lists::setter_by_uuid::execute(mutation& m, const clustering_key_prefix& prefix,
collection_mutation_description mut;
mut.cells.reserve(1);
mut.cells.emplace_back(to_bytes(*index), params.make_cell(*ltype->value_comparator(), *value, atomic_cell::collection_member::yes));
if (!value) {
mut.cells.emplace_back(to_bytes(*index), params.make_dead_cell());
} else {
mut.cells.emplace_back(to_bytes(*index), params.make_cell(*ltype->value_comparator(), *value, atomic_cell::collection_member::yes));
}
m.set_cell(prefix, column, mut.serialize(*ltype));
}

View File

@@ -54,9 +54,9 @@ namespace cql3 {
class lists {
lists() = delete;
public:
static shared_ptr<column_specification> index_spec_of(const column_specification&);
static shared_ptr<column_specification> value_spec_of(const column_specification&);
static shared_ptr<column_specification> uuid_index_spec_of(const column_specification&);
static lw_shared_ptr<column_specification> index_spec_of(const column_specification&);
static lw_shared_ptr<column_specification> value_spec_of(const column_specification&);
static lw_shared_ptr<column_specification> uuid_index_spec_of(const column_specification&);
class literal : public term::raw {
const std::vector<shared_ptr<term::raw>> _elements;
@@ -64,11 +64,11 @@ public:
explicit literal(std::vector<shared_ptr<term::raw>> elements)
: _elements(std::move(elements)) {
}
virtual shared_ptr<term> prepare(database& db, const sstring& keyspace, shared_ptr<column_specification> receiver) const override;
virtual shared_ptr<term> prepare(database& db, const sstring& keyspace, lw_shared_ptr<column_specification> receiver) const override;
private:
void validate_assignable_to(database& db, const sstring keyspace, const column_specification& receiver) const;
public:
virtual assignment_testable::test_result test_assignment(database& db, const sstring& keyspace, shared_ptr<column_specification> receiver) const override;
virtual assignment_testable::test_result test_assignment(database& db, const sstring& keyspace, lw_shared_ptr<column_specification> receiver) const override;
virtual sstring to_string() const override;
};
@@ -113,7 +113,7 @@ public:
*/
class marker : public abstract_marker {
public:
marker(int32_t bind_index, ::shared_ptr<column_specification> receiver)
marker(int32_t bind_index, lw_shared_ptr<column_specification> receiver)
: abstract_marker{bind_index, std::move(receiver)}
{ }
virtual ::shared_ptr<terminal> bind(const query_options& options) override;

View File

@@ -51,22 +51,22 @@
namespace cql3 {
shared_ptr<column_specification>
lw_shared_ptr<column_specification>
maps::key_spec_of(const column_specification& column) {
return ::make_shared<column_specification>(column.ks_name, column.cf_name,
return make_lw_shared<column_specification>(column.ks_name, column.cf_name,
::make_shared<column_identifier>(format("key({})", *column.name), true),
dynamic_pointer_cast<const map_type_impl>(column.type)->get_keys_type());
}
shared_ptr<column_specification>
lw_shared_ptr<column_specification>
maps::value_spec_of(const column_specification& column) {
return ::make_shared<column_specification>(column.ks_name, column.cf_name,
return make_lw_shared<column_specification>(column.ks_name, column.cf_name,
::make_shared<column_identifier>(format("value({})", *column.name), true),
dynamic_pointer_cast<const map_type_impl>(column.type)->get_values_type());
}
::shared_ptr<term>
maps::literal::prepare(database& db, const sstring& keyspace, ::shared_ptr<column_specification> receiver) const {
maps::literal::prepare(database& db, const sstring& keyspace, lw_shared_ptr<column_specification> receiver) const {
validate_assignable_to(db, keyspace, *receiver);
auto key_spec = maps::key_spec_of(*receiver);
@@ -114,7 +114,7 @@ maps::literal::validate_assignable_to(database& db, const sstring& keyspace, con
}
assignment_testable::test_result
maps::literal::test_assignment(database& db, const sstring& keyspace, ::shared_ptr<column_specification> receiver) const {
maps::literal::test_assignment(database& db, const sstring& keyspace, lw_shared_ptr<column_specification> receiver) const {
if (!dynamic_pointer_cast<const map_type_impl>(receiver->type)) {
return assignment_testable::test_result::NOT_ASSIGNABLE;
}

View File

@@ -56,8 +56,8 @@ class maps {
private:
maps() = delete;
public:
static shared_ptr<column_specification> key_spec_of(const column_specification& column);
static shared_ptr<column_specification> value_spec_of(const column_specification& column);
static lw_shared_ptr<column_specification> key_spec_of(const column_specification& column);
static lw_shared_ptr<column_specification> value_spec_of(const column_specification& column);
class literal : public term::raw {
public:
@@ -66,11 +66,11 @@ public:
literal(const std::vector<std::pair<::shared_ptr<term::raw>, ::shared_ptr<term::raw>>>& entries_)
: entries{entries_}
{ }
virtual ::shared_ptr<term> prepare(database& db, const sstring& keyspace, ::shared_ptr<column_specification> receiver) const override;
virtual ::shared_ptr<term> prepare(database& db, const sstring& keyspace, lw_shared_ptr<column_specification> receiver) const override;
private:
void validate_assignable_to(database& db, const sstring& keyspace, const column_specification& receiver) const;
public:
virtual assignment_testable::test_result test_assignment(database& db, const sstring& keyspace, ::shared_ptr<column_specification> receiver) const override;
virtual assignment_testable::test_result test_assignment(database& db, const sstring& keyspace, lw_shared_ptr<column_specification> receiver) const override;
virtual sstring to_string() const override;
};
@@ -104,7 +104,7 @@ public:
class marker : public abstract_marker {
public:
marker(int32_t bind_index, ::shared_ptr<column_specification> receiver)
marker(int32_t bind_index, lw_shared_ptr<column_specification> receiver)
: abstract_marker{bind_index, std::move(receiver)}
{ }
virtual ::shared_ptr<terminal> bind(const query_options& options) override;

View File

@@ -140,7 +140,7 @@ protected:
virtual shared_ptr<restrictions::restriction> new_EQ_restriction(database& db, schema_ptr schema,
variable_specifications& bound_names) override {
auto rs = receivers(db, *schema);
std::vector<::shared_ptr<column_specification>> col_specs(rs.size());
std::vector<lw_shared_ptr<column_specification>> col_specs(rs.size());
std::transform(rs.begin(), rs.end(), col_specs.begin(), [] (auto cs) {
return cs->column_specification;
});
@@ -151,7 +151,7 @@ protected:
virtual shared_ptr<restrictions::restriction> new_IN_restriction(database& db, schema_ptr schema,
variable_specifications& bound_names) override {
auto rs = receivers(db, *schema);
std::vector<::shared_ptr<column_specification>> col_specs(rs.size());
std::vector<lw_shared_ptr<column_specification>> col_specs(rs.size());
std::transform(rs.begin(), rs.end(), col_specs.begin(), [] (auto cs) {
return cs->column_specification;
});
@@ -175,7 +175,7 @@ protected:
variable_specifications& bound_names,
statements::bound bound, bool inclusive) override {
auto rs = receivers(db, *schema);
std::vector<::shared_ptr<column_specification>> col_specs(rs.size());
std::vector<lw_shared_ptr<column_specification>> col_specs(rs.size());
std::transform(rs.begin(), rs.end(), col_specs.begin(), [] (auto cs) {
return cs->column_specification;
});
@@ -200,7 +200,7 @@ protected:
return ::make_shared(multi_column_relation(std::move(new_entities), _relation_type, _values_or_marker, _in_values, _in_marker));
}
virtual shared_ptr<term> to_term(const std::vector<shared_ptr<column_specification>>& receivers,
virtual shared_ptr<term> to_term(const std::vector<lw_shared_ptr<column_specification>>& receivers,
const term::raw& raw, database& db, const sstring& keyspace,
variable_specifications& bound_names) const override {
const auto& as_multi_column_raw = dynamic_cast<const term::multi_column_raw&>(raw);

View File

@@ -216,7 +216,7 @@ operation::subtraction::prepare(database& db, const sstring& keyspace, const col
} else if (ctype->get_kind() == abstract_type::kind::map) {
auto&& mtype = dynamic_pointer_cast<const map_type_impl>(ctype);
// The value for a map subtraction is actually a set
auto&& vr = ::make_shared<column_specification>(
auto&& vr = make_lw_shared<column_specification>(
receiver.column_specification->ks_name,
receiver.column_specification->cf_name,
receiver.column_specification->name,
@@ -294,7 +294,7 @@ operation::set_counter_value_from_tuple_list::prepare(database& db, const sstrin
// We need to fake a column of list<tuple<...>> to prepare the value term
auto & os = receiver.column_specification;
auto spec = ::make_shared<cql3::column_specification>(os->ks_name, os->cf_name, os->name, counter_tuple_list_type);
auto spec = make_lw_shared<cql3::column_specification>(os->ks_name, os->cf_name, os->name, counter_tuple_list_type);
auto v = _value->prepare(db, keyspace, spec);
// Will not be used elsewhere, so make it local.

View File

@@ -189,7 +189,7 @@ bytes_view query_options::linearize(fragmented_temporary_buffer::view view) cons
}
}
void query_options::prepare(const std::vector<::shared_ptr<column_specification>>& specs)
void query_options::prepare(const std::vector<lw_shared_ptr<column_specification>>& specs)
{
if (!_names) {
return;

View File

@@ -245,7 +245,7 @@ public:
return _cql_config;
}
void prepare(const std::vector<::shared_ptr<column_specification>>& specs);
void prepare(const std::vector<lw_shared_ptr<column_specification>>& specs);
private:
void fill_value_views();
};

View File

@@ -510,7 +510,7 @@ query_processor::execute_prepared(
if (needs_authorization) {
fut = statement->check_access(_proxy, query_state.get_client_state()).then([this, &query_state, prepared = std::move(prepared), cache_key = std::move(cache_key)] () mutable {
return _authorized_prepared_cache.insert(*query_state.get_client_state().user(), std::move(cache_key), std::move(prepared)).handle_exception([this] (auto eptr) {
log.error("failed to cache the entry", eptr);
log.error("failed to cache the entry: {}", eptr);
});
});
}
@@ -607,10 +607,10 @@ prepared_cache_key_type query_processor::compute_thrift_id(
std::unique_ptr<prepared_statement>
query_processor::get_statement(const sstring_view& query, const service::client_state& client_state) {
::shared_ptr<raw::parsed_statement> statement = parse_statement(query);
std::unique_ptr<raw::parsed_statement> statement = parse_statement(query);
// Set keyspace for statement that require login
auto cf_stmt = dynamic_pointer_cast<raw::cf_statement>(statement);
auto cf_stmt = dynamic_cast<raw::cf_statement*>(statement.get());
if (cf_stmt) {
cf_stmt->prepare_keyspace(client_state);
}
@@ -620,7 +620,7 @@ query_processor::get_statement(const sstring_view& query, const service::client_
return p;
}
::shared_ptr<raw::parsed_statement>
std::unique_ptr<raw::parsed_statement>
query_processor::parse_statement(const sstring_view& query) {
try {
auto statement = util::do_with_parser(query, std::mem_fn(&cql3_parser::CqlParser::query));
@@ -853,7 +853,7 @@ query_processor::execute_batch(
return batch->check_access(_proxy, query_state.get_client_state()).then([this, &query_state, &options, batch, pending_authorization_entries = std::move(pending_authorization_entries)] () mutable {
return parallel_for_each(pending_authorization_entries, [this, &query_state] (auto& e) {
return _authorized_prepared_cache.insert(*query_state.get_client_state().user(), e.first, std::move(e.second)).handle_exception([this] (auto eptr) {
log.error("failed to cache the entry", eptr);
log.error("failed to cache the entry: {}", eptr);
});
}).then([this, &query_state, &options, batch] {
batch->validate();

View File

@@ -147,7 +147,7 @@ public:
const std::string_view& query_string,
const sstring& keyspace);
static ::shared_ptr<statements::raw::parsed_statement> parse_statement(const std::string_view& query);
static std::unique_ptr<statements::raw::parsed_statement> parse_statement(const std::string_view& query);
query_processor(service::storage_proxy& proxy, database& db, service::migration_notifier& mn, memory_config mcfg, cql_config& cql_cfg);

View File

@@ -49,7 +49,7 @@ relation::to_column_definition(const schema& schema, const column_identifier::ra
auto id = entity.prepare_column_identifier(schema);
auto def = get_column_definition(schema, *id);
if (!def || def->is_hidden_from_cql()) {
throw exceptions::unrecognized_entity_exception(id, shared_from_this());
throw exceptions::unrecognized_entity_exception(*id, to_string());
}
return *def;
}

View File

@@ -249,7 +249,7 @@ protected:
* @return the <code>Term</code> corresponding to the specified <code>Raw</code>
* @throws InvalidRequestException if the <code>Raw</code> term is not valid
*/
virtual ::shared_ptr<term> to_term(const std::vector<::shared_ptr<column_specification>>& receivers,
virtual ::shared_ptr<term> to_term(const std::vector<lw_shared_ptr<column_specification>>& receivers,
const term::raw& raw,
database& db,
const sstring& keyspace,
@@ -265,7 +265,7 @@ protected:
* @return the <code>Term</code>s corresponding to the specified <code>Raw</code> terms
* @throws InvalidRequestException if the <code>Raw</code> terms are not valid
*/
std::vector<::shared_ptr<term>> to_terms(const std::vector<::shared_ptr<column_specification>>& receivers,
std::vector<::shared_ptr<term>> to_terms(const std::vector<lw_shared_ptr<column_specification>>& receivers,
const std::vector<::shared_ptr<term::raw>>& raws,
database& db,
const sstring& keyspace,

View File

@@ -417,7 +417,7 @@ std::vector<const column_definition*> statement_restrictions::get_column_defs_fo
_clustering_columns_restrictions->num_prefix_columns_that_need_not_be_filtered();
for (auto&& cdef : _clustering_columns_restrictions->get_column_defs()) {
::shared_ptr<single_column_restriction> restr;
if (single_pk_restrs) {
if (single_ck_restrs) {
auto it = single_ck_restrs->restrictions().find(cdef);
if (it != single_ck_restrs->restrictions().end()) {
restr = dynamic_pointer_cast<single_column_restriction>(it->second);
@@ -624,9 +624,6 @@ bool single_column_restriction::EQ::is_satisfied_by(const schema& schema,
const row& cells,
const query_options& options,
gc_clock::time_point now) const {
if (_column_def.type->is_counter()) {
fail(unimplemented::cause::COUNTERS);
}
auto operand = value(options);
if (operand) {
auto cell_value = get_value(schema, key, ckey, cells, now);
@@ -641,11 +638,11 @@ bool single_column_restriction::EQ::is_satisfied_by(const schema& schema,
}
bool single_column_restriction::EQ::is_satisfied_by(bytes_view data, const query_options& options) const {
if (_column_def.type->is_counter()) {
fail(unimplemented::cause::COUNTERS);
}
auto operand = value(options);
return operand && _column_def.type->compare(*operand, data) == 0;
if (!operand) {
throw exceptions::invalid_request_exception(format("Invalid null value for {}", _column_def.name_as_text()));
}
return _column_def.type->compare(*operand, data) == 0;
}
bool single_column_restriction::IN::is_satisfied_by(const schema& schema,
@@ -654,9 +651,6 @@ bool single_column_restriction::IN::is_satisfied_by(const schema& schema,
const row& cells,
const query_options& options,
gc_clock::time_point now) const {
if (_column_def.type->is_counter()) {
fail(unimplemented::cause::COUNTERS);
}
auto cell_value = get_value(schema, key, ckey, cells, now);
if (!cell_value) {
return false;
@@ -670,16 +664,13 @@ bool single_column_restriction::IN::is_satisfied_by(const schema& schema,
}
bool single_column_restriction::IN::is_satisfied_by(bytes_view data, const query_options& options) const {
if (_column_def.type->is_counter()) {
fail(unimplemented::cause::COUNTERS);
}
auto operands = values(options);
return boost::algorithm::any_of(operands, [this, &data] (const bytes_opt& operand) {
return operand && _column_def.type->compare(*operand, data) == 0;
});
}
static query::range<bytes_view> to_range(const term_slice& slice, const query_options& options) {
static query::range<bytes_view> to_range(const term_slice& slice, const query_options& options, const sstring& name) {
using range_type = query::range<bytes_view>;
auto extract_bound = [&] (statements::bound bound) -> std::optional<range_type::bound> {
if (!slice.has_bound(bound)) {
@@ -687,7 +678,7 @@ static query::range<bytes_view> to_range(const term_slice& slice, const query_op
}
auto value = slice.bound(bound)->bind_and_get(options);
if (!value) {
return { };
throw exceptions::invalid_request_exception(format("Invalid null bound for {}", name));
}
auto value_view = options.linearize(*value);
return { range_type::bound(value_view, slice.is_inclusive(bound)) };
@@ -697,6 +688,11 @@ static query::range<bytes_view> to_range(const term_slice& slice, const query_op
extract_bound(statements::bound::END));
}
/// Tests whether `value` falls inside `range` under the ordering `cmp`,
/// treating a wrap-around range as matching nothing.
///
/// @param range the range to test against
/// @param value the value being looked up
/// @param cmp   tri-comparator used both for the wrap-around check and for containment
/// @return false if `range.is_wrap_around(cmp)`; otherwise `range.contains(value, cmp)`
static bool contains_without_wraparound(
        const query::range<bytes_view>& range, bytes_view value, const serialized_tri_compare& cmp) {
    // A wrap-around range is rejected up front, so contains() is never
    // consulted for it.
    if (range.is_wrap_around(cmp)) {
        return false;
    }
    return range.contains(value, cmp);
}
bool single_column_restriction::slice::is_satisfied_by(const schema& schema,
const partition_key& key,
const clustering_key_prefix& ckey,
@@ -711,15 +707,14 @@ bool single_column_restriction::slice::is_satisfied_by(const schema& schema,
return false;
}
return cell_value->with_linearized([&] (bytes_view cell_value_bv) {
return to_range(_slice, options).contains(cell_value_bv, _column_def.type->as_tri_comparator());
return contains_without_wraparound(to_range(_slice, options, _column_def.name_as_text()),
cell_value_bv, _column_def.type->as_tri_comparator());
});
}
bool single_column_restriction::slice::is_satisfied_by(bytes_view data, const query_options& options) const {
if (_column_def.type->is_counter()) {
fail(unimplemented::cause::COUNTERS);
}
return to_range(_slice, options).contains(data, _column_def.type->underlying_type()->as_tri_comparator());
return contains_without_wraparound(to_range(_slice, options, _column_def.name_as_text()),
data, _column_def.type->underlying_type()->as_tri_comparator());
}
bool single_column_restriction::contains::is_satisfied_by(const schema& schema,
@@ -728,9 +723,6 @@ bool single_column_restriction::contains::is_satisfied_by(const schema& schema,
const row& cells,
const query_options& options,
gc_clock::time_point now) const {
if (_column_def.type->is_counter()) {
fail(unimplemented::cause::COUNTERS);
}
if (!_column_def.type->is_collection()) {
return false;
}
@@ -881,7 +873,9 @@ bool single_column_restriction::contains::is_satisfied_by(bytes_view collection_
auto map_key = _entry_keys[i]->bind_and_get(options);
auto map_value = _entry_values[i]->bind_and_get(options);
if (!map_key || !map_value) {
continue;
throw exceptions::invalid_request_exception(
format("Unsupported null map {} for column {}",
map_key ? "key" : "value", _column_def.name_as_text()));
}
auto found = with_linearized(*map_key, [&] (bytes_view map_key_bv) {
return std::find_if(data_map.begin(), data_map.end(), [&] (auto&& element) {
@@ -929,7 +923,7 @@ bool token_restriction::slice::is_satisfied_by(const schema& schema,
const query_options& options,
gc_clock::time_point now) const {
bool satisfied = false;
auto range = to_range(_slice, options);
auto range = to_range(_slice, options, "token");
for (auto* cdef : _column_definitions) {
auto cell_value = do_get_value(schema, *cdef, key, ckey, cells, now);
if (!cell_value) {

View File

@@ -43,12 +43,12 @@
namespace cql3 {
metadata::metadata(std::vector<::shared_ptr<column_specification>> names_)
metadata::metadata(std::vector<lw_shared_ptr<column_specification>> names_)
: _flags(flag_enum_set())
, _column_info(make_lw_shared<column_info>(std::move(names_)))
{ }
metadata::metadata(flag_enum_set flags, std::vector<::shared_ptr<column_specification>> names_, uint32_t column_count,
metadata::metadata(flag_enum_set flags, std::vector<lw_shared_ptr<column_specification>> names_, uint32_t column_count,
lw_shared_ptr<const service::pager::paging_state> paging_state)
: _flags(flags)
, _column_info(make_lw_shared<column_info>(std::move(names_), column_count))
@@ -60,7 +60,7 @@ uint32_t metadata::value_count() const {
return _flags.contains<flag::NO_METADATA>() ? _column_info->_column_count : _column_info->_names.size();
}
void metadata::add_non_serialized_column(::shared_ptr<column_specification> name) {
void metadata::add_non_serialized_column(lw_shared_ptr<column_specification> name) {
// See comment above. Because columnCount doesn't account the newly added name, it
// won't be serialized.
_column_info->_names.emplace_back(std::move(name));
@@ -101,7 +101,7 @@ lw_shared_ptr<const service::pager::paging_state> metadata::paging_state() const
return _paging_state;
}
prepared_metadata::prepared_metadata(const std::vector<::shared_ptr<column_specification>>& names,
prepared_metadata::prepared_metadata(const std::vector<lw_shared_ptr<column_specification>>& names,
const std::vector<uint16_t>& partition_key_bind_indices)
: _names{names}
, _partition_key_bind_indices{partition_key_bind_indices}
@@ -115,7 +115,7 @@ prepared_metadata::flag_enum_set prepared_metadata::flags() const {
return _flags;
}
const std::vector<::shared_ptr<column_specification>>& prepared_metadata::names() const {
const std::vector<lw_shared_ptr<column_specification>>& prepared_metadata::names() const {
return _names;
}
@@ -123,7 +123,7 @@ const std::vector<uint16_t>& prepared_metadata::partition_key_bind_indices() con
return _partition_key_bind_indices;
}
result_set::result_set(std::vector<::shared_ptr<column_specification>> metadata_)
result_set::result_set(std::vector<lw_shared_ptr<column_specification>> metadata_)
: _metadata(::make_shared<metadata>(std::move(metadata_)))
{ }
@@ -179,7 +179,7 @@ const std::deque<std::vector<bytes_opt>>& result_set::rows() const {
shared_ptr<const cql3::metadata>
make_empty_metadata() {
static thread_local shared_ptr<const metadata> empty_metadata_cache = [] {
auto result = ::make_shared<metadata>(std::vector<::shared_ptr<cql3::column_specification>>{});
auto result = ::make_shared<metadata>(std::vector<lw_shared_ptr<cql3::column_specification>>{});
result->set_skip_metadata();
return result;
}();

View File

@@ -74,15 +74,15 @@ public:
// used to include columns in the resultSet that we need to do post-query re-orderings
// (SelectStatement.orderResults) but that shouldn't be sent to the user as they haven't been requested
// (CASSANDRA-4911). So the serialization code will exclude any columns in name whose index is >= columnCount.
std::vector<::shared_ptr<column_specification>> _names;
std::vector<lw_shared_ptr<column_specification>> _names;
uint32_t _column_count;
column_info(std::vector<::shared_ptr<column_specification>> names, uint32_t column_count)
column_info(std::vector<lw_shared_ptr<column_specification>> names, uint32_t column_count)
: _names(std::move(names))
, _column_count(column_count)
{ }
explicit column_info(std::vector<::shared_ptr<column_specification>> names)
explicit column_info(std::vector<lw_shared_ptr<column_specification>> names)
: _names(std::move(names))
, _column_count(_names.size())
{ }
@@ -95,15 +95,15 @@ private:
lw_shared_ptr<const service::pager::paging_state> _paging_state;
public:
metadata(std::vector<::shared_ptr<column_specification>> names_);
metadata(std::vector<lw_shared_ptr<column_specification>> names_);
metadata(flag_enum_set flags, std::vector<::shared_ptr<column_specification>> names_, uint32_t column_count,
metadata(flag_enum_set flags, std::vector<lw_shared_ptr<column_specification>> names_, uint32_t column_count,
lw_shared_ptr<const service::pager::paging_state> paging_state);
// The maximum number of values that the ResultSet can hold. This can be bigger than columnCount due to CASSANDRA-4911
uint32_t value_count() const;
void add_non_serialized_column(::shared_ptr<column_specification> name);
void add_non_serialized_column(lw_shared_ptr<column_specification> name);
private:
bool all_in_same_cf() const;
@@ -120,7 +120,7 @@ public:
lw_shared_ptr<const service::pager::paging_state> paging_state() const;
const std::vector<::shared_ptr<column_specification>>& get_names() const {
const std::vector<lw_shared_ptr<column_specification>>& get_names() const {
return _column_info->_names;
}
};
@@ -139,14 +139,14 @@ public:
using flag_enum_set = enum_set<flag_enum>;
private:
flag_enum_set _flags;
std::vector<::shared_ptr<column_specification>> _names;
std::vector<lw_shared_ptr<column_specification>> _names;
std::vector<uint16_t> _partition_key_bind_indices;
public:
prepared_metadata(const std::vector<::shared_ptr<column_specification>>& names,
prepared_metadata(const std::vector<lw_shared_ptr<column_specification>>& names,
const std::vector<uint16_t>& partition_key_bind_indices);
flag_enum_set flags() const;
const std::vector<::shared_ptr<column_specification>>& names() const;
const std::vector<lw_shared_ptr<column_specification>>& names() const;
const std::vector<uint16_t>& partition_key_bind_indices() const;
};
@@ -167,7 +167,7 @@ class result_set {
friend class result;
public:
result_set(std::vector<::shared_ptr<column_specification>> metadata_);
result_set(std::vector<lw_shared_ptr<column_specification>> metadata_);
result_set(::shared_ptr<metadata> metadata);

View File

@@ -56,7 +56,7 @@ namespace selection {
selection::selection(schema_ptr schema,
std::vector<const column_definition*> columns,
std::vector<::shared_ptr<column_specification>> metadata_,
std::vector<lw_shared_ptr<column_specification>> metadata_,
bool collect_timestamps,
bool collect_TTLs,
trivial is_trivial)
@@ -92,7 +92,7 @@ private:
const bool _is_wildcard;
public:
static ::shared_ptr<simple_selection> make(schema_ptr schema, std::vector<const column_definition*> columns, bool is_wildcard) {
std::vector<::shared_ptr<column_specification>> metadata;
std::vector<lw_shared_ptr<column_specification>> metadata;
metadata.reserve(columns.size());
for (auto&& col : columns) {
metadata.emplace_back(col->column_specification);
@@ -106,7 +106,7 @@ public:
* get much duplicate in practice, it's more efficient not to bother.
*/
simple_selection(schema_ptr schema, std::vector<const column_definition*> columns,
std::vector<::shared_ptr<column_specification>> metadata, bool is_wildcard)
std::vector<lw_shared_ptr<column_specification>> metadata, bool is_wildcard)
: selection(schema, std::move(columns), std::move(metadata), false, false, trivial::yes)
, _is_wildcard(is_wildcard)
{ }
@@ -155,7 +155,7 @@ private:
::shared_ptr<selector_factories> _factories;
public:
selection_with_processing(schema_ptr schema, std::vector<const column_definition*> columns,
std::vector<::shared_ptr<column_specification>> metadata, ::shared_ptr<selector_factories> factories)
std::vector<lw_shared_ptr<column_specification>> metadata, ::shared_ptr<selector_factories> factories)
: selection(schema, std::move(columns), std::move(metadata),
factories->contains_write_time_selector_factory(),
factories->contains_ttl_selector_factory())
@@ -264,14 +264,14 @@ uint32_t selection::add_column_for_post_processing(const column_definition& c) {
}
}
std::vector<::shared_ptr<column_specification>>
std::vector<lw_shared_ptr<column_specification>>
selection::collect_metadata(const schema& schema, const std::vector<::shared_ptr<raw_selector>>& raw_selectors,
const selector_factories& factories) {
std::vector<::shared_ptr<column_specification>> r;
std::vector<lw_shared_ptr<column_specification>> r;
r.reserve(raw_selectors.size());
auto i = raw_selectors.begin();
for (auto&& factory : factories) {
::shared_ptr<column_specification> col_spec = factory->get_column_specification(schema);
lw_shared_ptr<column_specification> col_spec = factory->get_column_specification(schema);
::shared_ptr<column_identifier> alias = (*i++)->alias;
r.push_back(alias ? col_spec->with_alias(alias) : col_spec);
}

View File

@@ -99,7 +99,7 @@ protected:
selection(schema_ptr schema,
std::vector<const column_definition*> columns,
std::vector<::shared_ptr<column_specification>> metadata_,
std::vector<lw_shared_ptr<column_specification>> metadata_,
bool collect_timestamps,
bool collect_TTLs, trivial is_trivial = trivial::no);
@@ -197,7 +197,7 @@ private:
[] (auto&& s) { return s->processes_selection(); });
}
static std::vector<::shared_ptr<column_specification>> collect_metadata(const schema& schema,
static std::vector<lw_shared_ptr<column_specification>> collect_metadata(const schema& schema,
const std::vector<::shared_ptr<raw_selector>>& raw_selectors, const selector_factories& factories);
public:
static ::shared_ptr<selection> from_selectors(database& db, schema_ptr schema, const std::vector<::shared_ptr<raw_selector>>& raw_selectors);
@@ -268,7 +268,7 @@ public:
if (_selectors->requires_thread()) {
return async(std::move(func));
} else {
return futurize_apply(std::move(func));
return futurize_invoke(std::move(func));
}
}

View File

@@ -26,9 +26,9 @@ namespace cql3 {
namespace selection {
::shared_ptr<column_specification>
lw_shared_ptr<column_specification>
selector::factory::get_column_specification(const schema& schema) const {
return ::make_shared<column_specification>(schema.ks_name(),
return make_lw_shared<column_specification>(schema.ks_name(),
schema.cf_name(),
::make_shared<column_identifier>(column_name(), true),
get_return_type());

View File

@@ -107,7 +107,7 @@ public:
*/
virtual void reset() = 0;
virtual assignment_testable::test_result test_assignment(database& db, const sstring& keyspace, ::shared_ptr<column_specification> receiver) const override {
virtual assignment_testable::test_result test_assignment(database& db, const sstring& keyspace, lw_shared_ptr<column_specification> receiver) const override {
auto t1 = receiver->type->underlying_type();
auto t2 = get_type()->underlying_type();
// We want columns of `counter_type' to be served by underlying type's overloads
@@ -142,7 +142,7 @@ public:
* @param schema the column family schema
* @return a column specification
*/
::shared_ptr<column_specification> get_column_specification(const schema& schema) const;
lw_shared_ptr<column_specification> get_column_specification(const schema& schema) const;
/**
* Creates a new <code>selector</code> instance.

View File

@@ -27,15 +27,15 @@
namespace cql3 {
shared_ptr<column_specification>
lw_shared_ptr<column_specification>
sets::value_spec_of(const column_specification& column) {
return ::make_shared<column_specification>(column.ks_name, column.cf_name,
return make_lw_shared<column_specification>(column.ks_name, column.cf_name,
::make_shared<column_identifier>(format("value({})", *column.name), true),
dynamic_pointer_cast<const set_type_impl>(column.type)->get_elements_type());
}
shared_ptr<term>
sets::literal::prepare(database& db, const sstring& keyspace, shared_ptr<column_specification> receiver) const {
sets::literal::prepare(database& db, const sstring& keyspace, lw_shared_ptr<column_specification> receiver) const {
validate_assignable_to(db, keyspace, *receiver);
if (_elements.empty()) {
@@ -105,7 +105,7 @@ sets::literal::validate_assignable_to(database& db, const sstring& keyspace, con
}
assignment_testable::test_result
sets::literal::test_assignment(database& db, const sstring& keyspace, shared_ptr<column_specification> receiver) const {
sets::literal::test_assignment(database& db, const sstring& keyspace, lw_shared_ptr<column_specification> receiver) const {
if (!dynamic_pointer_cast<const set_type_impl>(receiver->type)) {
// We've parsed empty maps as a set literal to break the ambiguity so handle that case now
if (dynamic_pointer_cast<const map_type_impl>(receiver->type) && _elements.empty()) {
@@ -224,7 +224,7 @@ sets::delayed_value::bind(const query_options& options) {
}
sets::marker::marker(int32_t bind_index, ::shared_ptr<column_specification> receiver)
sets::marker::marker(int32_t bind_index, lw_shared_ptr<column_specification> receiver)
: abstract_marker{bind_index, std::move(receiver)} {
assert(dynamic_cast<const set_type_impl*>(_receiver->type.get()));
}

View File

@@ -56,7 +56,7 @@ namespace cql3 {
class sets {
sets() = delete;
public:
static shared_ptr<column_specification> value_spec_of(const column_specification& column);
static lw_shared_ptr<column_specification> value_spec_of(const column_specification& column);
class literal : public term::raw {
std::vector<shared_ptr<term::raw>> _elements;
@@ -64,10 +64,10 @@ public:
explicit literal(std::vector<shared_ptr<term::raw>> elements)
: _elements(std::move(elements)) {
}
virtual shared_ptr<term> prepare(database& db, const sstring& keyspace, shared_ptr<column_specification> receiver) const override;
virtual shared_ptr<term> prepare(database& db, const sstring& keyspace, lw_shared_ptr<column_specification> receiver) const override;
void validate_assignable_to(database& db, const sstring& keyspace, const column_specification& receiver) const;
assignment_testable::test_result
test_assignment(database& db, const sstring& keyspace, shared_ptr<column_specification> receiver) const;
test_assignment(database& db, const sstring& keyspace, lw_shared_ptr<column_specification> receiver) const;
virtual sstring to_string() const override;
};
@@ -100,7 +100,7 @@ public:
class marker : public abstract_marker {
public:
marker(int32_t bind_index, ::shared_ptr<column_specification> receiver);
marker(int32_t bind_index, lw_shared_ptr<column_specification> receiver);
virtual ::shared_ptr<terminal> bind(const query_options& options) override;
};

View File

@@ -53,7 +53,7 @@ using namespace cql3::restrictions;
namespace cql3 {
::shared_ptr<term>
single_column_relation::to_term(const std::vector<::shared_ptr<column_specification>>& receivers,
single_column_relation::to_term(const std::vector<lw_shared_ptr<column_specification>>& receivers,
const term::raw& raw,
database& db,
const sstring& keyspace,
@@ -107,7 +107,7 @@ single_column_relation::new_LIKE_restriction(
return ::make_shared<single_column_restriction::LIKE>(column_def, std::move(term));
}
std::vector<::shared_ptr<column_specification>>
std::vector<lw_shared_ptr<column_specification>>
single_column_relation::to_receivers(const schema& schema, const column_definition& column_def) const
{
using namespace statements::request_validations;

View File

@@ -117,7 +117,7 @@ public:
}
protected:
virtual ::shared_ptr<term> to_term(const std::vector<::shared_ptr<column_specification>>& receivers,
virtual ::shared_ptr<term> to_term(const std::vector<lw_shared_ptr<column_specification>>& receivers,
const term::raw& raw, database& db, const sstring& keyspace,
variable_specifications& bound_names) const override;
@@ -202,9 +202,9 @@ private:
* @return the receivers for the specified relation.
* @throws exceptions::invalid_request_exception if the relation is invalid
*/
std::vector<::shared_ptr<column_specification>> to_receivers(const schema& schema, const column_definition& column_def) const;
std::vector<lw_shared_ptr<column_specification>> to_receivers(const schema& schema, const column_definition& column_def) const;
static shared_ptr<column_specification> make_collection_receiver(shared_ptr<column_specification> receiver, bool for_key) {
static lw_shared_ptr<column_specification> make_collection_receiver(lw_shared_ptr<column_specification> receiver, bool for_key) {
return static_cast<const collection_type_impl*>(receiver->type.get())->make_collection_receiver(*receiver, for_key);
}

View File

@@ -62,6 +62,8 @@ public:
, _options(std::move(options)) {
}
std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override;
void validate(service::storage_proxy&, const service::client_state&) const override;
virtual future<> check_access(service::storage_proxy& proxy, const service::client_state&) const override;

View File

@@ -207,6 +207,9 @@ void alter_table_statement::add_column(const schema& schema, const table& cf, sc
"because a collection with the same name and a different type has already been used in the past", column_name));
}
}
if (type->is_counter() && !schema.is_counter()) {
throw exceptions::configuration_exception(format("Cannot add a counter column ({}) in a non counter column family", column_name));
}
cfm.with_column(column_name.name(), type, is_static ? column_kind::static_column : column_kind::regular_column);
@@ -222,7 +225,7 @@ void alter_table_statement::add_column(const schema& schema, const table& cf, sc
schema_builder builder(view);
if (view->view_info()->include_all_columns()) {
builder.with_column(column_name.name(), type);
} else if (view->view_info()->base_non_pk_columns_in_view_pk().empty()) {
} else if (!view->view_info()->has_base_non_pk_columns_in_view_pk()) {
db::view::create_virtual_column(builder, column_name.name(), type);
}
view_updates.push_back(view_ptr(builder.build()));

View File

@@ -46,11 +46,6 @@ uint32_t cql3::statements::authentication_statement::get_bound_terms() const {
return 0;
}
std::unique_ptr<cql3::statements::prepared_statement> cql3::statements::authentication_statement::prepare(
database& db, cql_stats& stats) {
return std::make_unique<prepared_statement>(this->shared_from_this());
}
bool cql3::statements::authentication_statement::uses_function(
const sstring& ks_name, const sstring& function_name) const {
return parsed_statement::uses_function(ks_name, function_name);

View File

@@ -50,14 +50,12 @@ namespace cql3 {
namespace statements {
class authentication_statement : public raw::parsed_statement, public cql_statement_no_metadata, public ::enable_shared_from_this<authentication_statement> {
class authentication_statement : public raw::parsed_statement, public cql_statement_no_metadata {
public:
authentication_statement() : cql_statement_no_metadata(&timeout_config::other_timeout) {}
uint32_t get_bound_terms() const override;
std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override;
bool uses_function(const sstring& ks_name, const sstring& function_name) const override;
bool depends_on_keyspace(const sstring& ks_name) const override;

View File

@@ -46,11 +46,6 @@ uint32_t cql3::statements::authorization_statement::get_bound_terms() const {
return 0;
}
std::unique_ptr<cql3::statements::prepared_statement> cql3::statements::authorization_statement::prepare(
database& db, cql_stats& stats) {
return std::make_unique<prepared_statement>(this->shared_from_this());
}
bool cql3::statements::authorization_statement::uses_function(
const sstring& ks_name, const sstring& function_name) const {
return parsed_statement::uses_function(ks_name, function_name);

View File

@@ -54,14 +54,12 @@ namespace cql3 {
namespace statements {
class authorization_statement : public raw::parsed_statement, public cql_statement_no_metadata, public ::enable_shared_from_this<authorization_statement> {
class authorization_statement : public raw::parsed_statement, public cql_statement_no_metadata {
public:
authorization_statement() : cql_statement_no_metadata(&timeout_config::other_timeout) {}
uint32_t get_bound_terms() const override;
std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override;
bool uses_function(const sstring& ks_name, const sstring& function_name) const override;
bool depends_on_keyspace(const sstring& ks_name) const override;

View File

@@ -68,6 +68,7 @@ batch_statement::batch_statement(int bound_terms, type type_,
, _has_conditions(boost::algorithm::any_of(_statements, [] (auto&& s) { return s.statement->has_conditions(); }))
, _stats(stats)
{
validate();
if (has_conditions()) {
// A batch can be created not only by raw::batch_statement::prepare, but also by
// cql_server::connection::process_batch, which doesn't call any methods of
@@ -340,7 +341,7 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::exe
const query_options& options,
service::query_state& qs) const {
auto cl_for_commit = options.get_consistency();
auto cl_for_learn = options.get_consistency();
auto cl_for_paxos = options.check_serial_consistency();
seastar::shared_ptr<cas_request> request;
schema_ptr schema;
@@ -378,7 +379,7 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::exe
}
auto shard = service::storage_proxy::cas_shard(*_statements[0].statement->s, request->key()[0].start()->value().as_decorated_key().token());
if (shard != engine().cpu_id()) {
if (shard != this_shard_id()) {
proxy.get_stats().replica_cross_shard_ops++;
return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(
make_shared<cql_transport::messages::result_message::bounce_to_shard>(shard));
@@ -386,7 +387,7 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::exe
return proxy.cas(schema, request, request->read_command(), request->key(),
{read_timeout, qs.get_permit(), qs.get_client_state(), qs.get_trace_state()},
cl_for_paxos, cl_for_commit, batch_timeout, cas_timeout).then([this, request] (bool is_applied) {
cl_for_paxos, cl_for_learn, batch_timeout, cas_timeout).then([this, request] (bool is_applied) {
return modification_statement::build_cas_result_set(_metadata, _columns_of_cas_result_set, is_applied, request->rows());
});
}
@@ -400,9 +401,9 @@ void batch_statement::build_cas_result_set_metadata() {
_columns_of_cas_result_set.resize(schema.all_columns_count());
// Add the mandatory [applied] column to result set metadata
std::vector<shared_ptr<column_specification>> columns;
std::vector<lw_shared_ptr<column_specification>> columns;
auto applied = ::make_shared<cql3::column_specification>(schema.ks_name(), schema.cf_name(),
auto applied = make_lw_shared<cql3::column_specification>(schema.ks_name(), schema.cf_name(),
::make_shared<cql3::column_identifier>("[applied]", false), boolean_type);
columns.push_back(applied);
@@ -448,7 +449,6 @@ batch_statement::prepare(database& db, cql_stats& stats) {
prep_attrs->collect_marker_specification(bound_names);
cql3::statements::batch_statement batch_statement_(bound_names.size(), _type, std::move(statements), std::move(prep_attrs), stats);
batch_statement_.validate();
std::vector<uint16_t> partition_key_bind_indices;
if (!have_multiple_cfs && batch_statement_.get_statements().size() > 0) {

View File

@@ -175,9 +175,9 @@ bool cas_request::applies_to() const {
return applies;
}
std::optional<mutation> cas_request::apply(query::result& qr,
std::optional<mutation> cas_request::apply(foreign_ptr<lw_shared_ptr<query::result>> qr,
const query::partition_slice& slice, api::timestamp_type ts) {
_rows = update_parameters::build_prefetch_data(_schema, qr, slice);
_rows = update_parameters::build_prefetch_data(_schema, *qr, slice);
if (applies_to()) {
return apply_updates(ts);
} else {

View File

@@ -95,7 +95,7 @@ public:
void add_row_update(const modification_statement& stmt_arg, std::vector<query::clustering_range> ranges_arg,
modification_statement::json_cache_opt json_cache_arg, const query_options& options_arg);
virtual std::optional<mutation> apply(query::result& qr,
virtual std::optional<mutation> apply(foreign_ptr<lw_shared_ptr<query::result>> qr,
const query::partition_slice& slice, api::timestamp_type ts) override;
private:

View File

@@ -255,7 +255,9 @@ void cf_prop_defs::apply_to_builder(schema_builder& builder, schema::extensions_
}
}
builder.set_default_time_to_live(gc_clock::duration(get_int(KW_DEFAULT_TIME_TO_LIVE, DEFAULT_DEFAULT_TIME_TO_LIVE)));
if (has_property(KW_DEFAULT_TIME_TO_LIVE)) {
builder.set_default_time_to_live(gc_clock::duration(get_int(KW_DEFAULT_TIME_TO_LIVE, DEFAULT_DEFAULT_TIME_TO_LIVE)));
}
if (has_property(KW_SPECULATIVE_RETRY)) {
builder.set_speculative_retry(get_string(KW_SPECULATIVE_RETRY, builder.get_speculative_retry().to_sstring()));

View File

@@ -54,10 +54,6 @@ namespace statements {
class create_role_statement final : public authentication_statement {
sstring _role;
bool _is_superuser;
bool _can_login;
role_options _options;
bool _if_not_exists;
@@ -70,6 +66,8 @@ public:
, _if_not_exists(if_not_exists) {
}
std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override;
future<> grant_permissions_to_creator(const service::client_state&) const;
void validate(service::storage_proxy&, const service::client_state&) const override;

View File

@@ -135,7 +135,7 @@ schema_ptr create_table_statement::get_cf_meta_data(const database& db) const {
void create_table_statement::apply_properties_to(schema_builder& builder, const database& db) const {
auto&& columns = get_columns();
for (auto&& column : columns) {
builder.with_column(column);
builder.with_column_ordered(column);
}
#if 0
cfmd.defaultValidator(defaultValidator)

View File

@@ -60,6 +60,8 @@ public:
drop_role_statement(const cql3::role_name& name, bool if_exists) : _role(name.to_string()), _if_exists(if_exists) {
}
std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override;
virtual void validate(service::storage_proxy&, const service::client_state&) const override;
virtual future<> check_access(service::storage_proxy& proxy, const service::client_state&) const override;

View File

@@ -61,6 +61,8 @@ public:
: _role(name.to_string()), _grantee(grantee.to_string()) {
}
std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override;
virtual future<> check_access(service::storage_proxy& proxy, const service::client_state&) const override;
virtual future<::shared_ptr<cql_transport::messages::result_message>>

View File

@@ -42,6 +42,11 @@
#include "grant_statement.hh"
#include "auth/authorizer.hh"
std::unique_ptr<cql3::statements::prepared_statement> cql3::statements::grant_statement::prepare(
database& db, cql_stats& stats) {
return std::make_unique<prepared_statement>(::make_shared<grant_statement>(*this));
}
future<::shared_ptr<cql_transport::messages::result_message>>
cql3::statements::grant_statement::execute(service::storage_proxy& proxy, service::query_state& state, const query_options& options) const {
auto& auth_service = *state.get_client_state().get_auth_service();

View File

@@ -51,6 +51,8 @@ class grant_statement : public permission_altering_statement {
public:
using permission_altering_statement::permission_altering_statement;
std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override;
future<::shared_ptr<cql_transport::messages::result_message>> execute(service::storage_proxy&
, service::query_state&
, const query_options&) const override;

View File

@@ -58,6 +58,11 @@ cql3::statements::list_permissions_statement::list_permissions_statement(
, _recursive(recursive) {
}
std::unique_ptr<cql3::statements::prepared_statement> cql3::statements::list_permissions_statement::prepare(
database& db, cql_stats& stats) {
return std::make_unique<prepared_statement>(::make_shared<list_permissions_statement>(*this));
}
void cql3::statements::list_permissions_statement::validate(
service::storage_proxy& proxy,
const service::client_state& state) const {
@@ -105,14 +110,14 @@ cql3::statements::list_permissions_statement::execute(
service::query_state& state,
const query_options& options) const {
static auto make_column = [](sstring name) {
return ::make_shared<column_specification>(
return make_lw_shared<column_specification>(
auth::meta::AUTH_KS,
"permissions",
::make_shared<column_identifier>(std::move(name), true),
utf8_type);
};
static thread_local const std::vector<::shared_ptr<column_specification>> metadata({
static thread_local const std::vector<lw_shared_ptr<column_specification>> metadata({
make_column("role"), make_column("username"), make_column("resource"), make_column("permission")
});

View File

@@ -61,6 +61,8 @@ private:
public:
list_permissions_statement(auth::permission_set, std::optional<auth::resource>, std::optional<sstring>, bool);
std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override;
void validate(service::storage_proxy&, const service::client_state&) const override;
future<> check_access(service::storage_proxy& proxy, const service::client_state&) const override;

View File

@@ -62,6 +62,8 @@ public:
list_roles_statement(const std::optional<role_name>& grantee, bool recursive)
: _grantee(grantee ? sstring(grantee->to_string()) : std::optional<sstring>()), _recursive(recursive) {}
std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override;
virtual future<> check_access(service::storage_proxy& proxy, const service::client_state&) const override;
virtual future<::shared_ptr<cql_transport::messages::result_message>>

View File

@@ -45,6 +45,11 @@
#include "auth/common.hh"
#include "transport/messages/result_message.hh"
std::unique_ptr<cql3::statements::prepared_statement> cql3::statements::list_users_statement::prepare(
database& db, cql_stats& stats) {
return std::make_unique<prepared_statement>(::make_shared<list_users_statement>(*this));
}
void cql3::statements::list_users_statement::validate(service::storage_proxy& proxy, const service::client_state& state) const {
}
@@ -58,7 +63,7 @@ cql3::statements::list_users_statement::execute(service::storage_proxy& proxy, s
static const sstring virtual_table_name("users");
static const auto make_column_spec = [](const sstring& name, const ::shared_ptr<const abstract_type>& ty) {
return ::make_shared<column_specification>(
return make_lw_shared<column_specification>(
auth::meta::AUTH_KS,
virtual_table_name,
::make_shared<column_identifier>(name, true),
@@ -66,7 +71,7 @@ cql3::statements::list_users_statement::execute(service::storage_proxy& proxy, s
};
static thread_local const auto metadata = ::make_shared<cql3::metadata>(
std::vector<::shared_ptr<column_specification>>{
std::vector<lw_shared_ptr<column_specification>>{
make_column_spec("name", utf8_type),
make_column_spec("super", boolean_type)});

View File

@@ -49,6 +49,9 @@ namespace statements {
class list_users_statement : public authentication_statement {
public:
std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override;
void validate(service::storage_proxy&, const service::client_state&) const override;
future<> check_access(service::storage_proxy& proxy, const service::client_state&) const override;
future<::shared_ptr<cql_transport::messages::result_message>> execute(service::storage_proxy&

View File

@@ -213,7 +213,7 @@ bool modification_statement::applies_to(const update_parameters::prefetch_data::
return row == nullptr;
}
auto condition_applies = [&row, &options](const shared_ptr<column_condition>& cond) {
auto condition_applies = [&row, &options](const lw_shared_ptr<column_condition>& cond) {
const data_value* value = nullptr;
if (row != nullptr) {
auto it = row->cells.find(cond->column.ordinal_id);
@@ -322,7 +322,7 @@ modification_statement::execute_without_condition(service::storage_proxy& proxy,
future<::shared_ptr<cql_transport::messages::result_message>>
modification_statement::execute_with_condition(service::storage_proxy& proxy, service::query_state& qs, const query_options& options) const {
auto cl_for_commit = options.get_consistency();
auto cl_for_learn = options.get_consistency();
auto cl_for_paxos = options.check_serial_consistency();
db::timeout_clock::time_point now = db::timeout_clock::now();
const timeout_config& cfg = options.get_timeout_config();
@@ -346,7 +346,7 @@ modification_statement::execute_with_condition(service::storage_proxy& proxy, se
request->add_row_update(*this, std::move(ranges), std::move(json_cache), options);
auto shard = service::storage_proxy::cas_shard(*s, request->key()[0].start()->value().as_decorated_key().token());
if (shard != engine().cpu_id()) {
if (shard != this_shard_id()) {
proxy.get_stats().replica_cross_shard_ops++;
return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(
make_shared<cql_transport::messages::result_message::bounce_to_shard>(shard));
@@ -354,7 +354,7 @@ modification_statement::execute_with_condition(service::storage_proxy& proxy, se
return proxy.cas(s, request, request->read_command(), request->key(),
{read_timeout, qs.get_permit(), qs.get_client_state(), qs.get_trace_state()},
cl_for_paxos, cl_for_commit, statement_timeout, cas_timeout).then([this, request] (bool is_applied) {
cl_for_paxos, cl_for_learn, statement_timeout, cas_timeout).then([this, request] (bool is_applied) {
return build_cas_result_set(_metadata, _columns_of_cas_result_set, is_applied, request->rows());
});
}
@@ -408,9 +408,9 @@ modification_statement::build_cas_result_set(seastar::shared_ptr<cql3::metadata>
void modification_statement::build_cas_result_set_metadata() {
std::vector<shared_ptr<column_specification>> columns;
std::vector<lw_shared_ptr<column_specification>> columns;
// Add the mandatory [applied] column to result set metadata
auto applied = seastar::make_shared<cql3::column_specification>(s->ks_name(), s->cf_name(),
auto applied = make_lw_shared<cql3::column_specification>(s->ks_name(), s->cf_name(),
make_shared<cql3::column_identifier>("[applied]", false), boolean_type);
columns.push_back(applied);
@@ -647,7 +647,7 @@ void modification_statement::inc_cql_stats(bool is_internal) const {
++_stats.query_cnt(src_sel, _ks_sel, cond_sel, type);
}
void modification_statement::add_condition(::shared_ptr<column_condition> cond) {
void modification_statement::add_condition(lw_shared_ptr<column_condition> cond) {
if (cond->column.is_static()) {
_has_static_column_conditions = true;
_static_conditions.emplace_back(std::move(cond));

Some files were not shown because too many files have changed in this diff Show More