This allows expressing lack of information about certain ranges of
rows (including the static row), which the cache will use to determine
whether the information it holds is complete (a sketch of such a
cache-side check follows the change list below).
Continuity is represented internally using flags on row entries. The
key range between two consecutive entries is continuous iff
rows_entry::continuous() is true for the later entry. The range
starting after the last entry is assumed to be continuous. The range
corresponding to an entry's own key is continuous iff
rows_entry::dummy() is false for that entry.
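
To illustrate the semantics, a minimal self-contained sketch (the
types below are hypothetical stand-ins for illustration, not the
actual Scylla API):

    #include <vector>

    struct entry {
        int key;         // stand-in for position_in_partition
        bool continuous; // the gap before this entry holds complete info
        bool dummy;      // true: the entry's own key carries no row data
    };

    // True iff all information in (start, end] is known to be complete.
    // "entries" must be sorted by key.
    bool range_is_continuous(const std::vector<entry>& entries,
                             int start, int end) {
        for (const auto& e : entries) {
            if (e.key <= start) {
                continue;
            }
            if (e.key > end) {
                // This entry's continuous flag describes the whole gap
                // before it, which includes (last in-range key, end].
                return e.continuous;
            }
            // An in-range entry must have a continuous gap before it,
            // and its own key is continuous only if it is not a dummy.
            if (!e.continuous || e.dummy) {
                return false;
            }
        }
        return true; // the range after the last entry is assumed continuous
    }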
[tgrabiec:
- based on the following commits:
4a5bf75 - Piotr Jastrzebski : mutation_partition: introduce dummy rows_entry
773070e - Piotr Jastrzebski : mutation_partition: add continuity flag to rows_entry
- documented that partition tombstone is always complete
- require specifying the partition tombstone when creating an incomplete entry
- replaced rows_entry(dummy_tag, ...) constructor with more general
rows_entry(position_in_partition, ...)
- documented continuity semantics on mutation_partition
- fixed _static_row_cached being lost by mutation_partition copy constructors
- fixed conversion to streamed_mutation to ignore dummy entries
- fixed mutation_partition serializer to drop dummy entries
- documented semantics of continuity on mutation_partition level
- dropped assumptions that dummy entries can be only at the last position
- changed equality to ignore continuity completely rather than
  partially (previously it ignored the continuity flag but not dummy
  entries)
- added printout of continuity information in mutation_partition
- fixed handling of empty entries in apply_reversibly() with regard
  to continuity; we can no longer remove empty entries before merging,
  since that may affect the continuity of the right-hand mutation.
  Added the _erased flag.
- fixed mutation_partition::clustered_row() with dummy==true to not ignore the key
- fixed partition_builder to not ignore continuity
- renamed dummy_tag_t to dummy_tag; the _t suffix is reserved.
- standardized all APIs on the is_dummy and is_continuous bool_class types
- replaced add_dummy_entry() with ensure_last_dummy() with safer semantics
- dropped unused remove_dummy_entry()
- simplified and inlined cache_entry::add_dummy_entry()
- fixed mutation_partition(incomplete_tag) constructor to mark all row ranges as discontinuous
]
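
Building on the sketch above, a hypothetical illustration of the
cache-side completeness check mentioned at the top (illustrative code,
not the actual cache implementation): if the requested range is fully
continuous, the cache can answer by itself; otherwise the read has to
fall back to the underlying storage.

    #include <optional>
    #include <vector>

    // Returns cached keys for (start, end] when the cache is known to
    // be complete for that range, or nullopt to signal that the data
    // must be read from the underlying sstables instead.
    std::optional<std::vector<int>> read_from_cache(
            const std::vector<entry>& entries, int start, int end) {
        if (!range_is_continuous(entries, start, end)) {
            return std::nullopt; // information incomplete
        }
        std::vector<int> keys;
        for (const auto& e : entries) {
            if (e.key > start && e.key <= end && !e.dummy) {
                keys.push_back(e.key); // dummy entries carry no row data
            }
        }
        return keys;
    }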
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * Modified by ScyllaDB
 * Copyright (C) 2015 ScyllaDB
 */

/*
 * This file is part of Scylla.
 *
 * Scylla is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Scylla is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Scylla. If not, see <http://www.gnu.org/licenses/>.
 */

#include "batch_statement.hh"
#include "raw/batch_statement.hh"
#include "db/config.hh"
#include <seastar/core/execution_stage.hh>

namespace {

struct mutation_equals_by_key {
    bool operator()(const mutation& m1, const mutation& m2) const {
        return m1.schema() == m2.schema()
                && m1.decorated_key().equal(*m1.schema(), m2.decorated_key());
    }
};

struct mutation_hash_by_key {
    size_t operator()(const mutation& m) const {
        auto dk_hash = std::hash<dht::decorated_key>();
        return dk_hash(m.decorated_key());
    }
};

}

namespace cql3 {

namespace statements {

logging::logger batch_statement::_logger("BatchStatement");

batch_statement::batch_statement(int bound_terms, type type_,
                                 std::vector<shared_ptr<modification_statement>> statements,
                                 std::unique_ptr<attributes> attrs,
                                 cql_stats& stats)
    : _bound_terms(bound_terms), _type(type_), _statements(std::move(statements))
    , _attrs(std::move(attrs))
    , _has_conditions(boost::algorithm::any_of(_statements, std::mem_fn(&modification_statement::has_conditions)))
    , _stats(stats)
{
}

batch_statement::batch_statement(type type_,
                                 std::vector<shared_ptr<modification_statement>> statements,
                                 std::unique_ptr<attributes> attrs,
                                 cql_stats& stats)
    : batch_statement(-1, type_, std::move(statements), std::move(attrs), stats)
{
}

bool batch_statement::uses_function(const sstring& ks_name, const sstring& function_name) const
{
    return _attrs->uses_function(ks_name, function_name)
            || boost::algorithm::any_of(_statements, [&] (auto&& s) { return s->uses_function(ks_name, function_name); });
}

bool batch_statement::depends_on_keyspace(const sstring& ks_name) const
{
    return false;
}

bool batch_statement::depends_on_column_family(const sstring& cf_name) const
{
    return false;
}

uint32_t batch_statement::get_bound_terms()
{
    return _bound_terms;
}

future<> batch_statement::check_access(const service::client_state& state)
{
    return parallel_for_each(_statements.begin(), _statements.end(), [&state](auto&& s) {
        return s->check_access(state);
    });
}

void batch_statement::validate()
{
    if (_attrs->is_time_to_live_set()) {
        throw exceptions::invalid_request_exception("Global TTL on the BATCH statement is not supported.");
    }

    bool timestamp_set = _attrs->is_timestamp_set();
    if (timestamp_set) {
        if (_has_conditions) {
            throw exceptions::invalid_request_exception("Cannot provide custom timestamp for conditional BATCH");
        }
        if (_type == type::COUNTER) {
            throw exceptions::invalid_request_exception("Cannot provide custom timestamp for counter BATCH");
        }
    }

    bool has_counters = boost::algorithm::any_of(_statements, std::mem_fn(&modification_statement::is_counter));
    bool has_non_counters = !boost::algorithm::all_of(_statements, std::mem_fn(&modification_statement::is_counter));
    if (timestamp_set && has_counters) {
        throw exceptions::invalid_request_exception("Cannot provide custom timestamp for a BATCH containing counters");
    }
    if (timestamp_set && boost::algorithm::any_of(_statements, std::mem_fn(&modification_statement::is_timestamp_set))) {
        throw exceptions::invalid_request_exception("Timestamp must be set either on BATCH or individual statements");
    }
    if (_type == type::COUNTER && has_non_counters) {
        throw exceptions::invalid_request_exception("Cannot include non-counter statement in a counter batch");
    }
    if (_type == type::LOGGED && has_counters) {
        throw exceptions::invalid_request_exception("Cannot include a counter statement in a logged batch");
    }
    if (has_counters && has_non_counters) {
        throw exceptions::invalid_request_exception("Counter and non-counter mutations cannot exist in the same batch");
    }

    if (_has_conditions
            && !_statements.empty()
            && (boost::distance(_statements
                            | boost::adaptors::transformed(std::mem_fn(&modification_statement::keyspace))
                            | boost::adaptors::uniqued) != 1
                || (boost::distance(_statements
                            | boost::adaptors::transformed(std::mem_fn(&modification_statement::column_family))
                            | boost::adaptors::uniqued) != 1))) {
        throw exceptions::invalid_request_exception("Batch with conditions cannot span multiple tables");
    }

    std::experimental::optional<bool> raw_counter;
    for (auto& s : _statements) {
        if (raw_counter && s->is_raw_counter_shard_write() != *raw_counter) {
            throw exceptions::invalid_request_exception("Cannot mix raw and regular counter statements in batch");
        }
        raw_counter = s->is_raw_counter_shard_write();
    }
}

void batch_statement::validate(distributed<service::storage_proxy>& proxy, const service::client_state& state)
{
    for (auto&& s : _statements) {
        s->validate(proxy, state);
    }
}

const std::vector<shared_ptr<modification_statement>>& batch_statement::get_statements()
{
    return _statements;
}

future<std::vector<mutation>> batch_statement::get_mutations(distributed<service::storage_proxy>& storage, const query_options& options, bool local, api::timestamp_type now, tracing::trace_state_ptr trace_state) {
    // Do not process in parallel because operations like list append/prepend depend on execution order.
    using mutation_set_type = std::unordered_set<mutation, mutation_hash_by_key, mutation_equals_by_key>;
    return do_with(mutation_set_type(), [this, &storage, &options, now, local, trace_state] (auto& result) {
        result.reserve(_statements.size());
        _stats.statements_in_batches += _statements.size();
        return do_for_each(boost::make_counting_iterator<size_t>(0),
                boost::make_counting_iterator<size_t>(_statements.size()),
                [this, &storage, &options, now, local, &result, trace_state] (size_t i) {
            auto&& statement = _statements[i];
            statement->inc_cql_stats();
            auto&& statement_options = options.for_statement(i);
            auto timestamp = _attrs->get_timestamp(now, statement_options);
            return statement->get_mutations(storage, statement_options, local, timestamp, trace_state).then([&result] (auto&& more) {
                for (auto&& m : more) {
                    // We want unordered_set::try_emplace(), but we don't have it
                    auto pos = result.find(m);
                    if (pos == result.end()) {
                        result.emplace(std::move(m));
                    } else {
                        const_cast<mutation&>(*pos).apply(std::move(m)); // Won't change key
                    }
                }
            });
        }).then([&result] {
            // can't use range adaptors, because we want to move
            auto vresult = std::vector<mutation>();
            vresult.reserve(result.size());
            for (auto&& m : result) {
                vresult.push_back(std::move(m));
            }
            return vresult;
        });
    });
}

void batch_statement::verify_batch_size(const std::vector<mutation>& mutations) {
    if (mutations.size() <= 1) {
        return; // We only warn for batch spanning multiple mutations
    }

    size_t warn_threshold = service::get_local_storage_proxy().get_db().local().get_config().batch_size_warn_threshold_in_kb() * 1024;
    size_t fail_threshold = service::get_local_storage_proxy().get_db().local().get_config().batch_size_fail_threshold_in_kb() * 1024;

    class my_partition_visitor : public mutation_partition_visitor {
    public:
        void accept_partition_tombstone(tombstone) override {}
        void accept_static_cell(column_id, atomic_cell_view v) override {
            size += v.value().size();
        }
        void accept_static_cell(column_id, collection_mutation_view v) override {
            size += v.data.size();
        }
        void accept_row_tombstone(const range_tombstone&) override {}
        void accept_row(position_in_partition_view, const row_tombstone&, const row_marker&, is_dummy, is_continuous) override {}
        void accept_row_cell(column_id, atomic_cell_view v) override {
            size += v.value().size();
        }
        void accept_row_cell(column_id, collection_mutation_view v) override {
            size += v.data.size();
        }

        size_t size = 0;
    };

    my_partition_visitor v;

    for (auto& m : mutations) {
        m.partition().accept(*m.schema(), v);
    }

    if (v.size > warn_threshold) {
        auto error = [&] (const char* type, size_t threshold) -> sstring {
            std::unordered_set<sstring> ks_cf_pairs;
            for (auto&& m : mutations) {
                ks_cf_pairs.insert(m.schema()->ks_name() + "." + m.schema()->cf_name());
            }
            return sprint("Batch of prepared statements for %s is of size %d, exceeding specified %s threshold of %d by %d.",
                    join(", ", ks_cf_pairs), v.size, type, threshold, v.size - threshold);
        };
        if (v.size > fail_threshold) {
            _logger.error(error("FAIL", fail_threshold).c_str());
            throw exceptions::invalid_request_exception("Batch too large");
        } else {
            _logger.warn(error("WARN", warn_threshold).c_str());
        }
    }
}

struct batch_statement_executor {
    static auto get() { return &batch_statement::do_execute; }
};
static thread_local auto batch_stage = seastar::make_execution_stage("cql3_batch", batch_statement_executor::get());

future<shared_ptr<cql_transport::messages::result_message>> batch_statement::execute(
        distributed<service::storage_proxy>& storage, service::query_state& state, const query_options& options) {
    ++_stats.batches;
    return batch_stage(this, seastar::ref(storage), seastar::ref(state),
                       seastar::cref(options), false, options.get_timestamp(state));
}

future<shared_ptr<cql_transport::messages::result_message>> batch_statement::do_execute(
        distributed<service::storage_proxy>& storage,
        service::query_state& query_state, const query_options& options,
        bool local, api::timestamp_type now)
{
    // FIXME: we don't support nulls here
#if 0
    if (options.get_consistency() == null)
        throw new InvalidRequestException("Invalid empty consistency level");
    if (options.getSerialConsistency() == null)
        throw new InvalidRequestException("Invalid empty serial consistency level");
#endif
    if (_has_conditions) {
        return execute_with_conditions(storage, options, query_state);
    }

    return get_mutations(storage, options, local, now, query_state.get_trace_state()).then([this, &storage, &options, tr_state = query_state.get_trace_state()] (std::vector<mutation> ms) mutable {
        return execute_without_conditions(storage, std::move(ms), options.get_consistency(), std::move(tr_state));
    }).then([] {
        return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(
                make_shared<cql_transport::messages::result_message::void_message>());
    });
}

future<> batch_statement::execute_without_conditions(
        distributed<service::storage_proxy>& storage,
        std::vector<mutation> mutations,
        db::consistency_level cl,
        tracing::trace_state_ptr tr_state)
{
    // FIXME: do we need to do this?
#if 0
    // Extract each collection of cfs from it's IMutation and then lazily concatenate all of them into a single Iterable.
    Iterable<ColumnFamily> cfs = Iterables.concat(Iterables.transform(mutations, new Function<IMutation, Collection<ColumnFamily>>()
    {
        public Collection<ColumnFamily> apply(IMutation im)
        {
            return im.getColumnFamilies();
        }
    }));
#endif
    verify_batch_size(mutations);

    bool mutate_atomic = true;
    if (_type != type::LOGGED) {
        _stats.batches_pure_unlogged += 1;
        mutate_atomic = false;
    } else {
        if (mutations.size() > 1) {
            _stats.batches_pure_logged += 1;
        } else {
            _stats.batches_unlogged_from_logged += 1;
            mutate_atomic = false;
        }
    }
    return storage.local().mutate_with_triggers(std::move(mutations), cl, mutate_atomic, std::move(tr_state));
}

future<shared_ptr<cql_transport::messages::result_message>> batch_statement::execute_with_conditions(
        distributed<service::storage_proxy>& storage,
        const query_options& options,
        service::query_state& state)
{
    fail(unimplemented::cause::LWT);
#if 0
    auto now = state.get_timestamp();
    ByteBuffer key = null;
    String ksName = null;
    String cfName = null;
    CQL3CasRequest casRequest = null;
    Set<ColumnDefinition> columnsWithConditions = new LinkedHashSet<>();

    for (int i = 0; i < statements.size(); i++)
    {
        ModificationStatement statement = statements.get(i);
        QueryOptions statementOptions = options.forStatement(i);
        long timestamp = attrs.getTimestamp(now, statementOptions);
        List<ByteBuffer> pks = statement.buildPartitionKeyNames(statementOptions);
        if (pks.size() > 1)
            throw new IllegalArgumentException("Batch with conditions cannot span multiple partitions (you cannot use IN on the partition key)");
        if (key == null)
        {
            key = pks.get(0);
            ksName = statement.cfm.ksName;
            cfName = statement.cfm.cfName;
            casRequest = new CQL3CasRequest(statement.cfm, key, true);
        }
        else if (!key.equals(pks.get(0)))
        {
            throw new InvalidRequestException("Batch with conditions cannot span multiple partitions");
        }

        Composite clusteringPrefix = statement.createClusteringPrefix(statementOptions);
        if (statement.hasConditions())
        {
            statement.addConditions(clusteringPrefix, casRequest, statementOptions);
            // As soon as we have a ifNotExists, we set columnsWithConditions to null so that everything is in the resultSet
            if (statement.hasIfNotExistCondition() || statement.hasIfExistCondition())
                columnsWithConditions = null;
            else if (columnsWithConditions != null)
                Iterables.addAll(columnsWithConditions, statement.getColumnsWithConditions());
        }
        casRequest.addRowUpdate(clusteringPrefix, statement, statementOptions, timestamp);
    }

    ColumnFamily result = StorageProxy.cas(ksName, cfName, key, casRequest, options.getSerialConsistency(), options.getConsistency(), state.getClientState());

    return new ResultMessage.Rows(ModificationStatement.buildCasResultSet(ksName, key, cfName, result, columnsWithConditions, true, options.forStatement(0)));
#endif
}

future<shared_ptr<cql_transport::messages::result_message>> batch_statement::execute_internal(
        distributed<service::storage_proxy>& proxy,
        service::query_state& query_state, const query_options& options)
{
    throw std::runtime_error(sprint("%s not implemented", __PRETTY_FUNCTION__));
#if 0
    assert !hasConditions;
    for (IMutation mutation : getMutations(BatchQueryOptions.withoutPerStatementVariables(options), true, queryState.getTimestamp()))
    {
        // We don't use counters internally.
        assert mutation instanceof Mutation;
        ((Mutation) mutation).apply();
    }
    return null;
#endif
}

namespace raw {

std::unique_ptr<prepared_statement>
batch_statement::prepare(database& db, cql_stats& stats) {
    auto&& bound_names = get_bound_variables();

    stdx::optional<sstring> first_ks;
    stdx::optional<sstring> first_cf;
    bool have_multiple_cfs = false;

    std::vector<shared_ptr<cql3::statements::modification_statement>> statements;
    for (auto&& parsed : _parsed_statements) {
        if (!first_ks) {
            first_ks = parsed->keyspace();
            first_cf = parsed->column_family();
        } else {
            have_multiple_cfs = first_ks.value() != parsed->keyspace() || first_cf.value() != parsed->column_family();
        }
        statements.push_back(parsed->prepare(db, bound_names, stats));
    }

    auto&& prep_attrs = _attrs->prepare(db, "[batch]", "[batch]");
    prep_attrs->collect_marker_specification(bound_names);

    cql3::statements::batch_statement batch_statement_(bound_names->size(), _type, std::move(statements), std::move(prep_attrs), stats);
    batch_statement_.validate();

    std::vector<uint16_t> partition_key_bind_indices;
    if (!have_multiple_cfs && batch_statement_.get_statements().size() > 0) {
        partition_key_bind_indices = bound_names->get_partition_key_bind_indexes(batch_statement_.get_statements()[0]->s);
    }
    return std::make_unique<prepared>(make_shared(std::move(batch_statement_)),
                                      bound_names->get_specifications(),
                                      std::move(partition_key_bind_indices));
}

}

}

}