Compare commits
141 Commits
next
...
scylla-1.3
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ec3ace5aa3 | ||
|
|
1f2d1012be | ||
|
|
9c8cfd3c0e | ||
|
|
d9e4ab38f6 | ||
|
|
5d72e96ccc | ||
|
|
a072df0e09 | ||
|
|
8291ec13fa | ||
|
|
c485551488 | ||
|
|
b85164bc1d | ||
|
|
11d7f83d52 | ||
|
|
5c4a24c1c0 | ||
|
|
306eeedf3e | ||
|
|
9eada540d9 | ||
|
|
a662765087 | ||
|
|
192f89bc6f | ||
|
|
b16bb0c299 | ||
|
|
cd9d967c44 | ||
|
|
236b089b03 | ||
|
|
9d54b33644 | ||
|
|
4ef6c3155e | ||
|
|
fe529606ae | ||
|
|
85376ce555 | ||
|
|
5e8ac82614 | ||
|
|
22c8520d61 | ||
|
|
e7355c9b60 | ||
|
|
3f54e0c28e | ||
|
|
4c6f8f9d85 | ||
|
|
7a76157cb9 | ||
|
|
b2e6a52461 | ||
|
|
b1376fef9b | ||
|
|
23f4813a48 | ||
|
|
c16c3127fe | ||
|
|
48fdeb47e2 | ||
|
|
9ef4006d67 | ||
|
|
75c53e4f24 | ||
|
|
66292c0ef0 | ||
|
|
84f7d9a49c | ||
|
|
f0535eae9b | ||
|
|
4f096b60df | ||
|
|
4ac160f2fe | ||
|
|
395edc4361 | ||
|
|
e2c9feafa3 | ||
|
|
f4dea17c19 | ||
|
|
a45b72b66f | ||
|
|
be1c2a875b | ||
|
|
0b9f83c6b6 | ||
|
|
0d77615b80 | ||
|
|
8771220745 | ||
|
|
f552a62169 | ||
|
|
696a978611 | ||
|
|
0475a98de1 | ||
|
|
0b69e37065 | ||
|
|
dc6be68852 | ||
|
|
8c20741150 | ||
|
|
3e3eaa693c | ||
|
|
03ef0a9231 | ||
|
|
47bf8181af | ||
|
|
8d542221eb | ||
|
|
c0e387e1ac | ||
|
|
57d3dc5c66 | ||
|
|
2daee0b62d | ||
|
|
3eddf5ac54 | ||
|
|
42d6f389f9 | ||
|
|
1a6f6f1605 | ||
|
|
8d8e997f5a | ||
|
|
50ee889679 | ||
|
|
325f917d8a | ||
|
|
b088dd7d9e | ||
|
|
a42b2bb0d6 | ||
|
|
aecda01f8e | ||
|
|
f9b0a29def | ||
|
|
192e935832 | ||
|
|
436ff3488a | ||
|
|
b91712fc36 | ||
|
|
be954ccaec | ||
|
|
2bffa8af74 | ||
|
|
4a6d0d503f | ||
|
|
fa81385469 | ||
|
|
93981aaa93 | ||
|
|
89b40f54db | ||
|
|
99dfbedf36 | ||
|
|
e95f4eaee4 | ||
|
|
2570da2006 | ||
|
|
b224ff6ede | ||
|
|
6960fce9b2 | ||
|
|
a556265ccd | ||
|
|
8243d3d1e0 | ||
|
|
2665bfdc93 | ||
|
|
3a1e8fffde | ||
|
|
23c340bed8 | ||
|
|
ff8a795021 | ||
|
|
d11b0cac3b | ||
|
|
5ad0448cc9 | ||
|
|
35ab2cadc2 | ||
|
|
a1cee9f97c | ||
|
|
0ae7347d8e | ||
|
|
b04168c015 | ||
|
|
4e13853cbc | ||
|
|
503f6c6755 | ||
|
|
7d73599acd | ||
|
|
bf27379583 | ||
|
|
02cf5a517a | ||
|
|
ec3d59bf13 | ||
|
|
30c72ef3b4 | ||
|
|
15e69a32ba | ||
|
|
4e43cb84ff | ||
|
|
07d5e939be | ||
|
|
a2a5a22504 | ||
|
|
a39bec0e24 | ||
|
|
f0af5719d5 | ||
|
|
0523000af5 | ||
|
|
69a0e6e002 | ||
|
|
58d4de295c | ||
|
|
026061733f | ||
|
|
1d7ed190f8 | ||
|
|
2d66a4621a | ||
|
|
aaa9b5ace8 | ||
|
|
8d491e9879 | ||
|
|
b63c9fb84b | ||
|
|
b229f03198 | ||
|
|
6caa59560b | ||
|
|
79196af9fb | ||
|
|
afe09da858 | ||
|
|
d6cb41ff24 | ||
|
|
6bf77c7b49 | ||
|
|
6d34b4dab7 | ||
|
|
d367f1e9ab | ||
|
|
75a36ae453 | ||
|
|
35c1781913 | ||
|
|
1489b28ffd | ||
|
|
f975653c94 | ||
|
|
96f5cbb604 | ||
|
|
66ebef7d10 | ||
|
|
789fb0db97 | ||
|
|
af7c0f6433 | ||
|
|
aaf6786997 | ||
|
|
e8cb163cdf | ||
|
|
2d7c322805 | ||
|
|
13f18c6445 | ||
|
|
9c430c2cff | ||
|
|
c84e030fe9 |
2
.gitmodules
vendored
2
.gitmodules
vendored
@@ -1,6 +1,6 @@
|
||||
[submodule "seastar"]
|
||||
path = seastar
|
||||
url = ../seastar
|
||||
url = ../scylla-seastar
|
||||
ignore = dirty
|
||||
[submodule "swagger-ui"]
|
||||
path = swagger-ui
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#!/bin/sh
|
||||
|
||||
VERSION=666.development
|
||||
VERSION=1.3.0
|
||||
|
||||
if test -f version
|
||||
then
|
||||
|
||||
@@ -219,8 +219,9 @@ static future<json::json_return_type> sum_sstable(http_context& ctx, const sstr
|
||||
auto uuid = get_uuid(name, ctx.db.local());
|
||||
return ctx.db.map_reduce0([uuid, total](database& db) {
|
||||
std::unordered_map<sstring, uint64_t> m;
|
||||
for (auto t :*((total) ? db.find_column_family(uuid).get_sstables_including_compacted_undeleted() :
|
||||
db.find_column_family(uuid).get_sstables()).get()) {
|
||||
auto sstables = (total) ? db.find_column_family(uuid).get_sstables_including_compacted_undeleted() :
|
||||
db.find_column_family(uuid).get_sstables();
|
||||
for (auto t : *sstables) {
|
||||
m[t->get_filename()] = t->bytes_on_disk();
|
||||
}
|
||||
return m;
|
||||
@@ -234,8 +235,9 @@ static future<json::json_return_type> sum_sstable(http_context& ctx, const sstr
|
||||
static future<json::json_return_type> sum_sstable(http_context& ctx, bool total) {
|
||||
return map_reduce_cf_raw(ctx, std::unordered_map<sstring, uint64_t>(), [total](column_family& cf) {
|
||||
std::unordered_map<sstring, uint64_t> m;
|
||||
for (auto t :*((total) ? cf.get_sstables_including_compacted_undeleted() :
|
||||
cf.get_sstables()).get()) {
|
||||
auto sstables = (total) ? cf.get_sstables_including_compacted_undeleted() :
|
||||
cf.get_sstables();
|
||||
for (auto t : *sstables) {
|
||||
m[t->get_filename()] = t->bytes_on_disk();
|
||||
}
|
||||
return m;
|
||||
|
||||
@@ -28,7 +28,11 @@ class checked_file_impl : public file_impl {
|
||||
public:
|
||||
|
||||
checked_file_impl(disk_error_signal_type& s, file f)
|
||||
: _signal(s) , _file(f) {}
|
||||
: _signal(s) , _file(f) {
|
||||
_memory_dma_alignment = f.memory_dma_alignment();
|
||||
_disk_read_dma_alignment = f.disk_read_dma_alignment();
|
||||
_disk_write_dma_alignment = f.disk_write_dma_alignment();
|
||||
}
|
||||
|
||||
virtual future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc) override {
|
||||
return do_io_check(_signal, [&] {
|
||||
|
||||
@@ -60,6 +60,10 @@ public:
|
||||
virtual const std::vector<range<clustering_key_prefix>>& get_ranges(const partition_key& key) override {
|
||||
return _ranges;
|
||||
}
|
||||
|
||||
virtual bool want_static_columns(const partition_key& key) override {
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
class partition_slice_clustering_key_filter_factory : public clustering_key_filter_factory {
|
||||
@@ -95,6 +99,10 @@ public:
|
||||
}
|
||||
return _slice.row_ranges(*_schema, key);
|
||||
}
|
||||
|
||||
virtual bool want_static_columns(const partition_key& key) override {
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
static const shared_ptr<clustering_key_filter_factory>
|
||||
|
||||
@@ -46,6 +46,9 @@ public:
|
||||
// Create a clustering key filter that can be used for multiple clustering keys but they have to be sorted.
|
||||
virtual clustering_key_filter get_filter_for_sorted(const partition_key&) = 0;
|
||||
virtual const std::vector<range<clustering_key_prefix>>& get_ranges(const partition_key&) = 0;
|
||||
// Whether we want to get the static row, in addition to the desired clustering rows
|
||||
virtual bool want_static_columns(const partition_key&) = 0;
|
||||
|
||||
virtual ~clustering_key_filter_factory() = default;
|
||||
};
|
||||
|
||||
@@ -65,6 +68,10 @@ public:
|
||||
}
|
||||
const std::vector<range<clustering_key_prefix>>& get_ranges(const partition_key& key) const;
|
||||
|
||||
bool want_static_columns(const partition_key& key) const {
|
||||
return _factory ? _factory->want_static_columns(key) : true;
|
||||
}
|
||||
|
||||
static const clustering_key_filtering_context create(schema_ptr, const partition_slice&);
|
||||
|
||||
static clustering_key_filtering_context create_no_filtering();
|
||||
|
||||
@@ -414,8 +414,12 @@ public:
|
||||
return _bytes.empty();
|
||||
}
|
||||
|
||||
static bool is_static(bytes_view bytes, bool is_compound) {
|
||||
return is_compound && bytes.size() > 2 && (bytes[0] & bytes[1] & 0xff) == 0xff;
|
||||
}
|
||||
|
||||
bool is_static() const {
|
||||
return size() > 2 && (_bytes.at(0) & _bytes.at(1) & 0xff) == 0xff;
|
||||
return is_static(_bytes, _is_compound);
|
||||
}
|
||||
|
||||
bool is_compound() const {
|
||||
@@ -514,7 +518,7 @@ public:
|
||||
}
|
||||
|
||||
bool is_static() const {
|
||||
return size() > 2 && (_bytes.at(0) & _bytes.at(1)) == 0xff;
|
||||
return composite::is_static(_bytes, _is_compound);
|
||||
}
|
||||
|
||||
explicit operator bytes_view() const {
|
||||
|
||||
2
conf/housekeeping.cfg
Normal file
2
conf/housekeeping.cfg
Normal file
@@ -0,0 +1,2 @@
|
||||
[housekeeping]
|
||||
check-version: True
|
||||
@@ -784,7 +784,7 @@ commitlog_total_space_in_mb: -1
|
||||
# can be: all - all traffic is compressed
|
||||
# dc - traffic between different datacenters is compressed
|
||||
# none - nothing is compressed.
|
||||
# internode_compression: all
|
||||
# internode_compression: none
|
||||
|
||||
# Enable or disable tcp_nodelay for inter-dc communication.
|
||||
# Disabling it will result in larger (but fewer) network packets being sent,
|
||||
|
||||
@@ -35,7 +35,7 @@ class converting_mutation_partition_applier : public mutation_partition_visitor
|
||||
deletable_row* _current_row;
|
||||
private:
|
||||
static bool is_compatible(const column_definition& new_def, const data_type& old_type, column_kind kind) {
|
||||
return new_def.kind == kind && new_def.type->is_value_compatible_with(*old_type);
|
||||
return ::is_compatible(new_def.kind, kind) && new_def.type->is_value_compatible_with(*old_type);
|
||||
}
|
||||
void accept_cell(row& dst, column_kind kind, const column_definition& new_def, const data_type& old_type, atomic_cell_view cell) {
|
||||
if (is_compatible(new_def, old_type, kind) && cell.timestamp() > new_def.dropped_at()) {
|
||||
|
||||
@@ -92,9 +92,9 @@ void cf_prop_defs::validate() {
|
||||
throw exceptions::configuration_exception(sstring("Missing sub-option '") + COMPACTION_STRATEGY_CLASS_KEY + "' for the '" + KW_COMPACTION + "' option.");
|
||||
}
|
||||
_compaction_strategy_class = sstables::compaction_strategy::type(strategy->second);
|
||||
#if 0
|
||||
compactionOptions.remove(COMPACTION_STRATEGY_CLASS_KEY);
|
||||
remove_from_map_if_exists(KW_COMPACTION, COMPACTION_STRATEGY_CLASS_KEY);
|
||||
|
||||
#if 0
|
||||
CFMetaData.validateCompactionOptions(compactionStrategyClass, compactionOptions);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -181,6 +181,21 @@ long property_definitions::to_long(sstring key, std::experimental::optional<sstr
|
||||
}
|
||||
}
|
||||
|
||||
void property_definitions::remove_from_map_if_exists(const sstring& name, const sstring& key)
|
||||
{
|
||||
auto it = _properties.find(name);
|
||||
if (it == _properties.end()) {
|
||||
return;
|
||||
}
|
||||
try {
|
||||
auto map = boost::any_cast<std::map<sstring, sstring>>(it->second);
|
||||
map.erase(key);
|
||||
_properties[name] = map;
|
||||
} catch (const boost::bad_any_cast& e) {
|
||||
throw exceptions::syntax_exception(sprint("Invalid value for property '%s'. It should be a map.", name));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -79,6 +79,7 @@ protected:
|
||||
|
||||
std::experimental::optional<std::map<sstring, sstring>> get_map(const sstring& name) const;
|
||||
|
||||
void remove_from_map_if_exists(const sstring& name, const sstring& key);
|
||||
public:
|
||||
bool has_property(const sstring& name) const;
|
||||
|
||||
|
||||
10
database.cc
10
database.cc
@@ -127,7 +127,7 @@ column_family::column_family(schema_ptr schema, config config, db::commitlog* cl
|
||||
, _streaming_memtables(_config.enable_disk_writes ? make_streaming_memtable_list() : make_memory_only_memtable_list())
|
||||
, _compaction_strategy(make_compaction_strategy(_schema->compaction_strategy(), _schema->compaction_strategy_options()))
|
||||
, _sstables(make_lw_shared(_compaction_strategy.make_sstable_set(_schema)))
|
||||
, _cache(_schema, sstables_as_mutation_source(), sstables_as_key_source(), global_cache_tracker())
|
||||
, _cache(_schema, sstables_as_mutation_source(), sstables_as_key_source(), global_cache_tracker(), _config.max_cached_partition_size_in_bytes)
|
||||
, _commitlog(cl)
|
||||
, _compaction_manager(compaction_manager)
|
||||
, _flush_queue(std::make_unique<memtable_flush_queue>())
|
||||
@@ -785,7 +785,7 @@ future<> column_family::seal_active_streaming_memtable_big(streaming_memtable_bi
|
||||
future<>
|
||||
column_family::seal_active_memtable(memtable_list::flush_behavior ignored) {
|
||||
auto old = _memtables->back();
|
||||
dblog.debug("Sealing active memtable, partitions: {}, occupancy: {}", old->partition_count(), old->occupancy());
|
||||
dblog.debug("Sealing active memtable of {}.{}, partitions: {}, occupancy: {}", _schema->cf_name(), _schema->ks_name(), old->partition_count(), old->occupancy());
|
||||
|
||||
if (old->empty()) {
|
||||
dblog.debug("Memtable is empty");
|
||||
@@ -1581,7 +1581,7 @@ future<> database::parse_system_tables(distributed<service::storage_proxy>& prox
|
||||
return parallel_for_each(tables.begin(), tables.end(), [this] (auto& t) {
|
||||
auto s = t.second;
|
||||
auto& ks = this->find_keyspace(s->ks_name());
|
||||
auto cfg = ks.make_column_family_config(*s);
|
||||
auto cfg = ks.make_column_family_config(*s, this->get_config());
|
||||
this->add_column_family(s, std::move(cfg));
|
||||
return ks.make_directory_for_column_family(s->cf_name(), s->id()).then([s] {});
|
||||
});
|
||||
@@ -1838,7 +1838,7 @@ void keyspace::update_from(::lw_shared_ptr<keyspace_metadata> ksm) {
|
||||
}
|
||||
|
||||
column_family::config
|
||||
keyspace::make_column_family_config(const schema& s) const {
|
||||
keyspace::make_column_family_config(const schema& s, const db::config& db_config) const {
|
||||
column_family::config cfg;
|
||||
cfg.datadir = column_family_directory(s.cf_name(), s.id());
|
||||
cfg.enable_disk_reads = _config.enable_disk_reads;
|
||||
@@ -1852,6 +1852,7 @@ keyspace::make_column_family_config(const schema& s) const {
|
||||
cfg.read_concurrency_config = _config.read_concurrency_config;
|
||||
cfg.cf_stats = _config.cf_stats;
|
||||
cfg.enable_incremental_backups = _config.enable_incremental_backups;
|
||||
cfg.max_cached_partition_size_in_bytes = db_config.max_cached_partition_size_in_kb() * 1024;
|
||||
|
||||
return cfg;
|
||||
}
|
||||
@@ -2502,6 +2503,7 @@ future<> update_schema_version_and_announce(distributed<service::storage_proxy>&
|
||||
return make_ready_future<>();
|
||||
}).then([uuid] {
|
||||
return db::system_keyspace::update_schema_version(uuid).then([uuid] {
|
||||
dblog.info("Schema version changed to {}", uuid);
|
||||
return service::get_local_migration_manager().passive_announce(uuid);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -316,6 +316,7 @@ public:
|
||||
::dirty_memory_manager* streaming_dirty_memory_manager = &default_dirty_memory_manager;
|
||||
restricted_mutation_reader_config read_concurrency_config;
|
||||
::cf_stats* cf_stats = nullptr;
|
||||
uint64_t max_cached_partition_size_in_bytes;
|
||||
};
|
||||
struct no_commitlog {};
|
||||
struct stats {
|
||||
@@ -884,7 +885,7 @@ public:
|
||||
*/
|
||||
locator::abstract_replication_strategy& get_replication_strategy();
|
||||
const locator::abstract_replication_strategy& get_replication_strategy() const;
|
||||
column_family::config make_column_family_config(const schema& s) const;
|
||||
column_family::config make_column_family_config(const schema& s, const db::config& db_config) const;
|
||||
future<> make_directory_for_column_family(const sstring& name, utils::UUID uuid);
|
||||
void add_or_update_column_family(const schema_ptr& s) {
|
||||
_metadata->add_or_update_column_family(s);
|
||||
|
||||
@@ -369,6 +369,9 @@ public:
|
||||
val(reduce_cache_sizes_at, double, .85, Invalid, \
|
||||
"When Java heap usage (after a full concurrent mark sweep (CMS) garbage collection) exceeds this percentage, Cassandra reduces the cache capacity to the fraction of the current size as specified by reduce_cache_capacity_to. To disable, set the value to 1.0." \
|
||||
) \
|
||||
val(max_cached_partition_size_in_kb, uint64_t, 10240uLL, Used, \
|
||||
"Partitions with size greater than this value won't be cached." \
|
||||
) \
|
||||
/* Disks settings */ \
|
||||
val(stream_throughput_outbound_megabits_per_sec, uint32_t, 400, Unused, \
|
||||
"Throttles all outbound streaming file transfers on a node to the specified throughput. Cassandra does mostly sequential I/O when streaming data during bootstrap or repair, which can lead to saturating the network connection and degrading client (RPC) performance." \
|
||||
@@ -556,7 +559,7 @@ public:
|
||||
val(rpc_port, uint16_t, 9160, Used, \
|
||||
"Thrift port for client connections." \
|
||||
) \
|
||||
val(start_rpc, bool, false, Used, \
|
||||
val(start_rpc, bool, true, Used, \
|
||||
"Starts the Thrift RPC server" \
|
||||
) \
|
||||
val(rpc_keepalive, bool, true, Used, \
|
||||
|
||||
@@ -665,13 +665,16 @@ future<std::set<sstring>> merge_keyspaces(distributed<service::storage_proxy>& p
|
||||
auto diff = difference(before, after, indirect_equal_to<lw_shared_ptr<query::result_set>>());
|
||||
|
||||
for (auto&& key : diff.entries_only_on_left) {
|
||||
logger.info("Dropping keyspace {}", key);
|
||||
dropped.emplace(key);
|
||||
}
|
||||
for (auto&& key : diff.entries_only_on_right) {
|
||||
auto&& value = after[key];
|
||||
logger.info("Creating keyspace {}", key);
|
||||
created.emplace_back(schema_result_value_type{key, std::move(value)});
|
||||
}
|
||||
for (auto&& key : diff.entries_differing) {
|
||||
logger.info("Altering keyspace {}", key);
|
||||
altered.emplace_back(key);
|
||||
}
|
||||
return do_with(std::move(created), [&proxy, altered = std::move(altered)] (auto& created) mutable {
|
||||
@@ -713,15 +716,21 @@ static void merge_tables(distributed<service::storage_proxy>& proxy,
|
||||
std::map<qualified_name, schema_mutations>&& before,
|
||||
std::map<qualified_name, schema_mutations>&& after)
|
||||
{
|
||||
struct dropped_table {
|
||||
global_schema_ptr schema;
|
||||
utils::joinpoint<db_clock::time_point> jp{[] {
|
||||
return make_ready_future<db_clock::time_point>(db_clock::now());
|
||||
}};
|
||||
};
|
||||
std::vector<global_schema_ptr> created;
|
||||
std::vector<global_schema_ptr> altered;
|
||||
std::vector<global_schema_ptr> dropped;
|
||||
std::vector<dropped_table> dropped;
|
||||
|
||||
auto diff = difference(before, after);
|
||||
for (auto&& key : diff.entries_only_on_left) {
|
||||
auto&& s = proxy.local().get_db().local().find_schema(key.keyspace_name, key.table_name);
|
||||
logger.info("Dropping {}.{} id={} version={}", s->ks_name(), s->cf_name(), s->id(), s->version());
|
||||
dropped.emplace_back(s);
|
||||
dropped.emplace_back(dropped_table{s});
|
||||
}
|
||||
for (auto&& key : diff.entries_only_on_right) {
|
||||
auto s = create_table_from_mutations(after.at(key));
|
||||
@@ -734,14 +743,12 @@ static void merge_tables(distributed<service::storage_proxy>& proxy,
|
||||
altered.emplace_back(s);
|
||||
}
|
||||
|
||||
do_with(utils::make_joinpoint([] { return db_clock::now();})
|
||||
, [&created, &dropped, &altered, &proxy](auto& tsf) {
|
||||
return proxy.local().get_db().invoke_on_all([&created, &dropped, &altered, &tsf] (database& db) {
|
||||
proxy.local().get_db().invoke_on_all([&created, &dropped, &altered] (database& db) {
|
||||
return seastar::async([&] {
|
||||
for (auto&& gs : created) {
|
||||
schema_ptr s = gs.get();
|
||||
auto& ks = db.find_keyspace(s->ks_name());
|
||||
auto cfg = ks.make_column_family_config(*s);
|
||||
auto cfg = ks.make_column_family_config(*s, db.get_config());
|
||||
db.add_column_family(s, cfg);
|
||||
auto& cf = db.find_column_family(s);
|
||||
cf.mark_ready_for_writes();
|
||||
@@ -751,14 +758,13 @@ static void merge_tables(distributed<service::storage_proxy>& proxy,
|
||||
for (auto&& gs : altered) {
|
||||
update_column_family(db, gs.get()).get();
|
||||
}
|
||||
parallel_for_each(dropped.begin(), dropped.end(), [&db, &tsf](auto&& gs) {
|
||||
schema_ptr s = gs.get();
|
||||
return db.drop_column_family(s->ks_name(), s->cf_name(), [&tsf] { return tsf.value(); }).then([s] {
|
||||
parallel_for_each(dropped.begin(), dropped.end(), [&db](dropped_table& dt) {
|
||||
schema_ptr s = dt.schema.get();
|
||||
return db.drop_column_family(s->ks_name(), s->cf_name(), [&dt] { return dt.jp.value(); }).then([s] {
|
||||
return service::get_local_migration_manager().notify_drop_column_family(s);
|
||||
});
|
||||
}).get();
|
||||
});
|
||||
});
|
||||
}).get();
|
||||
}
|
||||
|
||||
|
||||
@@ -71,14 +71,30 @@ static std::vector<db::system_keyspace::range_estimates> estimates_for(const col
|
||||
std::vector<db::system_keyspace::range_estimates> estimates;
|
||||
estimates.reserve(local_ranges.size());
|
||||
|
||||
std::vector<query::partition_range> unwrapped;
|
||||
// Each range defines both bounds.
|
||||
for (auto& range : local_ranges) {
|
||||
int64_t count{0};
|
||||
sstables::estimated_histogram hist{0};
|
||||
for (auto&& sstable : cf.select_sstables(range)) {
|
||||
unwrapped.clear();
|
||||
if (range.is_wrap_around(dht::ring_position_comparator(*cf.schema()))) {
|
||||
auto uw = range.unwrap();
|
||||
unwrapped.push_back(std::move(uw.first));
|
||||
unwrapped.push_back(std::move(uw.second));
|
||||
} else {
|
||||
unwrapped.push_back(range);
|
||||
}
|
||||
for (auto&& uwr : unwrapped) {
|
||||
for (auto&& sstable : cf.select_sstables(uwr)) {
|
||||
count += sstable->get_estimated_key_count();
|
||||
hist.merge(sstable->get_stats_metadata().estimated_row_size);
|
||||
}
|
||||
}
|
||||
estimates.emplace_back(&range, db::system_keyspace::partition_estimates{count, count > 0 ? hist.mean() : 0});
|
||||
estimates.emplace_back(db::system_keyspace::range_estimates{
|
||||
range.start()->value().token(),
|
||||
range.end()->value().token(),
|
||||
count,
|
||||
count > 0 ? hist.mean() : 0});
|
||||
}
|
||||
|
||||
return estimates;
|
||||
@@ -130,7 +146,7 @@ future<> size_estimates_recorder::record_size_estimates() {
|
||||
}
|
||||
|
||||
future<> size_estimates_recorder::stop() {
|
||||
if (get_size_estimates_recorder().local_is_initialized()) {
|
||||
if (engine().cpu_id() == 0) {
|
||||
service::get_local_migration_manager().unregister_listener(this);
|
||||
_timer.cancel();
|
||||
return _gate.close();
|
||||
|
||||
@@ -1043,7 +1043,7 @@ void make(database& db, bool durable, bool volatile_testing_only) {
|
||||
db.add_keyspace(NAME, std::move(_ks));
|
||||
auto& ks = db.find_keyspace(NAME);
|
||||
for (auto&& table : all_tables()) {
|
||||
db.add_column_family(table, ks.make_column_family_config(*table));
|
||||
db.add_column_family(table, ks.make_column_family_config(*table, db.get_config()));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1195,10 +1195,10 @@ future<int> increment_and_get_generation() {
|
||||
});
|
||||
}
|
||||
|
||||
future<> update_size_estimates(const sstring& ks_name, const sstring& cf_name, std::vector<range_estimates> estimates) {
|
||||
future<> update_size_estimates(sstring ks_name, sstring cf_name, std::vector<range_estimates> estimates) {
|
||||
auto&& schema = size_estimates();
|
||||
auto timestamp = api::new_timestamp();
|
||||
mutation m_to_apply{partition_key::from_singular(*schema, ks_name), schema};
|
||||
mutation m_to_apply{partition_key::from_single_value(*schema, to_bytes(ks_name)), schema};
|
||||
|
||||
// delete all previous values with a single range tombstone.
|
||||
auto ck = clustering_key_prefix::from_single_value(*schema, utf8_type->decompose(cf_name));
|
||||
@@ -1206,28 +1206,48 @@ future<> update_size_estimates(const sstring& ks_name, const sstring& cf_name, s
|
||||
|
||||
// add a CQL row for each primary token range.
|
||||
for (auto&& e : estimates) {
|
||||
// This range has both start and end bounds. We're only interested in the tokens.
|
||||
const range<dht::ring_position>* range = e.first;
|
||||
auto ck = clustering_key_prefix(std::vector<bytes>{
|
||||
utf8_type->decompose(cf_name),
|
||||
utf8_type->decompose(dht::global_partitioner().to_sstring(range->start()->value().token())),
|
||||
utf8_type->decompose(dht::global_partitioner().to_sstring(range->end()->value().token()))});
|
||||
utf8_type->decompose(dht::global_partitioner().to_sstring(e.range_start_token)),
|
||||
utf8_type->decompose(dht::global_partitioner().to_sstring(e.range_end_token))});
|
||||
|
||||
auto mean_partition_size_col = schema->get_column_definition("mean_partition_size");
|
||||
auto cell = atomic_cell::make_live(timestamp, long_type->decompose(e.second.mean_partition_size), { });
|
||||
auto cell = atomic_cell::make_live(timestamp, long_type->decompose(e.mean_partition_size), { });
|
||||
m_to_apply.set_clustered_cell(ck, *mean_partition_size_col, std::move(cell));
|
||||
|
||||
auto partitions_count_col = schema->get_column_definition("partitions_count");
|
||||
cell = atomic_cell::make_live(timestamp, long_type->decompose(e.second.partitions_count), { });
|
||||
cell = atomic_cell::make_live(timestamp, long_type->decompose(e.partitions_count), { });
|
||||
m_to_apply.set_clustered_cell(std::move(ck), *partitions_count_col, std::move(cell));
|
||||
}
|
||||
|
||||
return service::get_local_storage_proxy().mutate_locally(std::move(m_to_apply));
|
||||
}
|
||||
|
||||
future<> clear_size_estimates(const sstring& ks_name, const sstring& cf_name) {
|
||||
future<> clear_size_estimates(sstring ks_name, sstring cf_name) {
|
||||
sstring req = "DELETE FROM system.%s WHERE keyspace_name = ? AND table_name = ?";
|
||||
return execute_cql(req, SIZE_ESTIMATES, ks_name, cf_name).discard_result();
|
||||
return execute_cql(std::move(req), SIZE_ESTIMATES, std::move(ks_name), std::move(cf_name)).discard_result();
|
||||
}
|
||||
|
||||
future<std::vector<range_estimates>> query_size_estimates(sstring ks_name, sstring cf_name, dht::token start_token, dht::token end_token) {
|
||||
sstring req = "SELECT range_start, range_end, partitions_count, mean_partition_size FROM system.%s WHERE keyspace_name = ? AND table_name = ?";
|
||||
auto query_range = range<dht::token>::make({std::move(start_token)}, {std::move(end_token)});
|
||||
return execute_cql(req, SIZE_ESTIMATES, std::move(ks_name), std::move(cf_name))
|
||||
.then([query_range = std::move(query_range)](::shared_ptr<cql3::untyped_result_set> result) {
|
||||
std::vector<range_estimates> estimates;
|
||||
for (auto&& row : *result) {
|
||||
auto range_start = dht::global_partitioner().from_sstring(row.get_as<sstring>("range_start"));
|
||||
auto range_end = dht::global_partitioner().from_sstring(row.get_as<sstring>("range_end"));
|
||||
auto estimate_range = range<dht::token>::make({std::move(range_start)}, {std::move(range_end)});
|
||||
if (query_range.contains(estimate_range, &dht::tri_compare)) {
|
||||
estimates.emplace_back(range_estimates{
|
||||
std::move(*estimate_range.start()).value(),
|
||||
std::move(*estimate_range.end()).value(),
|
||||
row.get_as<int64_t>("partitions_count"),
|
||||
row.get_as<int64_t>("mean_partition_size")});
|
||||
}
|
||||
}
|
||||
return estimates;
|
||||
});
|
||||
}
|
||||
|
||||
} // namespace system_keyspace
|
||||
|
||||
@@ -80,13 +80,13 @@ static constexpr auto SSTABLE_ACTIVITY = "sstable_activity";
|
||||
static constexpr auto SIZE_ESTIMATES = "size_estimates";
|
||||
|
||||
// Partition estimates for a given range of tokens.
|
||||
struct partition_estimates {
|
||||
struct range_estimates {
|
||||
dht::token range_start_token;
|
||||
dht::token range_end_token;
|
||||
int64_t partitions_count;
|
||||
int64_t mean_partition_size;
|
||||
};
|
||||
|
||||
using range_estimates = std::pair<const range<dht::ring_position>*, partition_estimates>;
|
||||
|
||||
extern schema_ptr hints();
|
||||
extern schema_ptr batchlog();
|
||||
extern schema_ptr built_indexes(); // TODO (from Cassandra): make private
|
||||
@@ -572,12 +572,17 @@ future<> set_bootstrap_state(bootstrap_state state);
|
||||
/**
|
||||
* Writes the current partition count and size estimates into SIZE_ESTIMATES_CF
|
||||
*/
|
||||
future<> update_size_estimates(const sstring& ks_name, const sstring& cf_name, std::vector<range_estimates> estimates);
|
||||
future<> update_size_estimates(sstring ks_name, sstring cf_name, std::vector<range_estimates> estimates);
|
||||
|
||||
/**
|
||||
* Clears size estimates for a table (on table drop)
|
||||
*/
|
||||
future<> clear_size_estimates(const sstring& ks_name, const sstring& cf_name);
|
||||
future<> clear_size_estimates(sstring ks_name, sstring cf_name);
|
||||
|
||||
/**
|
||||
* Queries the size estimates within the specified range
|
||||
*/
|
||||
future<std::vector<range_estimates>> query_size_estimates(sstring ks_name, sstring cf_name, dht::token start_token, dht::token end_token);
|
||||
|
||||
} // namespace system_keyspace
|
||||
} // namespace db
|
||||
|
||||
13
dist/ami/build_ami.sh
vendored
13
dist/ami/build_ami.sh
vendored
@@ -8,7 +8,7 @@ fi
|
||||
print_usage() {
|
||||
echo "build_ami.sh --localrpm --repo [URL]"
|
||||
echo " --localrpm deploy locally built rpms"
|
||||
echo " --repo specify repository URL"
|
||||
echo " --repo specify .repo/.list file URL"
|
||||
exit 1
|
||||
}
|
||||
LOCALRPM=0
|
||||
@@ -16,7 +16,8 @@ while [ $# -gt 0 ]; do
|
||||
case "$1" in
|
||||
"--localrpm")
|
||||
LOCALRPM=1
|
||||
INSTALL_ARGS="$INSTALL_ARGS --localrpm"
|
||||
REPO=`./scripts/scylla_current_repo`
|
||||
INSTALL_ARGS="$INSTALL_ARGS --localrpm --repo $REPO"
|
||||
shift 1
|
||||
;;
|
||||
"--repo")
|
||||
@@ -52,10 +53,13 @@ if [ $LOCALRPM -eq 1 ]; then
|
||||
if [ "$ID" = "centos" ]; then
|
||||
rm -rf build/*
|
||||
sudo yum -y install git
|
||||
if [ ! -f dist/ami/files/scylla-conf.x86_64.rpm ] || [ ! -f dist/ami/files/scylla-server.x86_64.rpm ]; then
|
||||
if [ ! -f dist/ami/files/scylla.x86_64.rpm ] || [ ! -f dist/ami/files/scylla-kernel-conf.x86_64.rpm ] || [ ! -f dist/ami/files/scylla-conf.x86_64.rpm ] || [ ! -f dist/ami/files/scylla-server.x86_64.rpm ] || [ ! -f dist/ami/files/scylla-debuginfo.x86_64.rpm ]; then
|
||||
dist/redhat/build_rpm.sh
|
||||
cp build/rpmbuild/RPMS/x86_64/scylla-`cat build/SCYLLA-VERSION-FILE`-`cat build/SCYLLA-RELEASE-FILE`.*.x86_64.rpm dist/ami/files/scylla.x86_64.rpm
|
||||
cp build/rpmbuild/RPMS/x86_64/scylla-kernel-conf-`cat build/SCYLLA-VERSION-FILE`-`cat build/SCYLLA-RELEASE-FILE`.*.x86_64.rpm dist/ami/files/scylla-kernel-conf.x86_64.rpm
|
||||
cp build/rpmbuild/RPMS/x86_64/scylla-conf-`cat build/SCYLLA-VERSION-FILE`-`cat build/SCYLLA-RELEASE-FILE`.*.x86_64.rpm dist/ami/files/scylla-conf.x86_64.rpm
|
||||
cp build/rpmbuild/RPMS/x86_64/scylla-server-`cat build/SCYLLA-VERSION-FILE`-`cat build/SCYLLA-RELEASE-FILE`.*.x86_64.rpm dist/ami/files/scylla-server.x86_64.rpm
|
||||
cp build/rpmbuild/RPMS/x86_64/scylla-debuginfo-`cat build/SCYLLA-VERSION-FILE`-`cat build/SCYLLA-RELEASE-FILE`.*.x86_64.rpm dist/ami/files/scylla-debuginfo.x86_64.rpm
|
||||
fi
|
||||
if [ ! -f dist/ami/files/scylla-jmx.noarch.rpm ]; then
|
||||
cd build
|
||||
@@ -80,8 +84,11 @@ if [ $LOCALRPM -eq 1 ]; then
|
||||
echo "Build .deb before running build_ami.sh"
|
||||
exit 1
|
||||
fi
|
||||
cp ../scylla_`cat build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/SCYLLA-RELEASE-FILE`-ubuntu1_amd64.deb dist/ami/files/scylla_amd64.deb
|
||||
cp ../scylla-kernel-conf_`cat build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/SCYLLA-RELEASE-FILE`-ubuntu1_amd64.deb dist/ami/files/scylla-kernel-conf_amd64.deb
|
||||
cp ../scylla-conf_`cat build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/SCYLLA-RELEASE-FILE`-ubuntu1_amd64.deb dist/ami/files/scylla-conf_amd64.deb
|
||||
cp ../scylla-server_`cat build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/SCYLLA-RELEASE-FILE`-ubuntu1_amd64.deb dist/ami/files/scylla-server_amd64.deb
|
||||
cp ../scylla-server-dbg_`cat build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/SCYLLA-RELEASE-FILE`-ubuntu1_amd64.deb dist/ami/files/scylla-server-dbg_amd64.deb
|
||||
fi
|
||||
if [ ! -f dist/ami/files/scylla-jmx_all.deb ]; then
|
||||
cd build
|
||||
|
||||
49
dist/ami/files/.bash_profile
vendored
49
dist/ami/files/.bash_profile
vendored
@@ -11,6 +11,14 @@ PATH=$PATH:$HOME/.local/bin:$HOME/bin
|
||||
|
||||
export PATH
|
||||
|
||||
is_supported_instance_type() {
|
||||
TYPE=`curl -s http://169.254.169.254/latest/meta-data/instance-type|cut -d . -f 1`
|
||||
case $TYPE in
|
||||
"m3"|"c3"|"i2") echo 1;;
|
||||
*) echo 0;;
|
||||
esac
|
||||
}
|
||||
|
||||
echo
|
||||
echo ' _____ _ _ _____ ____ '
|
||||
echo ' / ____| | | | | __ \| _ \ '
|
||||
@@ -28,9 +36,11 @@ echo 'CQL Shell:'
|
||||
echo ' cqlsh'
|
||||
echo 'More documentation available at: '
|
||||
echo ' http://www.scylladb.com/doc/'
|
||||
echo 'By default, Scylla sends certain information about this node to a data collection server. For information, see http://www.scylladb.com/privacy/'
|
||||
echo
|
||||
|
||||
. /etc/os-release
|
||||
|
||||
SETUP=0
|
||||
if [ "$ID" != "ubuntu" ]; then
|
||||
if [ "`systemctl status scylla-ami-setup|grep Active|grep exited`" = "" ]; then
|
||||
@@ -71,19 +81,34 @@ else
|
||||
tput sgr0
|
||||
echo
|
||||
else
|
||||
tput setaf 1
|
||||
tput bold
|
||||
echo " ScyllaDB is not started!"
|
||||
tput sgr0
|
||||
echo "Please wait for startup. To see status of ScyllaDB, run "
|
||||
if [ "$ID" = "ubuntu" ]; then
|
||||
echo " 'initctl status scylla-server'"
|
||||
echo "and"
|
||||
echo " 'sudo cat /var/log/upstart/scylla-server.log'"
|
||||
echo
|
||||
if [ `is_supported_instance_type` -eq 0 ]; then
|
||||
TYPE=`curl -s http://169.254.169.254/latest/meta-data/instance-type`
|
||||
tput setaf 1
|
||||
tput bold
|
||||
echo " $TYPE is not supported instance type!"
|
||||
tput sgr0
|
||||
echo -n "To continue startup ScyllaDB on this instance, run 'scylla_io_setup' "
|
||||
if [ "$ID" = "ubuntu" ]; then
|
||||
echo "then 'initctl start scylla-server'."
|
||||
else
|
||||
echo "then 'systemctl start scylla-server'."
|
||||
fi
|
||||
echo "To run ScyllaDB on supported instance type, run AMI in m3/c3/i2 types."
|
||||
else
|
||||
echo " 'systemctl status scylla-server'"
|
||||
echo
|
||||
tput setaf 1
|
||||
tput bold
|
||||
echo " ScyllaDB is not started!"
|
||||
tput sgr0
|
||||
echo "Please wait for startup. To see status of ScyllaDB, run "
|
||||
if [ "$ID" = "ubuntu" ]; then
|
||||
echo " 'initctl status scylla-server'"
|
||||
echo "and"
|
||||
echo " 'sudo cat /var/log/upstart/scylla-server.log'"
|
||||
echo
|
||||
else
|
||||
echo " 'systemctl status scylla-server'"
|
||||
echo
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
2
dist/ami/files/scylla-ami
vendored
2
dist/ami/files/scylla-ami
vendored
Submodule dist/ami/files/scylla-ami updated: 863cc4598a...14c1666423
9
dist/common/scripts/scylla_coredump_setup
vendored
9
dist/common/scripts/scylla_coredump_setup
vendored
@@ -5,16 +5,22 @@
|
||||
print_usage() {
|
||||
echo "scylla_coredump_setup --dump-to-raiddir"
|
||||
echo " --dump-to-raiddir store coredump to /var/lib/scylla"
|
||||
echo " --compress enable compress on systemd-coredump"
|
||||
exit 1
|
||||
}
|
||||
|
||||
SYMLINK=0
|
||||
COMPRESS=no
|
||||
while [ $# -gt 0 ]; do
|
||||
case "$1" in
|
||||
"--dump-to-raiddir")
|
||||
SYMLINK=1
|
||||
shift 1
|
||||
;;
|
||||
"--compress")
|
||||
COMPRESS=yes
|
||||
shift 1
|
||||
;;
|
||||
*)
|
||||
print_usage
|
||||
;;
|
||||
@@ -33,12 +39,13 @@ else
|
||||
cat << EOS > /etc/systemd/coredump.conf
|
||||
[Coredump]
|
||||
Storage=external
|
||||
Compress=yes
|
||||
Compress=$COMPRESS
|
||||
ProcessSizeMax=1024G
|
||||
ExternalSizeMax=1024G
|
||||
EOS
|
||||
if [ $SYMLINK = 1 ]; then
|
||||
rm -rf /var/lib/systemd/coredump
|
||||
mkdir -p /var/lib/scylla/coredump
|
||||
ln -sf /var/lib/scylla/coredump /var/lib/systemd/coredump
|
||||
fi
|
||||
systemctl daemon-reload
|
||||
|
||||
12
dist/common/scripts/scylla_setup
vendored
12
dist/common/scripts/scylla_setup
vendored
@@ -205,7 +205,7 @@ fi
|
||||
if [ $INTERACTIVE -eq 1 ]; then
|
||||
interactive_ask_service "Do you want to enable ScyllaDB services?" "Answer yes to automatically start Scylla when the node boots; answer no to skip this step." "yes" &&:
|
||||
ENABLE_SERVICE=$?
|
||||
if [ $ENABLE_SERVICE -eq 1 ]; then
|
||||
if [ $ENABLE_SERVICE -eq 1 ] && [ ! -f /etc/scylla.d/housekeeping.cfg ]; then
|
||||
interactive_ask_service "Do you want to enable ScyllaDB version check?" "Answer yes to automatically start Scylla-housekeeping that check for newer version, when the node boots; answer no to skip this step." "yes" &&:
|
||||
ENABLE_CHECK_VERSION=$?
|
||||
fi
|
||||
@@ -219,6 +219,16 @@ if [ $ENABLE_SERVICE -eq 1 ]; then
|
||||
systemctl unmask scylla-housekeeping.timer
|
||||
else
|
||||
systemctl mask scylla-housekeeping.timer
|
||||
systemctl stop scylla-housekeeping.timer || true
|
||||
fi
|
||||
fi
|
||||
if [ $ENABLE_CHECK_VERSION -eq 1 ]; then
|
||||
if [ ! -f /etc/scylla.d/housekeeping.cfg ]; then
|
||||
printf "[housekeeping]\ncheck-version: True\n" > /etc/scylla.d/housekeeping.cfg
|
||||
fi
|
||||
else
|
||||
if [ ! -f /etc/scylla.d/housekeeping.cfg ]; then
|
||||
printf "[housekeeping]\ncheck-version: False\n" > /etc/scylla.d/housekeeping.cfg
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
@@ -5,7 +5,7 @@ Description=Scylla Housekeeping
|
||||
Type=simple
|
||||
User=scylla
|
||||
Group=scylla
|
||||
ExecStart=/usr/lib/scylla/scylla-Housekeeping -q version
|
||||
ExecStart=/usr/lib/scylla/scylla-housekeeping -q -c /etc/scylla.d/housekeeping.cfg version
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
@@ -4,6 +4,7 @@ After=scylla-server.service
|
||||
BindsTo=scylla-server.service
|
||||
|
||||
[Timer]
|
||||
OnBootSec=0
|
||||
OnUnitActiveSec=1d
|
||||
|
||||
[Install]
|
||||
1
dist/common/systemd/scylla-server.service.in
vendored
1
dist/common/systemd/scylla-server.service.in
vendored
@@ -12,7 +12,6 @@ LimitAS=infinity
|
||||
LimitNPROC=8096
|
||||
EnvironmentFile=@@SYSCONFDIR@@/scylla-server
|
||||
EnvironmentFile=/etc/scylla.d/*.conf
|
||||
WorkingDirectory=$SCYLLA_HOME
|
||||
ExecStartPre=/usr/lib/scylla/scylla_prepare
|
||||
ExecStart=/usr/bin/scylla $SCYLLA_ARGS $SEASTAR_IO $DEV_MODE $CPUSET
|
||||
ExecStopPost=/usr/lib/scylla/scylla_stop
|
||||
|
||||
42
dist/docker/redhat/Dockerfile
vendored
42
dist/docker/redhat/Dockerfile
vendored
@@ -2,24 +2,42 @@ FROM centos:7
|
||||
|
||||
MAINTAINER Avi Kivity <avi@cloudius-systems.com>
|
||||
|
||||
RUN curl http://downloads.scylladb.com/rpm/unstable/centos/master/latest/scylla.repo -o /etc/yum.repos.d/scylla.repo
|
||||
#enable systemd
|
||||
ENV container docker
|
||||
VOLUME [ "/sys/fs/cgroup" ]
|
||||
|
||||
#install scylla
|
||||
RUN curl http://downloads.scylladb.com/rpm/centos/scylla-1.3.repo -o /etc/yum.repos.d/scylla.repo
|
||||
RUN yum -y install epel-release
|
||||
RUN yum -y clean expire-cache
|
||||
RUN yum -y update
|
||||
RUN yum -y remove boost-thread boost-system
|
||||
RUN yum -y install scylla hostname
|
||||
RUN yum -y install scylla hostname supervisor
|
||||
RUN yum clean all
|
||||
|
||||
ADD start-scylla /start-scylla
|
||||
RUN chown scylla /start-scylla
|
||||
#install python3 for our main script
|
||||
RUN yum -y install python34 python34-PyYAML
|
||||
|
||||
ADD bashrc /var/lib/scylla/.bashrc
|
||||
RUN chown scylla /var/lib/scylla/.bashrc
|
||||
RUN chown -R scylla:scylla /etc/scylla
|
||||
RUN chown -R scylla:scylla /etc/scylla.d
|
||||
ADD scylla_bashrc /scylla_bashrc
|
||||
RUN cat /scylla_bashrc >> /etc/bashrc
|
||||
|
||||
# Scylla configuration:
|
||||
ADD etc/sysconfig/scylla-server /etc/sysconfig/scylla-server
|
||||
|
||||
# Supervisord configuration:
|
||||
ADD etc/supervisord.conf /etc/supervisord.conf
|
||||
RUN mkdir -p /etc/supervisor.conf.d
|
||||
ADD etc/supervisord.conf.d/scylla-server.conf /etc/supervisord.conf.d/scylla-server.conf
|
||||
ADD etc/supervisord.conf.d/scylla-jmx.conf /etc/supervisord.conf.d/scylla-jmx.conf
|
||||
RUN mkdir -p /var/log/scylla
|
||||
ADD scylla-service.sh /scylla-service.sh
|
||||
ADD scylla-jmx-service.sh /scylla-jmx-service.sh
|
||||
|
||||
ADD scyllasetup.py /scyllasetup.py
|
||||
ADD commandlineparser.py /commandlineparser.py
|
||||
ADD docker-entrypoint.py /docker-entrypoint.py
|
||||
ENTRYPOINT ["/docker-entrypoint.py"]
|
||||
|
||||
USER scylla
|
||||
EXPOSE 10000 9042 9160 7000 7001
|
||||
VOLUME /var/lib/scylla
|
||||
|
||||
CMD /start-scylla && /bin/bash
|
||||
VOLUME [ "/var/lib/scylla" ]
|
||||
RUN chown -R scylla.scylla /var/lib/scylla
|
||||
|
||||
20
dist/docker/redhat/bashrc
vendored
20
dist/docker/redhat/bashrc
vendored
@@ -1,20 +0,0 @@
|
||||
echo
|
||||
echo ' _____ _ _ _____ ____ '
|
||||
echo ' / ____| | | | | __ \| _ \ '
|
||||
echo ' | (___ ___ _ _| | | __ _| | | | |_) |'
|
||||
echo ' \___ \ / __| | | | | |/ _` | | | | _ < '
|
||||
echo ' ____) | (__| |_| | | | (_| | |__| | |_) |'
|
||||
echo ' |_____/ \___|\__, |_|_|\__,_|_____/|____/ '
|
||||
echo ' __/ | '
|
||||
echo ' |___/ '
|
||||
echo ''
|
||||
echo ''
|
||||
echo 'Nodetool:'
|
||||
echo ' nodetool help'
|
||||
echo 'CQL Shell:'
|
||||
echo ' cqlsh'
|
||||
echo 'More documentation available at: '
|
||||
echo ' http://www.scylladb.com/doc/'
|
||||
echo
|
||||
|
||||
export CQLSH_HOST=$(hostname -i)
|
||||
14
dist/docker/redhat/commandlineparser.py
vendored
Normal file
14
dist/docker/redhat/commandlineparser.py
vendored
Normal file
@@ -0,0 +1,14 @@
|
||||
import argparse
|
||||
|
||||
|
||||
def parse():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--developer-mode', default='1', choices=['0', '1'], dest='developerMode')
|
||||
parser.add_argument('--seeds', default=None, help="specify seeds - if left empty will use container's own IP")
|
||||
parser.add_argument('--cpuset', default=None, help="e.g. --cpuset 0-3 for the first four CPUs")
|
||||
parser.add_argument('--smp', default=None, help="e.g --smp 2 to use two CPUs")
|
||||
parser.add_argument('--memory', default=None, help="e.g. --memory 1G to use 1 GB of RAM")
|
||||
parser.add_argument('--overprovisioned', default='0', choices=['0', '1'], help="run in overprovisioned environment")
|
||||
parser.add_argument('--broadcast-address', default=None, dest='broadcastAddress')
|
||||
parser.add_argument('--broadcast-rpc-address', default=None, dest='broadcastRpcAddress')
|
||||
return parser.parse_args()
|
||||
20
dist/docker/redhat/docker-entrypoint.py
vendored
Executable file
20
dist/docker/redhat/docker-entrypoint.py
vendored
Executable file
@@ -0,0 +1,20 @@
|
||||
#!/usr/bin/env python3
|
||||
import os
|
||||
import scyllasetup
|
||||
import logging
|
||||
import commandlineparser
|
||||
|
||||
logging.basicConfig(filename="/var/log/scylla/docker-entrypoint.log", level=logging.DEBUG, format="%(message)s")
|
||||
|
||||
try:
|
||||
arguments = commandlineparser.parse()
|
||||
setup = scyllasetup.ScyllaSetup(arguments)
|
||||
setup.developerMode()
|
||||
setup.cpuSet()
|
||||
setup.io()
|
||||
setup.scyllaYAML()
|
||||
setup.cqlshrc()
|
||||
setup.arguments()
|
||||
os.system("/usr/bin/supervisord -c /etc/supervisord.conf")
|
||||
except:
|
||||
logging.exception('failed!')
|
||||
13
dist/docker/redhat/etc/supervisord.conf
vendored
Normal file
13
dist/docker/redhat/etc/supervisord.conf
vendored
Normal file
@@ -0,0 +1,13 @@
|
||||
[supervisord]
|
||||
nodaemon=true
|
||||
|
||||
[inet_http_server]
|
||||
port = 127.0.0.1:9001
|
||||
|
||||
[rpcinterface:supervisor]
|
||||
supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
|
||||
|
||||
[supervisorctl]
|
||||
|
||||
[include]
|
||||
files = /etc/supervisord.conf.d/*.conf
|
||||
6
dist/docker/redhat/etc/supervisord.conf.d/scylla-jmx.conf
vendored
Normal file
6
dist/docker/redhat/etc/supervisord.conf.d/scylla-jmx.conf
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
[program:scylla-jmx]
|
||||
command=/scylla-jmx-service.sh
|
||||
stdout_logfile=/dev/stdout
|
||||
stdout_logfile_maxbytes=0
|
||||
stderr_logfile=/dev/stderr
|
||||
stderr_logfile_maxbytes=0
|
||||
6
dist/docker/redhat/etc/supervisord.conf.d/scylla-server.conf
vendored
Normal file
6
dist/docker/redhat/etc/supervisord.conf.d/scylla-server.conf
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
[program:scylla]
|
||||
command=/scylla-service.sh
|
||||
stdout_logfile=/dev/stdout
|
||||
stdout_logfile_maxbytes=0
|
||||
stderr_logfile=/dev/stderr
|
||||
stderr_logfile_maxbytes=0
|
||||
41
dist/docker/redhat/etc/sysconfig/scylla-server
vendored
Normal file
41
dist/docker/redhat/etc/sysconfig/scylla-server
vendored
Normal file
@@ -0,0 +1,41 @@
|
||||
# choose following mode: virtio, dpdk, posix
|
||||
NETWORK_MODE=posix
|
||||
|
||||
# tap device name(virtio)
|
||||
TAP=tap0
|
||||
|
||||
# bridge device name (virtio)
|
||||
BRIDGE=virbr0
|
||||
|
||||
# ethernet device name
|
||||
IFNAME=eth0
|
||||
|
||||
# setup NIC's interrupts, RPS, XPS (posix)
|
||||
SET_NIC=no
|
||||
|
||||
# ethernet device driver (dpdk)
|
||||
ETHDRV=
|
||||
|
||||
# ethernet device PCI ID (dpdk)
|
||||
ETHPCIID=
|
||||
|
||||
# number of hugepages
|
||||
NR_HUGEPAGES=64
|
||||
|
||||
# user for process (must be root for dpdk)
|
||||
USER=scylla
|
||||
|
||||
# group for process
|
||||
GROUP=scylla
|
||||
|
||||
# scylla home dir
|
||||
SCYLLA_HOME=/var/lib/scylla
|
||||
|
||||
# scylla config dir
|
||||
SCYLLA_CONF=/etc/scylla
|
||||
|
||||
# scylla arguments
|
||||
SCYLLA_ARGS="--log-to-syslog 0 --log-to-stdout 1 --default-log-level info --collectd-address=127.0.0.1:25826 --collectd=1 --collectd-poll-period 3000 --network-stack posix"
|
||||
|
||||
# setup as AMI instance
|
||||
AMI=no
|
||||
5
dist/docker/redhat/scylla-jmx-service.sh
vendored
Executable file
5
dist/docker/redhat/scylla-jmx-service.sh
vendored
Executable file
@@ -0,0 +1,5 @@
|
||||
#!/bin/bash
|
||||
|
||||
source /etc/sysconfig/scylla-jmx
|
||||
|
||||
exec /usr/lib/scylla/jmx/scylla-jmx -l /usr/lib/scylla/jmx
|
||||
7
dist/docker/redhat/scylla-service.sh
vendored
Executable file
7
dist/docker/redhat/scylla-service.sh
vendored
Executable file
@@ -0,0 +1,7 @@
|
||||
#!/bin/bash
|
||||
|
||||
. /usr/lib/scylla/scylla_prepare
|
||||
|
||||
export SCYLLA_HOME SCYLLA_CONF
|
||||
|
||||
exec /usr/bin/scylla $SCYLLA_ARGS $SEASTAR_IO $DEV_MODE $CPUSET $SCYLLA_DOCKER_ARGS
|
||||
18
dist/docker/redhat/scylla_bashrc
vendored
Normal file
18
dist/docker/redhat/scylla_bashrc
vendored
Normal file
@@ -0,0 +1,18 @@
|
||||
echo > /dev/stderr
|
||||
echo ' _____ _ _ _____ ____ ' > /dev/stderr
|
||||
echo ' / ____| | | | | __ \| _ \ ' > /dev/stderr
|
||||
echo ' | (___ ___ _ _| | | __ _| | | | |_) |' > /dev/stderr
|
||||
echo ' \___ \ / __| | | | | |/ _` | | | | _ < ' > /dev/stderr
|
||||
echo ' ____) | (__| |_| | | | (_| | |__| | |_) |' > /dev/stderr
|
||||
echo ' |_____/ \___|\__, |_|_|\__,_|_____/|____/ ' > /dev/stderr
|
||||
echo ' __/ | ' > /dev/stderr
|
||||
echo ' |___/ ' > /dev/stderr
|
||||
echo '' > /dev/stderr
|
||||
echo '' > /dev/stderr
|
||||
echo 'Nodetool:' > /dev/stderr
|
||||
echo ' nodetool help' > /dev/stderr
|
||||
echo 'CQL Shell:' > /dev/stderr
|
||||
echo ' cqlsh' > /dev/stderr
|
||||
echo 'More documentation available at: ' > /dev/stderr
|
||||
echo ' http://www.scylladb.com/doc/' > /dev/stderr
|
||||
echo > /dev/stderr
|
||||
68
dist/docker/redhat/scyllasetup.py
vendored
Normal file
68
dist/docker/redhat/scyllasetup.py
vendored
Normal file
@@ -0,0 +1,68 @@
|
||||
import subprocess
|
||||
import logging
|
||||
import yaml
|
||||
import os
|
||||
|
||||
class ScyllaSetup:
|
||||
def __init__(self, arguments):
|
||||
self._developerMode = arguments.developerMode
|
||||
self._seeds = arguments.seeds
|
||||
self._cpuset = arguments.cpuset
|
||||
self._broadcastAddress = arguments.broadcastAddress
|
||||
self._broadcastRpcAddress = arguments.broadcastRpcAddress
|
||||
self._smp = arguments.smp
|
||||
self._memory = arguments.memory
|
||||
self._overprovisioned = arguments.overprovisioned
|
||||
|
||||
def _run(self, *args, **kwargs):
|
||||
logging.info('running: {}'.format(args))
|
||||
subprocess.check_call(*args, **kwargs)
|
||||
|
||||
def developerMode(self):
|
||||
self._run(['/usr/lib/scylla/scylla_dev_mode_setup', '--developer-mode', self._developerMode])
|
||||
|
||||
def cpuSet(self):
|
||||
if self._cpuset is None:
|
||||
return
|
||||
self._run(['/usr/lib/scylla/scylla_cpuset_setup', '--cpuset', self._cpuset])
|
||||
|
||||
def io(self):
|
||||
self._run(['/usr/lib/scylla/scylla_io_setup'])
|
||||
|
||||
def scyllaYAML(self):
|
||||
configuration = yaml.load(open('/etc/scylla/scylla.yaml'))
|
||||
IP = subprocess.check_output(['hostname', '-i']).decode('ascii').strip()
|
||||
configuration['listen_address'] = IP
|
||||
configuration['rpc_address'] = IP
|
||||
if self._seeds is None:
|
||||
if self._broadcastAddress is not None:
|
||||
self._seeds = self._broadcastAddress
|
||||
else:
|
||||
self._seeds = IP
|
||||
configuration['seed_provider'] = [
|
||||
{'class_name': 'org.apache.cassandra.locator.SimpleSeedProvider',
|
||||
'parameters': [{'seeds': self._seeds}]}
|
||||
]
|
||||
if self._broadcastAddress is not None:
|
||||
configuration['broadcast_address'] = self._broadcastAddress
|
||||
if self._broadcastRpcAddress is not None:
|
||||
configuration['broadcast_rpc_address'] = self._broadcastRpcAddress
|
||||
with open('/etc/scylla/scylla.yaml', 'w') as file:
|
||||
yaml.dump(configuration, file)
|
||||
|
||||
def cqlshrc(self):
|
||||
home = os.environ['HOME']
|
||||
hostname = subprocess.check_output(['hostname', '-i']).decode('ascii').strip()
|
||||
with open("%s/.cqlshrc" % home, "w") as cqlshrc:
|
||||
cqlshrc.write("[connection]\nhostname = %s\n" % hostname)
|
||||
|
||||
def arguments(self):
|
||||
args = ""
|
||||
if self._memory is not None:
|
||||
args += "--memory %s" % self._memory
|
||||
if self._smp is not None:
|
||||
args += " --smp %s" % self._smp
|
||||
if self._overprovisioned == "1":
|
||||
args += " --overprovisioned"
|
||||
with open("/etc/scylla.d/docker.conf", "w") as cqlshrc:
|
||||
cqlshrc.write("SCYLLA_DOCKER_ARGS=\"%s\"\n" % args)
|
||||
54
dist/docker/redhat/start-scylla
vendored
54
dist/docker/redhat/start-scylla
vendored
@@ -1,54 +0,0 @@
|
||||
#!/bin/sh
|
||||
|
||||
. /etc/sysconfig/scylla-server
|
||||
|
||||
CPUSET=""
|
||||
if [ x"$SCYLLA_CPU_SET" != "x" ]; then
|
||||
CPUSET="--cpuset $SCYLLA_CPU_SET"
|
||||
fi
|
||||
|
||||
if [ "$SCYLLA_PRODUCTION" == "true" ]; then
|
||||
DEV_MODE=""
|
||||
if [ ! -f /var/lib/scylla/.io_setup_done ]; then
|
||||
DATA_DIR=`/usr/lib/scylla/scylla_config_get.py --config $SCYLLA_CONF/scylla.yaml --get data_file_directories|head -n1`
|
||||
iotune --evaluation-directory $DATA_DIR --format envfile --options-file /var/lib/scylla/io.conf $CPUSET --timeout 600
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "/var/lib/scylla did not pass validation tests, it may not be on XFS and/or has limited disk space."
|
||||
echo "This is a non-supported setup, please bind mount an XFS volume."
|
||||
exit 1
|
||||
fi
|
||||
touch /var/lib/scylla/.io_setup_done
|
||||
fi
|
||||
source /var/lib/scylla/io.conf
|
||||
else
|
||||
DEV_MODE="--developer-mode true"
|
||||
fi
|
||||
|
||||
IP=$(hostname -i)
|
||||
|
||||
if [ x"$SCYLLA_SEEDS" != "x" ];then
|
||||
SEEDS="$SCYLLA_SEEDS"
|
||||
else
|
||||
SEEDS="$IP"
|
||||
fi
|
||||
|
||||
sed -e "s/seeds:.*/seeds: $SEEDS/g" /var/lib/scylla/conf/scylla.yaml > $HOME/scylla.yaml
|
||||
|
||||
if [ x"$SCYLLA_BROADCAST_ADDRESS" != "x" ];then
|
||||
sed -i "s/.*broadcast_address:.*/broadcast_address: $SCYLLA_BROADCAST_ADDRESS/g" $HOME/scylla.yaml
|
||||
fi
|
||||
|
||||
/usr/bin/scylla --log-to-syslog 1 \
|
||||
--log-to-stdout 0 \
|
||||
$DEV_MODE \
|
||||
$SEASTAR_IO \
|
||||
$CPUSET \
|
||||
--default-log-level info \
|
||||
--options-file $HOME/scylla.yaml \
|
||||
--listen-address $IP \
|
||||
--rpc-address $IP \
|
||||
--network-stack posix &> /dev/null &
|
||||
|
||||
source /etc/sysconfig/scylla-jmx
|
||||
export SCYLLA_HOME SCYLLA_CONF
|
||||
exec /usr/lib/scylla/jmx/scylla-jmx -l /usr/lib/scylla/jmx &> /dev/null &
|
||||
13
dist/redhat/build_rpm.sh
vendored
13
dist/redhat/build_rpm.sh
vendored
@@ -5,10 +5,12 @@ print_usage() {
|
||||
echo "build_rpm.sh --rebuild-dep --jobs 2"
|
||||
echo " --rebuild-dep rebuild dependency packages (CentOS)"
|
||||
echo " --jobs specify number of jobs"
|
||||
echo " --dist create a public distribution rpm"
|
||||
exit 1
|
||||
}
|
||||
REBUILD=0
|
||||
JOBS=0
|
||||
DIST=0
|
||||
while [ $# -gt 0 ]; do
|
||||
case "$1" in
|
||||
"--rebuild-dep")
|
||||
@@ -19,6 +21,10 @@ while [ $# -gt 0 ]; do
|
||||
JOBS=$2
|
||||
shift 2
|
||||
;;
|
||||
"--dist")
|
||||
DIST=1
|
||||
shift 1
|
||||
;;
|
||||
*)
|
||||
print_usage
|
||||
;;
|
||||
@@ -62,6 +68,13 @@ rm -f version
|
||||
cp dist/redhat/scylla.spec.in $RPMBUILD/SPECS/scylla.spec
|
||||
sed -i -e "s/@@VERSION@@/$SCYLLA_VERSION/g" $RPMBUILD/SPECS/scylla.spec
|
||||
sed -i -e "s/@@RELEASE@@/$SCYLLA_RELEASE/g" $RPMBUILD/SPECS/scylla.spec
|
||||
|
||||
if [ $DIST -gt 0 ]; then
|
||||
sed -i -e "s/@@HOUSEKEEPING_CONF@@/true/g" $RPMBUILD/SPECS/scylla.spec
|
||||
else
|
||||
sed -i -e "s/@@HOUSEKEEPING_CONF@@/false/g" $RPMBUILD/SPECS/scylla.spec
|
||||
fi
|
||||
|
||||
if [ "$ID" = "fedora" ]; then
|
||||
if [ $JOBS -gt 0 ]; then
|
||||
rpmbuild -bs --define "_topdir $RPMBUILD" --define "_smp_mflags -j$JOBS" $RPMBUILD/SPECS/scylla.spec
|
||||
|
||||
15
dist/redhat/scylla.spec.in
vendored
15
dist/redhat/scylla.spec.in
vendored
@@ -30,7 +30,7 @@ URL: http://www.scylladb.com/
|
||||
BuildRequires: libaio-devel libstdc++-devel cryptopp-devel hwloc-devel numactl-devel libpciaccess-devel libxml2-devel zlib-devel thrift-devel yaml-cpp-devel lz4-devel snappy-devel jsoncpp-devel systemd-devel xz-devel openssl-devel libcap-devel libselinux-devel libgcrypt-devel libgpg-error-devel elfutils-devel krb5-devel libcom_err-devel libattr-devel pcre-devel elfutils-libelf-devel bzip2-devel keyutils-libs-devel xfsprogs-devel make gnutls-devel systemd-devel lksctp-tools-devel
|
||||
%{?fedora:BuildRequires: boost-devel ninja-build ragel antlr3-tool antlr3-C++-devel python3 gcc-c++ libasan libubsan python3-pyparsing dnf-yum}
|
||||
%{?rhel:BuildRequires: scylla-libstdc++-static scylla-boost-devel scylla-ninja-build scylla-ragel scylla-antlr3-tool scylla-antlr3-C++-devel python34 scylla-gcc-c++ >= 5.1.1, python34-pyparsing}
|
||||
Requires: scylla-conf systemd-libs hwloc collectd PyYAML python-urwid
|
||||
Requires: scylla-conf systemd-libs hwloc collectd PyYAML python-urwid pyparsing python-requests curl
|
||||
Conflicts: abrt
|
||||
|
||||
%description server
|
||||
@@ -42,6 +42,7 @@ This package contains ScyllaDB server.
|
||||
%{nil}
|
||||
|
||||
%build
|
||||
%define is_housekeeping_conf %( if @@HOUSEKEEPING_CONF@@; then echo "1" ; else echo "0"; fi )
|
||||
%if 0%{?fedora}
|
||||
./configure.py --disable-xen --enable-dpdk --mode=release
|
||||
%endif
|
||||
@@ -50,8 +51,6 @@ python3.4 ./configure.py --disable-xen --enable-dpdk --mode=release --static-std
|
||||
%endif
|
||||
ninja-build %{?_smp_mflags} build/release/scylla build/release/iotune
|
||||
cp dist/common/systemd/scylla-server.service.in build/scylla-server.service
|
||||
cp dist/common/systemd/scylla-housekeeping.service.in build/scylla-housekeeping.service
|
||||
cp dist/common/systemd/scylla-housekeeping.timer.in build/scylla-housekeeping.timer
|
||||
sed -i -e "s#@@SYSCONFDIR@@#/etc/sysconfig#g" build/scylla-server.service
|
||||
|
||||
%install
|
||||
@@ -77,7 +76,8 @@ install -d -m755 $RPM_BUILD_ROOT%{_sysconfdir}/scylla
|
||||
install -m644 conf/scylla.yaml $RPM_BUILD_ROOT%{_sysconfdir}/scylla/
|
||||
install -m644 conf/cassandra-rackdc.properties $RPM_BUILD_ROOT%{_sysconfdir}/scylla/
|
||||
install -m644 build/*.service $RPM_BUILD_ROOT%{_unitdir}/
|
||||
install -m644 build/*.timer $RPM_BUILD_ROOT%{_unitdir}/
|
||||
install -m644 dist/common/systemd/*.service $RPM_BUILD_ROOT%{_unitdir}/
|
||||
install -m644 dist/common/systemd/*.timer $RPM_BUILD_ROOT%{_unitdir}/
|
||||
install -m755 dist/common/scripts/* $RPM_BUILD_ROOT%{_prefix}/lib/scylla/
|
||||
install -m755 seastar/scripts/posix_net_conf.sh $RPM_BUILD_ROOT%{_prefix}/lib/scylla/
|
||||
install -m755 seastar/dpdk/tools/dpdk_nic_bind.py $RPM_BUILD_ROOT%{_prefix}/lib/scylla/
|
||||
@@ -85,6 +85,9 @@ install -m755 build/release/scylla $RPM_BUILD_ROOT%{_bindir}
|
||||
install -m755 build/release/iotune $RPM_BUILD_ROOT%{_bindir}
|
||||
install -m755 dist/common/bin/scyllatop $RPM_BUILD_ROOT%{_bindir}
|
||||
install -m755 scylla-housekeeping $RPM_BUILD_ROOT%{_prefix}/lib/scylla/
|
||||
if @@HOUSEKEEPING_CONF@@; then
|
||||
install -m644 conf/housekeeping.cfg $RPM_BUILD_ROOT%{_sysconfdir}/scylla/
|
||||
fi
|
||||
install -d -m755 $RPM_BUILD_ROOT%{_docdir}/scylla
|
||||
install -m644 README.md $RPM_BUILD_ROOT%{_docdir}/scylla/
|
||||
install -m644 README-DPDK.md $RPM_BUILD_ROOT%{_docdir}/scylla/
|
||||
@@ -202,6 +205,10 @@ mv /tmp/scylla.yaml /etc/scylla/scylla.yaml
|
||||
%attr(0755,root,root) %dir %{_sysconfdir}/scylla
|
||||
%config(noreplace) %{_sysconfdir}/scylla/scylla.yaml
|
||||
%config(noreplace) %{_sysconfdir}/scylla/cassandra-rackdc.properties
|
||||
%if %is_housekeeping_conf
|
||||
%config(noreplace) %{_sysconfdir}/scylla/housekeeping.cfg
|
||||
%endif
|
||||
|
||||
|
||||
%package kernel-conf
|
||||
Group: Applications/Databases
|
||||
|
||||
15
dist/ubuntu/build_deb.sh
vendored
15
dist/ubuntu/build_deb.sh
vendored
@@ -2,16 +2,22 @@
|
||||
|
||||
print_usage() {
|
||||
echo "build_deb.sh --rebuild-dep"
|
||||
echo " --dist create a public distribution package"
|
||||
echo " --rebuild-dep rebuild dependency packages"
|
||||
exit 1
|
||||
}
|
||||
REBUILD=0
|
||||
DIST=0
|
||||
while [ $# -gt 0 ]; do
|
||||
case "$1" in
|
||||
"--rebuild-dep")
|
||||
REBUILD=1
|
||||
shift 1
|
||||
;;
|
||||
"--dist")
|
||||
DIST=1
|
||||
shift 1
|
||||
;;
|
||||
*)
|
||||
print_usage
|
||||
;;
|
||||
@@ -74,13 +80,19 @@ if [ "$RELEASE" = "14.04" ]; then
|
||||
sed -i -e "s/@@COMPILER@@/g++-5/g" debian/rules
|
||||
sed -i -e "s/@@BUILD_DEPENDS@@/g++-5/g" debian/control
|
||||
sed -i -e "s#@@INSTALL@@#dist/ubuntu/sudoers.d/scylla etc/sudoers.d#g" debian/scylla-server.install
|
||||
sed -i -e "s#@@HKDOTTIMER@@##g" debian/scylla-server.install
|
||||
else
|
||||
sed -i -e "s/@@DH_INSTALLINIT@@//g" debian/rules
|
||||
sed -i -e "s/@@COMPILER@@/g++/g" debian/rules
|
||||
sed -i -e "s/@@BUILD_DEPENDS@@/libsystemd-dev, g++/g" debian/control
|
||||
sed -i -e "s#@@INSTALL@@##g" debian/scylla-server.install
|
||||
sed -i -e "s#@@HKDOTTIMER@@#dist/common/systemd/scylla-housekeeping.timer /lib/systemd/system#g" debian/scylla-server.install
|
||||
fi
|
||||
if [ $DIST -gt 0 ]; then
|
||||
sed -i -e "s#@@ADDHKCFG@@#conf/housekeeping.cfg etc/scylla.d/#g" debian/scylla-server.install
|
||||
else
|
||||
sed -i -e "s#@@ADDHKCFG@@##g" debian/scylla-server.install
|
||||
fi
|
||||
|
||||
if [ "$DISTRIBUTION" = "Ubuntu" ]; then
|
||||
sed -i -e "s/@@DEPENDS@@/hugepages, /g" debian/control
|
||||
else
|
||||
@@ -89,6 +101,7 @@ fi
|
||||
|
||||
cp dist/common/systemd/scylla-server.service.in debian/scylla-server.service
|
||||
sed -i -e "s#@@SYSCONFDIR@@#/etc/default#g" debian/scylla-server.service
|
||||
cp dist/common/systemd/scylla-housekeeping.service debian/scylla-server.scylla-housekeeping.service
|
||||
|
||||
if [ "$RELEASE" = "14.04" ] && [ $REBUILD -eq 0 ]; then
|
||||
if [ ! -f /etc/apt/sources.list.d/scylla-3rdparty-trusty.list ]; then
|
||||
|
||||
2
dist/ubuntu/control.in
vendored
2
dist/ubuntu/control.in
vendored
@@ -16,7 +16,7 @@ Conflicts: scylla-server (<< 1.1)
|
||||
|
||||
Package: scylla-server
|
||||
Architecture: amd64
|
||||
Depends: ${shlibs:Depends}, ${misc:Depends}, adduser, hwloc-nox, collectd, scylla-conf, python-yaml, python-urwid, python3-requests, @@DEPENDS@@
|
||||
Depends: ${shlibs:Depends}, ${misc:Depends}, adduser, hwloc-nox, collectd, scylla-conf, python-yaml, python-urwid, python-requests, curl, @@DEPENDS@@
|
||||
Description: Scylla database server binaries
|
||||
Scylla is a highly scalable, eventually consistent, distributed,
|
||||
partitioned row DB.
|
||||
|
||||
2
dist/ubuntu/rules.in
vendored
2
dist/ubuntu/rules.in
vendored
@@ -11,7 +11,7 @@ override_dh_auto_clean:
|
||||
|
||||
override_dh_installinit:
|
||||
dh_installinit --no-start @@DH_INSTALLINIT@@
|
||||
dh_installinit --no-start --name scylla-timer @@DH_INSTALLINIT@@
|
||||
dh_installinit --no-start --name scylla-housekeeping @@DH_INSTALLINIT@@
|
||||
|
||||
override_dh_strip:
|
||||
dh_strip --dbg-package=scylla-server-dbg
|
||||
|
||||
2
dist/ubuntu/scylla-server.install.in
vendored
2
dist/ubuntu/scylla-server.install.in
vendored
@@ -14,4 +14,6 @@ build/release/scylla usr/bin
|
||||
build/release/iotune usr/bin
|
||||
dist/common/bin/scyllatop usr/bin
|
||||
dist/common/sbin/* usr/sbin
|
||||
@@ADDHKCFG@@
|
||||
@@HKDOTTIMER@@
|
||||
@@INSTALL@@
|
||||
|
||||
190
docs/docker-hub.md
Normal file
190
docs/docker-hub.md
Normal file
@@ -0,0 +1,190 @@
|
||||
# What is ScyllaDB ?
|
||||
|
||||
ScyllaDB is a high-performance Cassandra implementation written in C++14. Classified as a NoSQL database, ScyllaDB delivers a high number of transactions per second, making it one of the fastest databases on the planet. ScyllaDB is free and open-source software, released under the GNU Affero General Public License version 3 and the Apache License.
|
||||
|
||||
> [ScyllaDB](http://www.scylladb.com/)
|
||||
|
||||

|
||||
|
||||
# How to use this image
|
||||
|
||||
## Start a `scylla` server instance
|
||||
|
||||
```console
|
||||
$ docker run --name some-scylla -d scylladb/scylla
|
||||
```
|
||||
|
||||
## Run `nodetool` utility
|
||||
|
||||
```console
|
||||
$ docker exec -it some-scylla nodetool status
|
||||
Datacenter: datacenter1
|
||||
=======================
|
||||
Status=Up/Down
|
||||
|/ State=Normal/Leaving/Joining/Moving
|
||||
-- Address Load Tokens Owns (effective) Host ID Rack
|
||||
UN 172.17.0.2 125.51 KB 256 100.0% c9155121-786d-44f8-8667-a8b915b95665 rack1
|
||||
```
|
||||
|
||||
## Run `cqlsh` utility
|
||||
|
||||
```console
|
||||
$ docker exec -it some-scylla cqlsh
|
||||
Connected to Test Cluster at 172.17.0.2:9042.
|
||||
[cqlsh 5.0.1 | Cassandra 2.1.8 | CQL spec 3.2.1 | Native protocol v3]
|
||||
Use HELP for help.
|
||||
cqlsh>
|
||||
```
|
||||
|
||||
## Make a cluster
|
||||
|
||||
```console
|
||||
$ docker run --name some-scylla2 -d scylladb/scylla --seeds="$(docker inspect --format='{{ .NetworkSettings.IPAddress }}' some-scylla)"
|
||||
```
|
||||
|
||||
## Check `scylla` logs
|
||||
|
||||
```console
|
||||
$ docker logs some-scylla | tail
|
||||
INFO 2016-08-04 06:57:40,836 [shard 5] database - Setting compaction strategy of system_traces.events to SizeTieredCompactionStrategy
|
||||
INFO 2016-08-04 06:57:40,836 [shard 3] database - Setting compaction strategy of system_traces.events to SizeTieredCompactionStrategy
|
||||
INFO 2016-08-04 06:57:40,836 [shard 1] database - Setting compaction strategy of system_traces.events to SizeTieredCompactionStrategy
|
||||
INFO 2016-08-04 06:57:40,836 [shard 2] database - Setting compaction strategy of system_traces.events to SizeTieredCompactionStrategy
|
||||
INFO 2016-08-04 06:57:40,836 [shard 4] database - Setting compaction strategy of system_traces.events to SizeTieredCompactionStrategy
|
||||
INFO 2016-08-04 06:57:40,836 [shard 7] database - Setting compaction strategy of system_traces.events to SizeTieredCompactionStrategy
|
||||
INFO 2016-08-04 06:57:40,837 [shard 6] database - Setting compaction strategy of system_traces.events to SizeTieredCompactionStrategy
|
||||
INFO 2016-08-04 06:57:40,839 [shard 0] database - Schema version changed to fea14d93-9c5a-34f5-9d0e-2e49dcfa747e
|
||||
INFO 2016-08-04 06:57:40,839 [shard 0] storage_service - Starting listening for CQL clients on 172.17.0.2:9042...
|
||||
INFO 2016-08-04 06:57:40,840 [shard 0] storage_service - Thrift server listening on 172.17.0.2:9160 ...
|
||||
```
|
||||
|
||||
## Configuring data volume for storage
|
||||
|
||||
You can use Docker volumes to improve performance of Scylla.
|
||||
|
||||
Create a Scylla data directory ``/var/lib/scylla`` on the host, which is used by the Scylla container to store all data:
|
||||
|
||||
```console
|
||||
$ sudo mkdir -p /var/lib/scylla/data /var/lib/scylla/commitlog
|
||||
```
|
||||
|
||||
Launch Scylla using Docker's ``--volume`` command line option to mount the created host directory as a data volume in the container and disable Scylla's developer mode to run I/O tuning before starting up the Scylla node.
|
||||
|
||||
```console
|
||||
$ docker run --name some-scylla --volume /var/lib/scylla:/var/lib/scylla -d scylladb/scylla --developer-mode=0
|
||||
```
|
||||
|
||||
## Configuring resource limits
|
||||
|
||||
Scylla utilizes all CPUs and all memory by default.
|
||||
To configure resource limits for your Docker container, you can use the `--smp`, `--memory`, and `--cpuset` command line options documented in the section "Command-line options".
|
||||
|
||||
If you run multiple Scylla instances on the same machine, it is highly recommended that you enable the `--overprovisioned` command line option, which enables certain optimizations for Scylla to run efficiently in an overprovisioned environment.
|
||||
|
||||
## Command-line options
|
||||
|
||||
The Scylla image supports many command line options that are passed to the `docker run` command.
|
||||
|
||||
### `--seeds SEEDS`
|
||||
|
||||
The `--seeds` command line option configures Scylla's seed nodes.
|
||||
If no `--seeds` option is specified, Scylla uses its own IP address as the seed.
|
||||
|
||||
For example, to configure Scylla to run with two seed nodes `192.168.0.100` and `192.168.0.200`:
|
||||
|
||||
```console
|
||||
$ docker run --name some-scylla -d scylladb/scylla --seeds 192.168.0.100,192.168.0.200
|
||||
```
|
||||
|
||||
### `--broadcast-address ADDR`
|
||||
|
||||
The `--broadcast-address` command line option configures the IP address the Scylla instance tells other Scylla nodes in the cluster to connect to.
|
||||
|
||||
For example, to configure Scylla to use broadcast address `10.0.0.5`:
|
||||
|
||||
```console
|
||||
$ docker run --name some-scylla -d scylladb/scylla --broadcast-address 10.0.0.5
|
||||
```
|
||||
|
||||
### `--broadcast-rpc-address ADDR`
|
||||
|
||||
The `--broadcast-rpc-address` command line option configures the IP address the Scylla instance tells clients to connect to.
|
||||
|
||||
For example, to configure Scylla to use broadcast RPC address `10.0.0.5`:
|
||||
|
||||
```console
|
||||
$ docker run --name some-scylla -d scylladb/scylla --broadcast-rpc-address 10.0.0.5
|
||||
```
|
||||
|
||||
### `--smp COUNT`
|
||||
|
||||
The `--smp` command line option restricts Scylla to `COUNT` number of CPUs.
|
||||
The option does not, however, mandate a specific placement of CPUs.
|
||||
See the `--cpuset` command line option if you need Scylla to run on specific CPUs.
|
||||
|
||||
For example, to restrict Scylla to 2 CPUs:
|
||||
|
||||
```console
|
||||
$ docker run --name some-scylla -d scylladb/scylla --smp 2
|
||||
```
|
||||
|
||||
### `--memory AMOUNT`
|
||||
|
||||
The `--memory` command line option restricts Scylla to use up to `AMOUNT` of memory.
|
||||
The `AMOUNT` value supports both `M` unit for megabytes and `G` unit for gigabytes.
|
||||
|
||||
For example, to restrict Scylla to 4 GB of memory:
|
||||
|
||||
```console
|
||||
$ docker run --name some-scylla -d scylladb/scylla --memory 4G
|
||||
```
|
||||
|
||||
### `--overprovisioned ENABLE`
|
||||
|
||||
The `--overprovisioned` command line option enables or disables optimizations for running Scylla in an overprovisioned environment.
|
||||
If no `--overprovisioned` option is specified, Scylla defaults to running with optimizations *disabled*.
|
||||
|
||||
For example, to enable optimizations for running in an overprovisioned environment:
|
||||
|
||||
```console
|
||||
$ docker run --name some-scylla -d scylladb/scylla --overprovisioned 1
|
||||
```
|
||||
|
||||
### `--cpuset CPUSET`
|
||||
|
||||
The `--cpuset` command line option restricts Scylla to run only on the CPUs specified by `CPUSET`.
|
||||
The `CPUSET` value is a single CPU (e.g. `--cpuset 1`), a range (e.g. `--cpuset 2-3`), a list (e.g. `--cpuset 1,2,5`), or a combination of the last two options (e.g. `--cpuset 1-2,5`).
|
||||
|
||||
For example, to restrict Scylla to run on physical CPUs 0 to 2 and 4:
|
||||
|
||||
```console
|
||||
$ docker run --name some-scylla -d scylladb/scylla --cpuset 0-2,4
|
||||
```
|
||||
|
||||
### `--developer-mode ENABLE`
|
||||
|
||||
The `--developer-mode` command line option enables Scylla's developer mode, which relaxes checks for things like XFS and enables Scylla to run on unsupported configurations (which usually results in suboptimal performance).
|
||||
If no `--developer-mode` command line option is defined, Scylla defaults to running with developer mode *enabled*.
|
||||
It is highly recommended to disable developer mode for production deployments to ensure Scylla is able to run with maximum performance.
|
||||
|
||||
For example, to disable developer mode:
|
||||
|
||||
```console
|
||||
$ docker run --name some-scylla -d scylladb/scylla --developer-mode 0
|
||||
```
|
||||
|
||||
# User Feedback
|
||||
|
||||
## Issues
|
||||
|
||||
For bug reports, please use Scylla's [issue tracker](https://github.com/scylladb/scylla/issues) on GitHub.
|
||||
Please read the [How to report a Scylla problem](https://github.com/scylladb/scylla/wiki/How-to-report-a-Scylla-problem) page before you report bugs.
|
||||
|
||||
For general help, see Scylla's [documentation](http://www.scylladb.com/doc/).
|
||||
For questions and comments, use Scylla's [mailing lists](http://www.scylladb.com/community/).
|
||||
|
||||
## Contributing
|
||||
|
||||
Want to scratch your own itch and contribute a patch?
|
||||
We are eager to review and merge your code.
|
||||
Please consult the [Contributing on Scylla page](http://www.scylladb.com/kb/contributing/).
|
||||
@@ -55,6 +55,7 @@
|
||||
#include "log.hh"
|
||||
#include <seastar/core/sleep.hh>
|
||||
#include <seastar/core/thread.hh>
|
||||
#include <seastar/core/scollectd.hh>
|
||||
#include <chrono>
|
||||
#include "dht/i_partitioner.hh"
|
||||
#include <boost/range/algorithm/set_algorithm.hpp>
|
||||
@@ -112,6 +113,19 @@ gossiper::gossiper() {
|
||||
/* register with the Failure Detector for receiving Failure detector events */
|
||||
get_local_failure_detector().register_failure_detection_event_listener(this);
|
||||
// Register this instance with JMX
|
||||
_collectd_registrations = std::make_unique<scollectd::registrations>(setup_collectd());
|
||||
}
|
||||
|
||||
scollectd::registrations
|
||||
gossiper::setup_collectd() {
|
||||
auto ep = get_broadcast_address();
|
||||
return {
|
||||
scollectd::add_polled_metric(
|
||||
scollectd::type_instance_id("gossip", scollectd::per_cpu_plugin_instance,
|
||||
"derive", "heart_beat_version"),
|
||||
scollectd::make_typed(scollectd::data_type::DERIVE, [ep, this] {
|
||||
return this->endpoint_state_map.at(ep).get_heart_beat_state().get_heart_beat_version(); })),
|
||||
};
|
||||
}
|
||||
|
||||
void gossiper::set_last_processed_message_at() {
|
||||
|
||||
@@ -56,6 +56,7 @@
|
||||
#include <chrono>
|
||||
#include <set>
|
||||
#include <seastar/core/condition-variable.hh>
|
||||
#include <seastar/core/scollectd.hh>
|
||||
|
||||
namespace gms {
|
||||
|
||||
@@ -537,6 +538,9 @@ private:
|
||||
void register_feature(feature* f);
|
||||
void unregister_feature(feature* f);
|
||||
void maybe_enable_features();
|
||||
private:
|
||||
std::unique_ptr<scollectd::registrations> _collectd_registrations;
|
||||
scollectd::registrations setup_collectd();
|
||||
};
|
||||
|
||||
extern distributed<gossiper> _the_gossiper;
|
||||
|
||||
105
mutation.cc
105
mutation.cc
@@ -213,51 +213,90 @@ mutation& mutation::operator=(const mutation& m) {
|
||||
return *this = mutation(m);
|
||||
}
|
||||
|
||||
future<mutation_opt> mutation_from_streamed_mutation(streamed_mutation_opt sm)
|
||||
{
|
||||
class rebuilder {
|
||||
mutation& _m;
|
||||
public:
|
||||
rebuilder(mutation& m) : _m(m) { }
|
||||
enum class limit_mutation_size { yes, no };
|
||||
|
||||
stop_iteration consume(tombstone t) {
|
||||
_m.partition().apply(t);
|
||||
return stop_iteration::no;
|
||||
template <limit_mutation_size with_limit>
|
||||
class mutation_rebuilder {
|
||||
mutation _m;
|
||||
streamed_mutation& _sm;
|
||||
size_t _remaining_limit;
|
||||
|
||||
template <typename T> bool check_remaining_limit(const T& e) {
|
||||
if (with_limit == limit_mutation_size::no) {
|
||||
return true;
|
||||
}
|
||||
|
||||
stop_iteration consume(static_row&& sr) {
|
||||
_m.partition().static_row().apply(*_m.schema(), column_kind::static_column, std::move(sr.cells()));
|
||||
return stop_iteration::no;
|
||||
size_t size = e.memory_usage();
|
||||
if (_remaining_limit <= size) {
|
||||
_remaining_limit = 0;
|
||||
} else {
|
||||
_remaining_limit -= size;
|
||||
}
|
||||
return _remaining_limit > 0;
|
||||
}
|
||||
public:
|
||||
mutation_rebuilder(streamed_mutation& sm)
|
||||
: _m(sm.decorated_key(), sm.schema()), _sm(sm), _remaining_limit(0) {
|
||||
static_assert(with_limit == limit_mutation_size::no,
|
||||
"This constructor should be used only for mutation_rebuildeer with no limit");
|
||||
}
|
||||
mutation_rebuilder(streamed_mutation& sm, size_t limit)
|
||||
: _m(sm.decorated_key(), sm.schema()), _sm(sm), _remaining_limit(limit) {
|
||||
static_assert(with_limit == limit_mutation_size::yes,
|
||||
"This constructor should be used only for mutation_rebuildeer with limit");
|
||||
check_remaining_limit(_m.key());
|
||||
}
|
||||
|
||||
stop_iteration consume(range_tombstone&& rt) {
|
||||
_m.partition().apply_row_tombstone(*_m.schema(), std::move(rt));
|
||||
return stop_iteration::no;
|
||||
stop_iteration consume(tombstone t) {
|
||||
_m.partition().apply(t);
|
||||
return stop_iteration::no;
|
||||
}
|
||||
|
||||
stop_iteration consume(range_tombstone&& rt) {
|
||||
if (!check_remaining_limit(rt)) {
|
||||
return stop_iteration::yes;
|
||||
}
|
||||
_m.partition().apply_row_tombstone(*_m.schema(), std::move(rt));
|
||||
return stop_iteration::no;
|
||||
}
|
||||
|
||||
stop_iteration consume(clustering_row&& cr) {
|
||||
auto& dr = _m.partition().clustered_row(std::move(cr.key()));
|
||||
dr.apply(cr.tomb());
|
||||
dr.apply(cr.marker());
|
||||
dr.cells().apply(*_m.schema(), column_kind::regular_column, std::move(cr.cells()));
|
||||
return stop_iteration::no;
|
||||
stop_iteration consume(static_row&& sr) {
|
||||
if (!check_remaining_limit(sr)) {
|
||||
return stop_iteration::yes;
|
||||
}
|
||||
_m.partition().static_row().apply(*_m.schema(), column_kind::static_column, std::move(sr.cells()));
|
||||
return stop_iteration::no;
|
||||
}
|
||||
|
||||
void consume_end_of_stream() { }
|
||||
};
|
||||
stop_iteration consume(clustering_row&& cr) {
|
||||
if (!check_remaining_limit(cr)) {
|
||||
return stop_iteration::yes;
|
||||
}
|
||||
auto& dr = _m.partition().clustered_row(std::move(cr.key()));
|
||||
dr.apply(cr.tomb());
|
||||
dr.apply(cr.marker());
|
||||
dr.cells().apply(*_m.schema(), column_kind::regular_column, std::move(cr.cells()));
|
||||
return stop_iteration::no;
|
||||
}
|
||||
|
||||
struct data {
|
||||
mutation m;
|
||||
streamed_mutation sm;
|
||||
};
|
||||
mutation_opt consume_end_of_stream() {
|
||||
return with_limit == limit_mutation_size::yes && _remaining_limit == 0 ? mutation_opt()
|
||||
: mutation_opt(std::move(_m));
|
||||
}
|
||||
};
|
||||
|
||||
future<mutation_opt>
|
||||
mutation_from_streamed_mutation_with_limit(streamed_mutation sm, size_t limit) {
|
||||
return do_with(std::move(sm), [limit] (auto& sm) {
|
||||
return consume(sm, mutation_rebuilder<limit_mutation_size::yes>(sm, limit));
|
||||
});
|
||||
}
|
||||
|
||||
future<mutation_opt> mutation_from_streamed_mutation(streamed_mutation_opt sm) {
|
||||
if (!sm) {
|
||||
return make_ready_future<mutation_opt>();
|
||||
}
|
||||
mutation m(sm->decorated_key(), sm->schema());
|
||||
return do_with(data { std::move(m), std::move(*sm) }, [] (auto& d) {
|
||||
return consume(d.sm, rebuilder(d.m)).then([&d] {
|
||||
return mutation_opt(std::move(d.m));
|
||||
});
|
||||
return do_with(std::move(*sm), [] (auto& sm) {
|
||||
return consume(sm, mutation_rebuilder<limit_mutation_size::no>(sm));
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -182,3 +182,5 @@ boost::iterator_range<std::vector<mutation>::const_iterator> slice(
|
||||
const query::partition_range&);
|
||||
|
||||
future<mutation_opt> mutation_from_streamed_mutation(streamed_mutation_opt sm);
|
||||
future<mutation_opt>
|
||||
mutation_from_streamed_mutation_with_limit(streamed_mutation sm, size_t limit);
|
||||
|
||||
@@ -231,7 +231,7 @@ public:
|
||||
}
|
||||
|
||||
_row_limit -= _rows_in_current_partition;
|
||||
_partition_limit -= 1;
|
||||
_partition_limit -= _rows_in_current_partition > 0;
|
||||
_consumer.consume_end_of_partition();
|
||||
if (!sstable_compaction()) {
|
||||
return _row_limit && _partition_limit ? stop_iteration::no : stop_iteration::yes;
|
||||
@@ -253,4 +253,4 @@ struct compact_for_query : compact_mutation<only_live, compact_for_sstables::no,
|
||||
template<typename CompactedMutationsConsumer>
|
||||
struct compact_for_compaction : compact_mutation<emit_only_live_rows::no, compact_for_sstables::yes, CompactedMutationsConsumer> {
|
||||
using compact_mutation<emit_only_live_rows::no, compact_for_sstables::yes, CompactedMutationsConsumer>::compact_mutation;
|
||||
};
|
||||
};
|
||||
|
||||
@@ -1780,8 +1780,9 @@ public:
|
||||
}
|
||||
|
||||
void consume_end_of_partition() {
|
||||
_live_rows += _mutation_consumer->consume_end_of_stream();
|
||||
_partitions += 1;
|
||||
auto live_rows_in_partition = _mutation_consumer->consume_end_of_stream();
|
||||
_live_rows += live_rows_in_partition;
|
||||
_partitions += live_rows_in_partition > 0;
|
||||
}
|
||||
|
||||
data_query_result consume_end_of_stream() {
|
||||
|
||||
@@ -33,6 +33,13 @@
|
||||
// marking the end of iteration. After calling mutation_reader's operator(),
|
||||
// caller must keep the object alive until the returned future is fulfilled.
|
||||
//
|
||||
// streamed_mutation object emitted by mutation_reader remains valid after the
|
||||
// destruction of the mutation_reader.
|
||||
//
|
||||
// Asking mutation_reader for another streamed_mutation (i.e. invoking
|
||||
// mutation_reader::operator()) invalidates all streamed_mutation objects
|
||||
// previously produced by that reader.
|
||||
//
|
||||
// The mutations returned have strictly monotonically increasing keys. Two
|
||||
// consecutive mutations never have equal keys.
|
||||
//
|
||||
|
||||
@@ -62,29 +62,37 @@ partition_version::~partition_version()
|
||||
}
|
||||
|
||||
partition_snapshot::~partition_snapshot() {
|
||||
if (_version) {
|
||||
if (_version && _version.is_unique_owner()) {
|
||||
auto v = &*_version;
|
||||
if (_version.is_unique_owner()) {
|
||||
_version = { };
|
||||
remove_or_mark_as_unique_owner(v);
|
||||
} else {
|
||||
_version = { };
|
||||
auto first_used = v;
|
||||
while (first_used->prev() && !first_used->is_referenced()) {
|
||||
first_used = first_used->prev();
|
||||
}
|
||||
_version = {};
|
||||
remove_or_mark_as_unique_owner(v);
|
||||
} else if (_entry) {
|
||||
_entry->_snapshot = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
auto current = first_used->next();
|
||||
while (current && !current->is_referenced()) {
|
||||
auto next = current->next();
|
||||
void partition_snapshot::merge_partition_versions() {
|
||||
if (_version && !_version.is_unique_owner()) {
|
||||
auto v = &*_version;
|
||||
_version = { };
|
||||
auto first_used = v;
|
||||
while (first_used->prev() && !first_used->is_referenced()) {
|
||||
first_used = first_used->prev();
|
||||
}
|
||||
|
||||
auto current = first_used->next();
|
||||
while (current && !current->is_referenced()) {
|
||||
auto next = current->next();
|
||||
try {
|
||||
first_used->partition().apply(*_schema, std::move(current->partition()));
|
||||
current_allocator().destroy(current);
|
||||
current = next;
|
||||
} catch (...) {
|
||||
// Set _version so that the merge can be retried.
|
||||
_version = partition_version_ref(*current);
|
||||
throw;
|
||||
}
|
||||
current = next;
|
||||
}
|
||||
} else {
|
||||
assert(_entry);
|
||||
_entry->_snapshot = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -308,9 +316,20 @@ partition_snapshot_reader::partition_snapshot_reader(schema_ptr s, dht::decorate
|
||||
|
||||
partition_snapshot_reader::~partition_snapshot_reader()
|
||||
{
|
||||
if (!_snapshot.owned()) {
|
||||
return;
|
||||
}
|
||||
// If no one else is using this particular snapshot try to merge partition
|
||||
// versions.
|
||||
with_allocator(_lsa_region.allocator(), [this] {
|
||||
logalloc::reclaim_lock _(_lsa_region);
|
||||
_snapshot = { };
|
||||
return with_linearized_managed_bytes([this] {
|
||||
try {
|
||||
_read_section(_lsa_region, [this] {
|
||||
_snapshot->merge_partition_versions();
|
||||
_snapshot = {};
|
||||
});
|
||||
} catch (...) { }
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -192,6 +192,11 @@ public:
|
||||
partition_snapshot& operator=(const partition_snapshot&) = delete;
|
||||
partition_snapshot& operator=(partition_snapshot&&) = delete;
|
||||
|
||||
// If possible merges the version pointed to by this snapshot with
|
||||
// adjacent partition versions. Leaves the snapshot in an unspecified state.
|
||||
// Can be retried if previous merge attempt has failed.
|
||||
void merge_partition_versions();
|
||||
|
||||
~partition_snapshot();
|
||||
|
||||
partition_version_ref& version();
|
||||
|
||||
@@ -170,7 +170,7 @@ public:
|
||||
uint32_t row_limit;
|
||||
gc_clock::time_point timestamp;
|
||||
std::experimental::optional<tracing::trace_info> trace_info;
|
||||
uint32_t partition_limit;
|
||||
uint32_t partition_limit; // The maximum number of live partitions to return.
|
||||
api::timestamp_type read_timestamp; // not serialized
|
||||
public:
|
||||
read_command(utils::UUID cf_id,
|
||||
|
||||
@@ -294,6 +294,11 @@ public:
|
||||
return _current_tombstone;
|
||||
}
|
||||
|
||||
const std::deque<range_tombstone>& range_tombstones_for_row(const clustering_key_prefix& ck) {
|
||||
drop_unneeded_tombstones(ck);
|
||||
return _range_tombstones;
|
||||
}
|
||||
|
||||
void apply(const range_tombstone& rt);
|
||||
|
||||
void clear();
|
||||
|
||||
265
row_cache.cc
265
row_cache.cc
@@ -38,6 +38,22 @@ static logging::logger logger("cache");
|
||||
|
||||
thread_local seastar::thread_scheduling_group row_cache::_update_thread_scheduling_group(1ms, 0.2);
|
||||
|
||||
enum class is_wide_partition { yes, no };
|
||||
|
||||
future<is_wide_partition, mutation_opt>
|
||||
try_to_read(uint64_t max_cached_partition_size_in_bytes, streamed_mutation_opt&& sm) {
|
||||
if (!sm) {
|
||||
return make_ready_future<is_wide_partition, mutation_opt>(is_wide_partition::no, mutation_opt());
|
||||
}
|
||||
return mutation_from_streamed_mutation_with_limit(std::move(*sm), max_cached_partition_size_in_bytes).then(
|
||||
[] (mutation_opt&& omo) mutable {
|
||||
if (omo) {
|
||||
return make_ready_future<is_wide_partition, mutation_opt>(is_wide_partition::no, std::move(omo));
|
||||
} else {
|
||||
return make_ready_future<is_wide_partition, mutation_opt>(is_wide_partition::yes, mutation_opt());
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
cache_tracker& global_cache_tracker() {
|
||||
static thread_local cache_tracker instance;
|
||||
@@ -59,7 +75,7 @@ cache_tracker::cache_tracker() {
|
||||
cache_entry& ce = _lru.back();
|
||||
auto it = row_cache::partitions_type::s_iterator_to(ce);
|
||||
--it;
|
||||
it->set_continuous(false);
|
||||
clear_continuity(*it);
|
||||
_lru.pop_back_and_dispose(current_deleter<cache_entry>());
|
||||
--_partitions;
|
||||
++_evictions;
|
||||
@@ -103,6 +119,11 @@ cache_tracker::setup_collectd() {
|
||||
, "total_operations", "misses")
|
||||
, scollectd::make_typed(scollectd::data_type::DERIVE, _misses)
|
||||
),
|
||||
scollectd::add_polled_metric(scollectd::type_instance_id("cache"
|
||||
, scollectd::per_cpu_plugin_instance
|
||||
, "total_operations", "uncached_wide_partitions")
|
||||
, scollectd::make_typed(scollectd::data_type::DERIVE, _uncached_wide_partitions)
|
||||
),
|
||||
scollectd::add_polled_metric(scollectd::type_instance_id("cache"
|
||||
, scollectd::per_cpu_plugin_instance
|
||||
, "total_operations", "insertions")
|
||||
@@ -142,7 +163,7 @@ void cache_tracker::clear() {
|
||||
_lru.erase(_lru.iterator_to(to_remove));
|
||||
current_deleter<cache_entry>()(&to_remove);
|
||||
}
|
||||
it->set_continuous(false);
|
||||
clear_continuity(*it);
|
||||
}
|
||||
});
|
||||
_removals += _partitions;
|
||||
@@ -180,6 +201,14 @@ void cache_tracker::on_miss() {
|
||||
++_misses;
|
||||
}
|
||||
|
||||
void cache_tracker::on_uncached_wide_partition() {
|
||||
++_uncached_wide_partitions;
|
||||
}
|
||||
|
||||
void cache_tracker::on_continuity_flag_cleared() {
|
||||
++_continuity_flags_cleared;
|
||||
}
|
||||
|
||||
allocation_strategy& cache_tracker::allocator() {
|
||||
return _region.allocator();
|
||||
}
|
||||
@@ -196,33 +225,60 @@ const logalloc::region& cache_tracker::region() const {
|
||||
class single_partition_populating_reader final : public mutation_reader::impl {
|
||||
schema_ptr _schema;
|
||||
row_cache& _cache;
|
||||
mutation_source& _underlying;
|
||||
mutation_reader _delegate;
|
||||
const io_priority_class _pc;
|
||||
query::clustering_key_filtering_context _ck_filtering;
|
||||
query::partition_range _large_partition_range;
|
||||
mutation_reader _large_partition_reader;
|
||||
public:
|
||||
single_partition_populating_reader(schema_ptr s, row_cache& cache, mutation_reader delegate, query::clustering_key_filtering_context ck_filtering)
|
||||
single_partition_populating_reader(schema_ptr s, row_cache& cache, mutation_source& underlying,
|
||||
mutation_reader delegate, const io_priority_class pc, query::clustering_key_filtering_context ck_filtering)
|
||||
: _schema(std::move(s))
|
||||
, _cache(cache)
|
||||
, _underlying(underlying)
|
||||
, _delegate(std::move(delegate))
|
||||
, _pc(pc)
|
||||
, _ck_filtering(ck_filtering)
|
||||
{ }
|
||||
|
||||
virtual future<streamed_mutation_opt> operator()() override {
|
||||
return _delegate().then([] (auto sm) {
|
||||
return mutation_from_streamed_mutation(std::move(sm));
|
||||
}).then([this, op = _cache._populate_phaser.start()] (mutation_opt&& mo) -> streamed_mutation_opt {
|
||||
if (mo) {
|
||||
_cache.populate(*mo);
|
||||
mo->upgrade(_schema);
|
||||
auto& ck_ranges = _ck_filtering.get_ranges(mo->key());
|
||||
auto filtered_partition = mutation_partition(std::move(mo->partition()), *(mo->schema()), ck_ranges);
|
||||
mo->partition() = std::move(filtered_partition);
|
||||
return streamed_mutation_from_mutation(std::move(*mo));
|
||||
auto op = _cache._populate_phaser.start();
|
||||
return _delegate().then([this, op = std::move(op)] (auto sm) mutable {
|
||||
if (!sm) {
|
||||
return make_ready_future<streamed_mutation_opt>(streamed_mutation_opt());
|
||||
}
|
||||
return { };
|
||||
dht::decorated_key dk = sm->decorated_key();
|
||||
return try_to_read(_cache._max_cached_partition_size_in_bytes, std::move(sm)).then(
|
||||
[this, op = std::move(op), dk = std::move(dk)]
|
||||
(is_wide_partition wide_partition, mutation_opt&& mo) {
|
||||
if (wide_partition == is_wide_partition::no) {
|
||||
if (mo) {
|
||||
_cache.populate(*mo);
|
||||
mo->upgrade(_schema);
|
||||
auto& ck_ranges = _ck_filtering.get_ranges(mo->key());
|
||||
auto filtered_partition = mutation_partition(std::move(mo->partition()), *(mo->schema()), ck_ranges);
|
||||
mo->partition() = std::move(filtered_partition);
|
||||
return make_ready_future<streamed_mutation_opt>(streamed_mutation_from_mutation(std::move(*mo)));
|
||||
}
|
||||
return make_ready_future<streamed_mutation_opt>(streamed_mutation_opt());
|
||||
} else {
|
||||
_cache.on_uncached_wide_partition();
|
||||
_cache.mark_partition_as_wide(dk);
|
||||
_large_partition_range = query::partition_range::make_singular(std::move(dk));
|
||||
_large_partition_reader = _underlying(_schema, _large_partition_range, _ck_filtering, _pc);
|
||||
return _large_partition_reader();
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
void cache_tracker::clear_continuity(cache_entry& ce) {
|
||||
ce.set_continuous(false);
|
||||
on_continuity_flag_cleared();
|
||||
}
|
||||
|
||||
void row_cache::on_hit() {
|
||||
_stats.hits.mark();
|
||||
_tracker.on_hit();
|
||||
@@ -233,6 +289,10 @@ void row_cache::on_miss() {
|
||||
_tracker.on_miss();
|
||||
}
|
||||
|
||||
void row_cache::on_uncached_wide_partition() {
|
||||
_tracker.on_uncached_wide_partition();
|
||||
}
|
||||
|
||||
class just_cache_scanning_reader final {
|
||||
schema_ptr _schema;
|
||||
row_cache& _cache;
|
||||
@@ -243,6 +303,7 @@ class just_cache_scanning_reader final {
|
||||
uint64_t _last_reclaim_count;
|
||||
size_t _last_modification_count;
|
||||
query::clustering_key_filtering_context _ck_filtering;
|
||||
const io_priority_class _pc;
|
||||
private:
|
||||
void update_iterators() {
|
||||
auto cmp = cache_entry::compare(_cache._schema);
|
||||
@@ -285,10 +346,12 @@ private:
|
||||
public:
|
||||
struct cache_data {
|
||||
streamed_mutation_opt mut;
|
||||
uint64_t continuity_flags_cleared;
|
||||
bool continuous;
|
||||
};
|
||||
just_cache_scanning_reader(schema_ptr s, row_cache& cache, const query::partition_range& range, query::clustering_key_filtering_context ck_filtering)
|
||||
: _schema(std::move(s)), _cache(cache), _range(range), _ck_filtering(ck_filtering)
|
||||
just_cache_scanning_reader(schema_ptr s, row_cache& cache, const query::partition_range& range,
|
||||
query::clustering_key_filtering_context ck_filtering, const io_priority_class& pc)
|
||||
: _schema(std::move(s)), _cache(cache), _range(range), _ck_filtering(ck_filtering), _pc(pc)
|
||||
{ }
|
||||
future<cache_data> operator()() {
|
||||
return _cache._read_section(_cache._tracker.region(), [this] {
|
||||
@@ -301,8 +364,19 @@ public:
|
||||
++_it;
|
||||
_last = ce.key();
|
||||
_cache.upgrade_entry(ce);
|
||||
cache_data data{std::move(ce.read(_cache, _schema, _ck_filtering)), ce.continuous()};
|
||||
return make_ready_future<cache_data>(std::move(data));
|
||||
cache_data cd { { }, _cache._tracker.continuity_flags_cleared(), ce.continuous() };
|
||||
if (ce.wide_partition()) {
|
||||
return ce.read_wide(_cache, _schema, _ck_filtering, _pc).then([this, cd = std::move(cd)] (auto smopt) mutable {
|
||||
if (smopt) {
|
||||
cd.mut = std::move(*smopt);
|
||||
} else {
|
||||
cd.mut = streamed_mutation_from_mutation(mutation(_last->as_decorated_key(), _schema));
|
||||
}
|
||||
return std::move(cd);
|
||||
});
|
||||
}
|
||||
cd.mut = ce.read(_cache, _schema, _ck_filtering);
|
||||
return make_ready_future<cache_data>(std::move(cd));
|
||||
});
|
||||
});
|
||||
}
|
||||
@@ -346,6 +420,8 @@ class range_populating_reader final : public mutation_reader::impl {
|
||||
std::experimental::optional<dht::ring_position> _last_key;
|
||||
utils::phased_barrier::phase_type _last_key_populate_phase;
|
||||
mark_end_as_continuous _make_last_entry_continuous;
|
||||
query::partition_range _large_partition_range;
|
||||
mutation_reader _large_partition_reader;
|
||||
|
||||
void update_reader() {
|
||||
if (_populate_phase != _cache._populate_phaser.phase()) {
|
||||
@@ -372,6 +448,12 @@ class range_populating_reader final : public mutation_reader::impl {
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
void update_last_key(const dht::decorated_key& key) {
|
||||
this->maybe_mark_last_entry_as_continuous(mark_end_as_continuous(mark_end_as_continuous::override(), true));
|
||||
_last_key = dht::ring_position(key);
|
||||
_last_key_populate_phase = _cache._populate_phaser.phase();
|
||||
}
|
||||
public:
|
||||
range_populating_reader(
|
||||
row_cache& cache,
|
||||
@@ -397,22 +479,44 @@ public:
|
||||
{}
|
||||
virtual future<streamed_mutation_opt> operator()() override {
|
||||
update_reader();
|
||||
return _reader().then([] (auto sm) {
|
||||
return mutation_from_streamed_mutation(std::move(sm));
|
||||
}).then([this, op = _cache._populate_phaser.start()] (mutation_opt&& mo) -> streamed_mutation_opt {
|
||||
if (mo) {
|
||||
_cache.populate(*mo);
|
||||
mo->upgrade(_schema);
|
||||
maybe_mark_last_entry_as_continuous(mark_end_as_continuous(mark_end_as_continuous::override(), true));
|
||||
_last_key = dht::ring_position(mo->decorated_key());
|
||||
_last_key_populate_phase = _cache._populate_phaser.phase();
|
||||
auto& ck_ranges = _ck_filtering.get_ranges(mo->key());
|
||||
auto filtered_partition = mutation_partition(std::move(mo->partition()), *(mo->schema()), ck_ranges);
|
||||
mo->partition() = std::move(filtered_partition);
|
||||
return streamed_mutation_from_mutation(std::move(*mo));
|
||||
}
|
||||
maybe_mark_last_entry_as_continuous(_make_last_entry_continuous);
|
||||
return {};
|
||||
auto op = _cache._populate_phaser.start();
|
||||
return _reader().then([this, op = std::move(op)] (auto sm) mutable {
|
||||
stdx::optional<dht::decorated_key> dk = (sm) ? stdx::optional<dht::decorated_key>(sm->decorated_key())
|
||||
: stdx::optional<dht::decorated_key>(stdx::nullopt);
|
||||
return try_to_read(_cache._max_cached_partition_size_in_bytes, std::move(sm)).then(
|
||||
[this, op = std::move(op), dk = std::move(dk)]
|
||||
(is_wide_partition wide_partition, mutation_opt&& mo) mutable {
|
||||
if (wide_partition == is_wide_partition::no) {
|
||||
if (mo) {
|
||||
_cache.populate(*mo);
|
||||
mo->upgrade(_schema);
|
||||
this->update_last_key(mo->decorated_key());
|
||||
auto& ck_ranges = _ck_filtering.get_ranges(mo->key());
|
||||
auto filtered_partition = mutation_partition(std::move(mo->partition()), *(mo->schema()), ck_ranges);
|
||||
mo->partition() = std::move(filtered_partition);
|
||||
return make_ready_future<streamed_mutation_opt>(streamed_mutation_from_mutation(std::move(*mo)));
|
||||
}
|
||||
this->maybe_mark_last_entry_as_continuous(_make_last_entry_continuous);
|
||||
return make_ready_future<streamed_mutation_opt>(streamed_mutation_opt());
|
||||
} else {
|
||||
assert(bool(dk));
|
||||
this->update_last_key(*dk);
|
||||
_cache.on_uncached_wide_partition();
|
||||
_cache.mark_partition_as_wide(*dk);
|
||||
_large_partition_range = query::partition_range::make_singular(*dk);
|
||||
_large_partition_reader = _underlying(_schema, _large_partition_range, _ck_filtering, _pc);
|
||||
return _large_partition_reader().then([this, dk = std::move(*dk)] (auto smopt) mutable -> streamed_mutation_opt {
|
||||
_large_partition_reader = {};
|
||||
if (!smopt) {
|
||||
// We cannot emit disengaged optional since this is a part of range
|
||||
// read and it would be incorrectly interpreted as end of stream.
|
||||
// Produce empty mutation instead.
|
||||
return streamed_mutation_from_mutation(mutation(std::move(dk), _schema));
|
||||
}
|
||||
return smopt;
|
||||
});
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
};
|
||||
@@ -454,6 +558,7 @@ class scanning_and_populating_reader final : public mutation_reader::impl{
|
||||
just_cache_scanning_reader _primary;
|
||||
last_key _last_key_from_primary;
|
||||
utils::phased_barrier::phase_type _last_key_from_primary_populate_phase;
|
||||
uint64_t _last_key_from_primary_continuity_flags_cleared;
|
||||
query::clustering_key_filtering_context _ck_filtering;
|
||||
boost::variant<end_state,
|
||||
secondary_only_state,
|
||||
@@ -466,6 +571,7 @@ class scanning_and_populating_reader final : public mutation_reader::impl{
|
||||
if (!bound_opt) {
|
||||
_last_key_from_primary = {_cache._partitions.begin()->key(), true};
|
||||
_last_key_from_primary_populate_phase = _cache._populate_phaser.phase();
|
||||
_last_key_from_primary_continuity_flags_cleared = _cache._tracker.continuity_flags_cleared();
|
||||
return _cache._partitions.begin()->continuous();
|
||||
}
|
||||
const range_bound<dht::ring_position>& bound = bound_opt.value();
|
||||
@@ -481,6 +587,7 @@ class scanning_and_populating_reader final : public mutation_reader::impl{
|
||||
(!bound.is_inclusive() || bound.value().relation_to_keys() == -1)) {
|
||||
_last_key_from_primary = {i->key(), true};
|
||||
_last_key_from_primary_populate_phase = _cache._populate_phaser.phase();
|
||||
_last_key_from_primary_continuity_flags_cleared = _cache._tracker.continuity_flags_cleared();
|
||||
return i->continuous();
|
||||
}
|
||||
--i;
|
||||
@@ -512,6 +619,9 @@ class scanning_and_populating_reader final : public mutation_reader::impl{
|
||||
// We have to capture mutation from data before we change the state because data lives in state
|
||||
// and changing state destroys previous state.
|
||||
streamed_mutation_opt result = std::move(data.mut);
|
||||
if (_cache._tracker.continuity_flags_cleared() != data.continuity_flags_cleared) {
|
||||
data.continuous = _cache.has_continuous_entry(*_last_key_from_primary.value);
|
||||
}
|
||||
if (data.continuous) {
|
||||
_state = after_continuous_entry_state{};
|
||||
} else {
|
||||
@@ -545,7 +655,7 @@ class scanning_and_populating_reader final : public mutation_reader::impl{
|
||||
, _schema(std::move(s))
|
||||
, _range(range)
|
||||
, _pc(pc)
|
||||
, _primary(_schema, _cache, _range, ck_filtering)
|
||||
, _primary(_schema, _cache, _range, ck_filtering, pc)
|
||||
, _ck_filtering(ck_filtering)
|
||||
, _state(start_state{}) {}
|
||||
future<streamed_mutation_opt> operator()(const end_state& state) {
|
||||
@@ -580,6 +690,11 @@ class scanning_and_populating_reader final : public mutation_reader::impl{
|
||||
});
|
||||
}
|
||||
future<streamed_mutation_opt> operator()(after_continuous_entry_state& state) {
|
||||
if (_last_key_from_primary_continuity_flags_cleared != _cache._tracker.continuity_flags_cleared()
|
||||
&& !_cache.has_continuous_entry(*_last_key_from_primary.value)) {
|
||||
_state = after_not_continuous_entry_state{};
|
||||
return operator()();
|
||||
}
|
||||
return _primary().then([this] (just_cache_scanning_reader::cache_data&& data) {
|
||||
if (!data.mut) {
|
||||
switch_to_end();
|
||||
@@ -675,11 +790,15 @@ row_cache::make_reader(schema_ptr s,
|
||||
_tracker.touch(e);
|
||||
on_hit();
|
||||
upgrade_entry(e);
|
||||
if (e.wide_partition()) {
|
||||
_tracker.on_uncached_wide_partition();
|
||||
return _underlying(s, range, ck_filtering, pc);
|
||||
}
|
||||
return make_reader_returning(e.read(*this, s, ck_filtering));
|
||||
} else {
|
||||
on_miss();
|
||||
return make_mutation_reader<single_partition_populating_reader>(s, *this,
|
||||
_underlying(_schema, range, query::no_clustering_key_filtering, pc),
|
||||
return make_mutation_reader<single_partition_populating_reader>(s, *this, _underlying,
|
||||
_underlying(_schema, range, query::no_clustering_key_filtering, pc), pc,
|
||||
ck_filtering);
|
||||
}
|
||||
});
|
||||
@@ -708,7 +827,25 @@ void row_cache::clear_now() noexcept {
|
||||
deleter(p);
|
||||
});
|
||||
}
|
||||
_partitions.begin()->set_continuous(false);
|
||||
_tracker.clear_continuity(*_partitions.begin());
|
||||
});
|
||||
}
|
||||
|
||||
// Records in the cache that the partition with the given key is wide.
//
// If an entry with an equal key already exists it is switched to the wide
// state via set_wide_partition(); otherwise a new entry constructed with
// cache_entry::wide_partition_tag is registered with the tracker and linked
// into _partitions at the lookup position.
void row_cache::mark_partition_as_wide(const dht::decorated_key& key) {
    auto mark_or_insert = [this, &key] {
        auto pos = _partitions.lower_bound(key, cache_entry::compare(_schema));
        const bool already_cached = pos != _partitions.end() && pos->key().equal(*_schema, key);
        if (already_cached) {
            pos->set_wide_partition();
            return;
        }
        cache_entry* wide_entry = current_allocator().construct<cache_entry>(
                _schema, key, cache_entry::wide_partition_tag{});
        _tracker.insert(*wide_entry);
        // `pos` doubles as the insertion hint.
        _partitions.insert(pos, *wide_entry);
    };

    with_allocator(_tracker.allocator(), [this, &mark_or_insert] {
        _populate_section(_tracker.region(), [&] {
            with_linearized_managed_bytes([&] {
                mark_or_insert();
            });
        });
    });
}
|
||||
|
||||
@@ -783,11 +920,13 @@ future<> row_cache::update(memtable& m, partition_presence_checker presence_chec
|
||||
// FIXME: keep a bitmap indicating which sstables we do cover, so we don't have to
|
||||
// search it.
|
||||
if (cache_i != _partitions.end() && cache_i->key().equal(*_schema, mem_e.key())) {
|
||||
if (!cache_i->wide_partition()) {
|
||||
cache_entry& entry = *cache_i;
|
||||
upgrade_entry(entry);
|
||||
entry.partition().apply(*_schema, std::move(mem_e.partition()), *mem_e.schema());
|
||||
_tracker.touch(entry);
|
||||
_tracker.on_merge();
|
||||
}
|
||||
} else if (presence_checker(mem_e.key().key()) ==
|
||||
partition_presence_checker_result::definitely_doesnt_exist) {
|
||||
cache_entry* entry = current_allocator().construct<cache_entry>(
|
||||
@@ -796,7 +935,7 @@ future<> row_cache::update(memtable& m, partition_presence_checker presence_chec
|
||||
_partitions.insert(cache_i, *entry);
|
||||
} else {
|
||||
--cache_i;
|
||||
cache_i->set_continuous(false);
|
||||
_tracker.clear_continuity(*cache_i);
|
||||
}
|
||||
i = m.partitions.erase(i);
|
||||
current_allocator().destroy(&mem_e);
|
||||
@@ -832,10 +971,10 @@ void row_cache::touch(const dht::decorated_key& dk) {
|
||||
void row_cache::invalidate_locked(const dht::decorated_key& dk) {
|
||||
auto pos = _partitions.lower_bound(dk, cache_entry::compare(_schema));
|
||||
if (pos == _partitions.end()) {
|
||||
_partitions.rbegin()->set_continuous(false);
|
||||
_tracker.clear_continuity(*_partitions.rbegin());
|
||||
} else if (!pos->key().equal(*_schema, dk)) {
|
||||
--pos;
|
||||
pos->set_continuous(false);
|
||||
_tracker.clear_continuity(*pos);
|
||||
} else {
|
||||
auto end = pos;
|
||||
++end;
|
||||
@@ -846,7 +985,7 @@ void row_cache::invalidate_locked(const dht::decorated_key& dk) {
|
||||
});
|
||||
assert (it != _partitions.begin());
|
||||
--it;
|
||||
it->set_continuous(false);
|
||||
_tracker.clear_continuity(*it);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -907,17 +1046,32 @@ void row_cache::invalidate_unwrapped(const query::partition_range& range) {
|
||||
});
|
||||
assert(it != _partitions.begin());
|
||||
--it;
|
||||
it->set_continuous(false);
|
||||
_tracker.clear_continuity(*it);
|
||||
});
|
||||
}
|
||||
|
||||
// Returns the continuity flag of the entry that covers `key`: the entry whose
// key equals `key` when there is one, otherwise the entry immediately before
// the lower bound of `key` (the last entry when the lower bound is end()).
bool row_cache::has_continuous_entry(const dht::ring_position& key) const {
    return with_linearized_managed_bytes([&] {
        auto pos = _partitions.lower_bound(key, cache_entry::compare(_schema));
        const bool exact_match = pos != _partitions.end() && pos->key().equal(*_schema, key);
        if (!exact_match) {
            // Step back to the predecessor; from end() this lands on the last entry.
            --pos;
        }
        return pos->continuous();
    });
}
|
||||
|
||||
row_cache::row_cache(schema_ptr s, mutation_source fallback_factory, key_source underlying_keys,
|
||||
cache_tracker& tracker)
|
||||
cache_tracker& tracker, uint64_t max_cached_partition_size_in_bytes)
|
||||
: _tracker(tracker)
|
||||
, _schema(std::move(s))
|
||||
, _partitions(cache_entry::compare(_schema))
|
||||
, _underlying(std::move(fallback_factory))
|
||||
, _underlying_keys(std::move(underlying_keys))
|
||||
, _max_cached_partition_size_in_bytes(max_cached_partition_size_in_bytes)
|
||||
{
|
||||
with_allocator(_tracker.allocator(), [this] {
|
||||
cache_entry* entry = current_allocator().construct<cache_entry>(_schema);
|
||||
@@ -930,6 +1084,7 @@ cache_entry::cache_entry(cache_entry&& o) noexcept
|
||||
, _key(std::move(o._key))
|
||||
, _pe(std::move(o._pe))
|
||||
, _continuous(o._continuous)
|
||||
, _wide_partition(o._wide_partition)
|
||||
, _lru_link()
|
||||
, _cache_link()
|
||||
{
|
||||
@@ -950,11 +1105,29 @@ void row_cache::set_schema(schema_ptr new_schema) noexcept {
|
||||
_schema = std::move(new_schema);
|
||||
}
|
||||
|
||||
// Reads this entry's partition straight from the underlying mutation source,
// using a singular partition range built from the entry's key. The wide
// partition counter of the tracker is bumped before the read starts.
future<streamed_mutation_opt> cache_entry::read_wide(row_cache& rc, schema_ptr s, query::clustering_key_filtering_context ck_filtering, const io_priority_class& pc) {
    // Bundles the range with the reader created over it so that do_with()
    // keeps both alive until the returned future resolves.
    // NOTE(review): presumably the reader refers to the range it was given - confirm.
    struct wide_read_state {
        query::partition_range range;
        mutation_reader reader;

        wide_read_state(row_cache& cache, schema_ptr schema, query::partition_range pr,
                        query::clustering_key_filtering_context filtering, const io_priority_class& prio)
            : range(std::move(pr))
            , reader(cache._underlying(schema, range, filtering, prio))
        { }
    };

    rc._tracker.on_uncached_wide_partition();
    return do_with(
            wide_read_state(rc, s, query::partition_range::make_singular(_key), std::move(ck_filtering), pc),
            [] (auto& state) {
                return state.reader();
            });
}
|
||||
|
||||
// Convenience overload: reads the entry without any clustering key filtering
// by delegating to the three-argument read().
streamed_mutation cache_entry::read(row_cache& rc, const schema_ptr& s) {
    const auto& unfiltered = query::no_clustering_key_filtering;
    return read(rc, s, unfiltered);
}
|
||||
|
||||
streamed_mutation cache_entry::read(row_cache& rc, const schema_ptr& s, query::clustering_key_filtering_context ck_filtering) {
|
||||
assert(!wide_partition());
|
||||
auto dk = _key.as_decorated_key();
|
||||
if (_schema->version() != s->version()) {
|
||||
const query::clustering_row_ranges& ck_ranges = ck_filtering.get_ranges(dk.key());
|
||||
@@ -973,6 +1146,10 @@ const schema_ptr& row_cache::schema() const {
|
||||
|
||||
void row_cache::upgrade_entry(cache_entry& e) {
|
||||
if (e._schema != _schema) {
|
||||
if (e.wide_partition()) {
|
||||
e._schema = _schema;
|
||||
return;
|
||||
}
|
||||
auto& r = _tracker.region();
|
||||
assert(!r.reclaiming_enabled());
|
||||
with_allocator(r.allocator(), [this, &e] {
|
||||
|
||||
42
row_cache.hh
42
row_cache.hh
@@ -62,7 +62,8 @@ class cache_entry {
|
||||
dht::ring_position _key;
|
||||
partition_entry _pe;
|
||||
// True when we know that there is nothing between this entry and the next one in cache
|
||||
bool _continuous;
|
||||
bool _continuous : 1;
|
||||
bool _wide_partition : 1;
|
||||
lru_link_type _lru_link;
|
||||
cache_link_type _cache_link;
|
||||
friend class size_calculator;
|
||||
@@ -73,8 +74,17 @@ public:
|
||||
cache_entry(schema_ptr s)
|
||||
: _schema(std::move(s))
|
||||
, _key(dht::ring_position::starting_at(dht::minimum_token()))
|
||||
, _pe(_schema)
|
||||
, _continuous(false)
|
||||
, _wide_partition(false)
|
||||
{ }
|
||||
|
||||
struct wide_partition_tag{};
|
||||
|
||||
cache_entry(schema_ptr s, const dht::decorated_key& key, wide_partition_tag)
|
||||
: _schema(std::move(s))
|
||||
, _key(key)
|
||||
, _continuous(false)
|
||||
, _wide_partition(true)
|
||||
{ }
|
||||
|
||||
cache_entry(schema_ptr s, const dht::decorated_key& key, const mutation_partition& p, bool continuous = false)
|
||||
@@ -82,6 +92,7 @@ public:
|
||||
, _key(key)
|
||||
, _pe(p)
|
||||
, _continuous(continuous)
|
||||
, _wide_partition(false)
|
||||
{ }
|
||||
|
||||
cache_entry(schema_ptr s, dht::decorated_key&& key, mutation_partition&& p, bool continuous = false) noexcept
|
||||
@@ -89,6 +100,7 @@ public:
|
||||
, _key(std::move(key))
|
||||
, _pe(std::move(p))
|
||||
, _continuous(continuous)
|
||||
, _wide_partition(false)
|
||||
{ }
|
||||
|
||||
cache_entry(schema_ptr s, dht::decorated_key&& key, partition_entry&& pe, bool continuous = false) noexcept
|
||||
@@ -96,6 +108,7 @@ public:
|
||||
, _key(std::move(key))
|
||||
, _pe(std::move(pe))
|
||||
, _continuous(continuous)
|
||||
, _wide_partition(false)
|
||||
{ }
|
||||
|
||||
cache_entry(cache_entry&&) noexcept;
|
||||
@@ -106,10 +119,19 @@ public:
|
||||
partition_entry& partition() { return _pe; }
|
||||
const schema_ptr& schema() const { return _schema; }
|
||||
schema_ptr& schema() { return _schema; }
|
||||
// Requires: !wide_partition()
|
||||
streamed_mutation read(row_cache&, const schema_ptr&);
|
||||
// Requires: !wide_partition()
|
||||
streamed_mutation read(row_cache&, const schema_ptr&, query::clustering_key_filtering_context);
|
||||
// May return disengaged optional if the partition is empty.
|
||||
future<streamed_mutation_opt> read_wide(row_cache&, schema_ptr, query::clustering_key_filtering_context, const io_priority_class&);
|
||||
bool continuous() const { return _continuous; }
|
||||
void set_continuous(bool value) { _continuous = value; }
|
||||
bool wide_partition() const { return _wide_partition; }
|
||||
void set_wide_partition() {
|
||||
_wide_partition = true;
|
||||
_pe = {};
|
||||
}
|
||||
|
||||
struct compare {
|
||||
dht::ring_position_less_comparator _c;
|
||||
@@ -149,12 +171,14 @@ public:
|
||||
private:
|
||||
uint64_t _hits = 0;
|
||||
uint64_t _misses = 0;
|
||||
uint64_t _uncached_wide_partitions = 0;
|
||||
uint64_t _insertions = 0;
|
||||
uint64_t _merges = 0;
|
||||
uint64_t _evictions = 0;
|
||||
uint64_t _removals = 0;
|
||||
uint64_t _partitions = 0;
|
||||
uint64_t _modification_count = 0;
|
||||
uint64_t _continuity_flags_cleared = 0;
|
||||
std::unique_ptr<scollectd::registrations> _collectd_registrations;
|
||||
logalloc::region _region;
|
||||
lru_type _lru;
|
||||
@@ -166,15 +190,20 @@ public:
|
||||
void clear();
|
||||
void touch(cache_entry&);
|
||||
void insert(cache_entry&);
|
||||
void clear_continuity(cache_entry& ce);
|
||||
void on_erase();
|
||||
void on_merge();
|
||||
void on_hit();
|
||||
void on_miss();
|
||||
void on_uncached_wide_partition();
|
||||
void on_continuity_flag_cleared();
|
||||
allocation_strategy& allocator();
|
||||
logalloc::region& region();
|
||||
const logalloc::region& region() const;
|
||||
uint64_t modification_count() const { return _modification_count; }
|
||||
uint64_t partitions() const { return _partitions; }
|
||||
uint64_t uncached_wide_partitions() const { return _uncached_wide_partitions; }
|
||||
uint64_t continuity_flags_cleared() const { return _continuity_flags_cleared; }
|
||||
};
|
||||
|
||||
// Returns a reference to shard-wide cache_tracker.
|
||||
@@ -211,6 +240,7 @@ private:
|
||||
partitions_type _partitions; // Cached partitions are complete.
|
||||
mutation_source _underlying;
|
||||
key_source _underlying_keys;
|
||||
uint64_t _max_cached_partition_size_in_bytes;
|
||||
|
||||
// Synchronizes populating reads with updates of underlying data source to ensure that cache
|
||||
// remains consistent across flushes with the underlying data source.
|
||||
@@ -231,6 +261,7 @@ private:
|
||||
query::clustering_key_filtering_context ck_filtering);
|
||||
void on_hit();
|
||||
void on_miss();
|
||||
void on_uncached_wide_partition();
|
||||
void upgrade_entry(cache_entry&);
|
||||
void invalidate_locked(const dht::decorated_key&);
|
||||
void invalidate_unwrapped(const query::partition_range&);
|
||||
@@ -238,7 +269,7 @@ private:
|
||||
static thread_local seastar::thread_scheduling_group _update_thread_scheduling_group;
|
||||
public:
|
||||
~row_cache();
|
||||
row_cache(schema_ptr, mutation_source underlying, key_source, cache_tracker&);
|
||||
row_cache(schema_ptr, mutation_source underlying, key_source, cache_tracker&, uint64_t _max_cached_partition_size_in_bytes = 10 * 1024 * 1024);
|
||||
row_cache(row_cache&&) = default;
|
||||
row_cache(const row_cache&) = delete;
|
||||
row_cache& operator=(row_cache&&) = default;
|
||||
@@ -258,6 +289,9 @@ public:
|
||||
// information there is for its partition in the underlying data sources.
|
||||
void populate(const mutation& m);
|
||||
|
||||
// Caches the information that the partition with the given key is wide.
|
||||
void mark_partition_as_wide(const dht::decorated_key& key);
|
||||
|
||||
// Clears the cache.
|
||||
// Guarantees that cache will not be populated using readers created
|
||||
// before this method was invoked.
|
||||
@@ -289,6 +323,8 @@ public:
|
||||
// The range must be kept alive until method resolves.
|
||||
future<> invalidate(const query::partition_range&);
|
||||
|
||||
bool has_continuous_entry(const dht::ring_position& key) const;
|
||||
|
||||
auto num_entries() const {
|
||||
return _partitions.size();
|
||||
}
|
||||
|
||||
95
schema.cc
95
schema.cc
@@ -56,6 +56,14 @@ sstring to_sstring(index_type t) {
|
||||
throw std::invalid_argument("unknown index type");
|
||||
}
|
||||
|
||||
// True for the two column kinds that count as "regular":
// regular_column and compact_column.
bool is_regular(column_kind k) {
    switch (k) {
    case column_kind::regular_column:
    case column_kind::compact_column:
        return true;
    default:
        return false;
    }
}
|
||||
|
||||
// Two column kinds are compatible when they are identical or when both of
// them are regular in the sense of is_regular().
bool is_compatible(column_kind k1, column_kind k2) {
    if (k1 == k2) {
        return true;
    }
    return is_regular(k1) && is_regular(k2);
}
|
||||
|
||||
column_mapping_entry::column_mapping_entry(bytes name, sstring type_name)
|
||||
: _name(std::move(name))
|
||||
, _type(db::marshal::type_parser::parse(type_name))
|
||||
@@ -635,51 +643,60 @@ schema_builder& schema_builder::with_version(table_schema_version v) {
|
||||
return *this;
|
||||
}
|
||||
|
||||
schema_ptr schema_builder::build() {
|
||||
if (_version) {
|
||||
_raw._version = *_version;
|
||||
} else {
|
||||
_raw._version = utils::UUID_gen::get_time_UUID();
|
||||
}
|
||||
void schema_builder::prepare_dense_schema(schema::raw_schema& raw) {
|
||||
if (raw._is_dense) {
|
||||
auto regular_cols = boost::copy_range<std::vector<column_definition*>>(
|
||||
raw._columns | boost::adaptors::filtered([](auto&& col) { return col.is_regular(); })
|
||||
| boost::adaptors::transformed([](auto&& col) { return &col; }));
|
||||
|
||||
if (!_compact_storage) {
|
||||
return make_lw_shared<schema>(schema(_raw));
|
||||
}
|
||||
|
||||
schema s(_raw);
|
||||
|
||||
// Dense means that no part of the comparator stores a CQL column name. This means
|
||||
// COMPACT STORAGE with at least one columnAliases (otherwise it's a thrift "static" CF).
|
||||
s._raw._is_dense = (*_compact_storage == compact_storage::yes) && (s.clustering_key_size() > 0);
|
||||
|
||||
if (s.clustering_key_size() == 0) {
|
||||
if (*_compact_storage == compact_storage::yes) {
|
||||
s._raw._is_compound = false;
|
||||
} else {
|
||||
s._raw._is_compound = true;
|
||||
}
|
||||
} else {
|
||||
if ((*_compact_storage == compact_storage::yes) && s.clustering_key_size() == 1) {
|
||||
s._raw._is_compound = false;
|
||||
} else {
|
||||
s._raw._is_compound = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (s._raw._is_dense) {
|
||||
// In Origin, dense CFs always have at least one regular column
|
||||
if (s.regular_columns_count() == 0) {
|
||||
s._raw._columns.emplace_back(bytes(""), s.regular_column_name_type(), column_kind::regular_column, 0, index_info());
|
||||
if (regular_cols.empty()) {
|
||||
raw._columns.emplace_back(bytes(""), raw._regular_column_name_type, column_kind::compact_column, 0, index_info());
|
||||
return;
|
||||
}
|
||||
|
||||
if (s.regular_columns_count() != 1) {
|
||||
throw exceptions::configuration_exception(sprint("Expecting exactly one regular column. Found %d", s.regular_columns_count()));
|
||||
if (regular_cols.size() != 1) {
|
||||
throw exceptions::configuration_exception(sprint("Expecting exactly one regular column. Found %d", regular_cols.size()));
|
||||
}
|
||||
s._raw._columns.at(s.column_offset(column_kind::regular_column)).kind = column_kind::compact_column;
|
||||
|
||||
regular_cols[0]->kind = column_kind::compact_column;
|
||||
}
|
||||
// We need to rebuild the schema in case we added some column. This is way simpler than trying to factor out the relevant code
|
||||
// from the constructor
|
||||
return make_lw_shared<schema>(schema(s._raw));
|
||||
}
|
||||
|
||||
// Builds a schema from the builder's current state.
//
// All adjustments are applied to a copy of _raw, so the builder itself is not
// modified and build() can be called repeatedly.
schema_ptr schema_builder::build() {
    schema::raw_schema new_raw = _raw;

    // Use the explicitly requested version if there is one, else generate a time UUID.
    if (_version) {
        new_raw._version = *_version;
    } else {
        new_raw._version = utils::UUID_gen::get_time_UUID();
    }

    if (_compact_storage) {
        const bool compact = (*_compact_storage == compact_storage::yes);
        auto ck_count = std::count_if(new_raw._columns.begin(), new_raw._columns.end(), [](auto&& col) {
            return col.kind == column_kind::clustering_key;
        });

        // Dense means that no part of the comparator stores a CQL column name. This means
        // COMPACT STORAGE with at least one columnAliases (otherwise it's a thrift "static" CF).
        new_raw._is_dense = compact && (ck_count > 0);

        // Only COMPACT STORAGE tables with zero or one clustering column are non-compound.
        new_raw._is_compound = !(compact && ck_count <= 1);
    }

    prepare_dense_schema(new_raw);
    return make_lw_shared<schema>(schema(new_raw));
}
|
||||
|
||||
schema_ptr schema_builder::build(compact_storage cp) {
|
||||
|
||||
@@ -72,6 +72,8 @@ void read_collections(schema_builder& builder, sstring comparator);
|
||||
enum class column_kind { partition_key, clustering_key, static_column, regular_column, compact_column };
|
||||
|
||||
sstring to_sstring(column_kind k);
|
||||
bool is_regular(column_kind k);
|
||||
bool is_compatible(column_kind k1, column_kind k2);
|
||||
|
||||
// CMH this is also manually defined in thrift gen file.
|
||||
enum class index_type {
|
||||
@@ -225,7 +227,7 @@ public:
|
||||
index_info idx_info;
|
||||
|
||||
bool is_static() const { return kind == column_kind::static_column; }
|
||||
bool is_regular() const { return kind == column_kind::regular_column || kind == column_kind::compact_column; }
|
||||
bool is_regular() const { return ::is_regular(kind); }
|
||||
bool is_partition_key() const { return kind == column_kind::partition_key; }
|
||||
bool is_clustering_key() const { return kind == column_kind::clustering_key; }
|
||||
bool is_primary_key() const { return kind == column_kind::partition_key || kind == column_kind::clustering_key; }
|
||||
|
||||
@@ -220,4 +220,6 @@ public:
|
||||
schema_ptr build(compact_storage cp);
|
||||
|
||||
schema_ptr build();
|
||||
private:
|
||||
void prepare_dense_schema(schema::raw_schema& raw);
|
||||
};
|
||||
|
||||
37
scripts/scylla_current_repo
Executable file
37
scripts/scylla_current_repo
Executable file
@@ -0,0 +1,37 @@
|
||||
#!/bin/bash
#
# Prints the URL of the package repository definition (.list for Ubuntu,
# .repo for CentOS/Fedora) matching the Scylla version of this source tree.
#
# The URL is the only thing written to stdout, so the script can be used in a
# command substitution. Diagnostics go to stderr and the exit status is 1 when
# the distribution is not supported.

# NOTE(review): presumably SCYLLA-VERSION-GEN produces the build/SCYLLA-*-FILE
# files read below - confirm. VERSION and SCYLLA_RELEASE are not used further
# in this script.
VERSION=$(./SCYLLA-VERSION-GEN)
SCYLLA_VERSION=$(cat build/SCYLLA-VERSION-FILE)
SCYLLA_RELEASE=$(cat build/SCYLLA-RELEASE-FILE)

# Provides $ID, the distribution identifier.
. /etc/os-release

# Report an unsupported distribution on stderr (keeping stdout clean for
# callers that capture the URL) and abort.
unsupported_distribution() {
    echo "Unsupported distribution." >&2
    exit 1
}

if [ "$SCYLLA_VERSION" = "666.development" ]; then
    # Development builds use the "unstable" repositories of the master branch.
    case "$ID" in
        ubuntu)
            CODENAME=$(lsb_release -c | awk '{print $2}')
            # The trusty packages are published under the plain "ubuntu" directory.
            if [ "$CODENAME" = "trusty" ]; then
                CODENAME=ubuntu
            fi
            echo "https://downloads.scylladb.com/deb/unstable/$CODENAME/master/latest/scylla.list"
            ;;
        centos)
            echo "https://downloads.scylladb.com/rpm/unstable/centos/master/latest/scylla.repo"
            ;;
        fedora)
            echo "https://downloads.scylladb.com/rpm/unstable/fedora/master/latest/scylla.repo"
            ;;
        *)
            unsupported_distribution
            ;;
    esac
else
    # Release builds: keep only the leading "major.minor" part of the version.
    REPO_VERSION=$(echo "$SCYLLA_VERSION" | sed -e "s/^\([0-9]*\.[0-9]*\).*/\1/")
    case "$ID" in
        ubuntu)
            CODENAME=$(lsb_release -c | awk '{print $2}')
            echo "http://downloads.scylladb.com/deb/ubuntu/scylla-$REPO_VERSION-$CODENAME.list"
            ;;
        centos)
            echo "http://downloads.scylladb.com/rpm/centos/scylla-$REPO_VERSION.repo"
            ;;
        fedora)
            echo "http://downloads.scylladb.com/rpm/fedora/scylla-$REPO_VERSION.repo"
            ;;
        *)
            unsupported_distribution
            ;;
    esac
fi
|
||||
@@ -10,7 +10,7 @@ fi
|
||||
print_usage() {
|
||||
echo "scylla_install_pkg --local-pkg /home/scylla/rpms --repo [URL]"
|
||||
echo " --local-pkg install locally built .rpm/.deb on specified directory"
|
||||
echo " --repo specify repository URL"
|
||||
echo " --repo specify .repo/.list file URL"
|
||||
exit 1
|
||||
}
|
||||
|
||||
@@ -42,10 +42,8 @@ if [ "$ID" = "ubuntu" ]; then
|
||||
chmod +x /usr/sbin/policy-rc.d
|
||||
cp /etc/hosts /etc/hosts.orig
|
||||
echo 127.0.0.1 `hostname` >> /etc/hosts
|
||||
if [ "$REPO" = "" ]; then
|
||||
echo "deb http://s3.amazonaws.com/downloads.scylladb.com/deb/ubuntu trusty/scylladb multiverse" > /etc/apt/sources.list.d/scylla.list
|
||||
else
|
||||
echo "deb $REPO trusty/scylladb multiverse" > /etc/apt/sources.list.d/scylla.list
|
||||
if [ "$REPO" != "" ]; then
|
||||
curl -o /etc/apt/sources.list.d/scylla.list $REPO
|
||||
fi
|
||||
apt-get update
|
||||
if [ "$LOCAL_PKG" = "" ]; then
|
||||
@@ -54,27 +52,25 @@ if [ "$ID" = "ubuntu" ]; then
|
||||
if [ ! -f /usr/bin/gdebi ]; then
|
||||
apt-get install -y --force-yes gdebi-core
|
||||
fi
|
||||
echo Y | gdebi $LOCAL_PKG/scylla-kernel-conf*.deb
|
||||
echo Y | gdebi $LOCAL_PKG/scylla-conf*.deb
|
||||
echo Y | gdebi $LOCAL_PKG/scylla-server*.deb
|
||||
echo Y | gdebi $LOCAL_PKG/scylla-server_*.deb
|
||||
echo Y | gdebi $LOCAL_PKG/scylla-server-dbg*.deb
|
||||
echo Y | gdebi $LOCAL_PKG/scylla-jmx*.deb
|
||||
echo Y | gdebi $LOCAL_PKG/scylla-tools*.deb
|
||||
echo Y | gdebi $LOCAL_PKG/scylla_*.deb
|
||||
fi
|
||||
mv /etc/hosts.orig /etc/hosts
|
||||
rm /usr/sbin/policy-rc.d
|
||||
else
|
||||
if [ "$ID" = "fedora" ]; then
|
||||
if [ "$REPO" = "" ]; then
|
||||
curl http://downloads.scylladb.com/rpm/fedora/scylla.repo > /etc/yum.repos.d/scylla.repo
|
||||
else
|
||||
curl $REPO > /etc/yum.repos.d/scylla.repo
|
||||
fi
|
||||
elif [ "$ID" = "centos" ] || [ "$ID" = "rhel" ]; then
|
||||
if [ "$REPO" = "" ]; then
|
||||
curl http://downloads.scylladb.com/rpm/centos/scylla.repo > /etc/yum.repos.d/scylla.repo
|
||||
else
|
||||
curl $REPO > /etc/yum.repos.d/scylla.repo
|
||||
fi
|
||||
yum install -y epel-release
|
||||
if [ "$REPO" != "" ]; then
|
||||
curl -o /etc/yum.repos.d/scylla.repo $REPO
|
||||
fi
|
||||
|
||||
if [ "$ID" = "centos" ]; then
|
||||
yum install -y epel-release
|
||||
elif [ "$ID" = "rhel" ]; then
|
||||
rpm -ivh http://download.fedoraproject.org/pub/epel/7/x86_64/e/epel-release-7-7.noarch.rpm
|
||||
else
|
||||
echo "Unsupported distribution"
|
||||
exit 1
|
||||
@@ -83,6 +79,6 @@ else
|
||||
if [ "$LOCAL_PKG" = "" ]; then
|
||||
yum install -y scylla
|
||||
else
|
||||
yum install -y $LOCAL_PKG/scylla-conf*.x86_64.rpm $LOCAL_PKG/scylla-server*.x86_64.rpm $LOCAL_PKG/scylla-jmx*.noarch.rpm $LOCAL_PKG/scylla-tools*.noarch.rpm
|
||||
yum install -y $LOCAL_PKG/scylla-*.*.rpm
|
||||
fi
|
||||
fi
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/usr/bin/python3
|
||||
#!/usr/bin/python
|
||||
#
|
||||
# Copyright (C) 2016 ScyllaDB
|
||||
#
|
||||
@@ -19,11 +19,18 @@
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
||||
#
|
||||
from __future__ import print_function
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import urllib
|
||||
import urllib2
|
||||
import requests
|
||||
import ConfigParser
|
||||
import os
|
||||
import sys
|
||||
import subprocess
|
||||
from pkg_resources import parse_version
|
||||
|
||||
VERSION = "1.0"
|
||||
quiet = False
|
||||
@@ -39,29 +46,42 @@ def traceln(*vals):
|
||||
def help(args):
    """Print the usage text of the module-level argument parser.

    `args` is accepted so the function can be used as a sub-command
    handler; it is not read.
    """
    parser.print_help()
|
||||
|
||||
def sh_command(*args):
    """Run `args` as a child process and return what it wrote to stdout.

    The command is executed directly (no shell). An Exception is raised when
    the command writes anything to stderr, or when it exits with a non-zero
    status; the latter catches tools that fail silently, such as `curl -s`.
    """
    p = subprocess.Popen(args, stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    out, err = p.communicate()
    if err:
        raise Exception(err)
    if p.returncode != 0:
        raise Exception("command %r exited with status %d" % (list(args), p.returncode))
    return out
|
||||
|
||||
def get_json_from_url(path):
|
||||
req = urllib.request.Request(path)
|
||||
try:
|
||||
response = urllib.request.urlopen(req)
|
||||
data = response.read()
|
||||
encoding = response.info().get_content_charset('utf-8')
|
||||
return json.loads(data.decode(encoding))
|
||||
except urllib.error.URLError as e:
|
||||
pass
|
||||
return ""
|
||||
data = sh_command("curl", "-s", "-X", "GET", path)
|
||||
return json.loads(data)
|
||||
|
||||
def get_api(path):
|
||||
return get_json_from_url("http://localhost:10000" + path)
|
||||
|
||||
def version_compare(a, b):
    """Return True when version `a` parses as strictly lower than version `b`."""
    lhs = parse_version(a)
    rhs = parse_version(b)
    return lhs < rhs
|
||||
|
||||
def check_version(ar):
|
||||
if config and (not config.has_option("housekeeping", "check-version") or not config.getboolean("housekeeping", "check-version")):
|
||||
return
|
||||
current_version = get_api('/storage_service/scylla_release_version')
|
||||
latest_version = get_json_from_url(version_url)["version"]
|
||||
|
||||
if current_version != latest_version:
|
||||
if current_version == "":
|
||||
# API is down, nothing to do
|
||||
return
|
||||
try:
|
||||
latest_version = get_json_from_url(version_url + "?version=" + current_version)["version"]
|
||||
except:
|
||||
traceln("Unable to retrieve version information")
|
||||
return
|
||||
if version_compare(current_version, latest_version):
|
||||
traceln("A new version was found, current version=", current_version, " latest version=", latest_version)
|
||||
|
||||
parser = argparse.ArgumentParser(description='ScyllaDB help report tool', conflict_handler="resolve")
|
||||
parser.add_argument('-q', '--quiet', action='store_true', default=False, help='Quiet mode')
|
||||
parser.add_argument('-c', '--config', default="", help='An optional config file. Specifying a missing file will terminate the script')
|
||||
|
||||
subparsers = parser.add_subparsers(help='Available commands')
|
||||
parser_help = subparsers.add_parser('help', help='Display help information')
|
||||
@@ -71,4 +91,11 @@ parser_system.set_defaults(func=check_version)
|
||||
|
||||
args = parser.parse_args()
|
||||
quiet = args.quiet
|
||||
config = None
|
||||
if args.config != "":
|
||||
if not os.path.isfile(args.config):
|
||||
traceln("Config file ", args.config, " is missing, terminating")
|
||||
sys.exit(0)
|
||||
config = ConfigParser.SafeConfigParser()
|
||||
config.read(args.config)
|
||||
args.func(args)
|
||||
|
||||
2
seastar
2
seastar
Submodule seastar updated: 103543aca1...e6571c4774
@@ -166,7 +166,8 @@ private:
|
||||
);
|
||||
|
||||
auto ranges = _ranges;
|
||||
return get_local_storage_proxy().query(_schema, _cmd, std::move(ranges),
|
||||
auto command = ::make_lw_shared<query::read_command>(*_cmd);
|
||||
return get_local_storage_proxy().query(_schema, std::move(command), std::move(ranges),
|
||||
_options.get_consistency(), _state.get_trace_state()).then(
|
||||
[this, &builder, page_size, now](foreign_ptr<lw_shared_ptr<query::result>> results) {
|
||||
handle_result(builder, std::move(results), page_size, now);
|
||||
|
||||
@@ -2128,10 +2128,13 @@ protected:
|
||||
future<foreign_ptr<lw_shared_ptr<reconcilable_result>>> make_mutation_data_request(lw_shared_ptr<query::read_command> cmd, gms::inet_address ep, clock_type::time_point timeout) {
|
||||
++_proxy->_stats.mutation_data_read_attempts.get_ep_stat(ep);
|
||||
if (is_me(ep)) {
|
||||
tracing::trace(_trace_state, "read_mutation_data: querying locally");
|
||||
return _proxy->query_mutations_locally(_schema, cmd, _partition_range);
|
||||
} else {
|
||||
auto& ms = net::get_local_messaging_service();
|
||||
return ms.send_read_mutation_data(net::messaging_service::msg_addr{ep, 0}, timeout, *cmd, _partition_range).then([this](reconcilable_result&& result) {
|
||||
tracing::trace(_trace_state, "read_mutation_data: sending a message to /{}", ep);
|
||||
return ms.send_read_mutation_data(net::messaging_service::msg_addr{ep, 0}, timeout, *cmd, _partition_range).then([this, ep](reconcilable_result&& result) {
|
||||
tracing::trace(_trace_state, "read_mutation_data: got response from /{}", ep);
|
||||
return make_foreign(::make_lw_shared<reconcilable_result>(std::move(result)));
|
||||
});
|
||||
}
|
||||
@@ -2139,10 +2142,13 @@ protected:
|
||||
future<foreign_ptr<lw_shared_ptr<query::result>>> make_data_request(gms::inet_address ep, clock_type::time_point timeout) {
|
||||
++_proxy->_stats.data_read_attempts.get_ep_stat(ep);
|
||||
if (is_me(ep)) {
|
||||
tracing::trace(_trace_state, "read_data: querying locally");
|
||||
return _proxy->query_singular_local(_schema, _cmd, _partition_range);
|
||||
} else {
|
||||
auto& ms = net::get_local_messaging_service();
|
||||
return ms.send_read_data(net::messaging_service::msg_addr{ep, 0}, timeout, *_cmd, _partition_range).then([this](query::result&& result) {
|
||||
tracing::trace(_trace_state, "read_data: sending a message to /{}", ep);
|
||||
return ms.send_read_data(net::messaging_service::msg_addr{ep, 0}, timeout, *_cmd, _partition_range).then([this, ep](query::result&& result) {
|
||||
tracing::trace(_trace_state, "read_data: got response from /{}", ep);
|
||||
return make_foreign(::make_lw_shared<query::result>(std::move(result)));
|
||||
});
|
||||
}
|
||||
@@ -2150,10 +2156,13 @@ protected:
|
||||
future<query::result_digest, api::timestamp_type> make_digest_request(gms::inet_address ep, clock_type::time_point timeout) {
|
||||
++_proxy->_stats.digest_read_attempts.get_ep_stat(ep);
|
||||
if (is_me(ep)) {
|
||||
tracing::trace(_trace_state, "read_digest: querying locally");
|
||||
return _proxy->query_singular_local_digest(_schema, _cmd, _partition_range);
|
||||
} else {
|
||||
auto& ms = net::get_local_messaging_service();
|
||||
return ms.send_read_digest(net::messaging_service::msg_addr{ep, 0}, timeout, *_cmd, _partition_range).then([] (query::result_digest d, rpc::optional<api::timestamp_type> t) {
|
||||
tracing::trace(_trace_state, "read_digest: sending a message to /{}", ep);
|
||||
return ms.send_read_digest(net::messaging_service::msg_addr{ep, 0}, timeout, *_cmd, _partition_range).then([this, ep] (query::result_digest d, rpc::optional<api::timestamp_type> t) {
|
||||
tracing::trace(_trace_state, "read_digest: got response from /{}", ep);
|
||||
return make_ready_future<query::result_digest, api::timestamp_type>(d, t ? t.value() : api::missing_timestamp);
|
||||
});
|
||||
}
|
||||
@@ -3271,10 +3280,11 @@ void storage_proxy::init_messaging_service() {
|
||||
}
|
||||
|
||||
return do_with(std::move(pr), get_local_shared_storage_proxy(), std::move(trace_state_ptr), [&cinfo, cmd = make_lw_shared<query::read_command>(std::move(cmd)), src_addr = std::move(src_addr)] (const query::partition_range& pr, shared_ptr<storage_proxy>& p, tracing::trace_state_ptr& trace_state_ptr) mutable {
|
||||
auto src_ip = src_addr.addr;
|
||||
return get_schema_for_read(cmd->schema_version, std::move(src_addr)).then([cmd, &pr, &p] (schema_ptr s) {
|
||||
return p->query_singular_local(std::move(s), cmd, pr);
|
||||
}).finally([&trace_state_ptr] () mutable {
|
||||
tracing::trace(trace_state_ptr, "read_data handling is done");
|
||||
}).finally([&trace_state_ptr, src_ip] () mutable {
|
||||
tracing::trace(trace_state_ptr, "read_data handling is done, sending a response to /{}", src_ip);
|
||||
});
|
||||
});
|
||||
});
|
||||
@@ -3287,10 +3297,11 @@ void storage_proxy::init_messaging_service() {
|
||||
tracing::trace(trace_state_ptr, "read_mutation_data: message received from /{}", src_addr.addr);
|
||||
}
|
||||
return do_with(std::move(pr), get_local_shared_storage_proxy(), std::move(trace_state_ptr), [&cinfo, cmd = make_lw_shared<query::read_command>(std::move(cmd)), src_addr = std::move(src_addr)] (const query::partition_range& pr, shared_ptr<storage_proxy>& p, tracing::trace_state_ptr& trace_state_ptr) mutable {
|
||||
auto src_ip = src_addr.addr;
|
||||
return get_schema_for_read(cmd->schema_version, std::move(src_addr)).then([cmd, &pr, &p] (schema_ptr s) {
|
||||
return p->query_mutations_locally(std::move(s), cmd, pr);
|
||||
}).finally([&trace_state_ptr] () mutable {
|
||||
tracing::trace(trace_state_ptr, "read_mutation_data handling is done");
|
||||
}).finally([&trace_state_ptr, src_ip] () mutable {
|
||||
tracing::trace(trace_state_ptr, "read_mutation_data handling is done, sending a response to /{}", src_ip);
|
||||
});
|
||||
});
|
||||
});
|
||||
@@ -3303,10 +3314,11 @@ void storage_proxy::init_messaging_service() {
|
||||
tracing::trace(trace_state_ptr, "read_digest: message received from /{}", src_addr.addr);
|
||||
}
|
||||
return do_with(std::move(pr), get_local_shared_storage_proxy(), std::move(trace_state_ptr), [&cinfo, cmd = make_lw_shared<query::read_command>(std::move(cmd)), src_addr = std::move(src_addr)] (const query::partition_range& pr, shared_ptr<storage_proxy>& p, tracing::trace_state_ptr& trace_state_ptr) mutable {
|
||||
auto src_ip = src_addr.addr;
|
||||
return get_schema_for_read(cmd->schema_version, std::move(src_addr)).then([cmd, &pr, &p] (schema_ptr s) {
|
||||
return p->query_singular_local_digest(std::move(s), cmd, pr);
|
||||
}).finally([&trace_state_ptr] () mutable {
|
||||
tracing::trace(trace_state_ptr, "read_digest handling is done");
|
||||
}).finally([&trace_state_ptr, src_ip] () mutable {
|
||||
tracing::trace(trace_state_ptr, "read_digest handling is done, sending a response to /{}", src_ip);
|
||||
});
|
||||
});
|
||||
});
|
||||
@@ -3394,7 +3406,7 @@ public:
|
||||
|
||||
boost::range::make_heap(_runs, cmp);
|
||||
|
||||
return repeat_until_value([this, cmp = std::move(cmp), partitions = std::vector<partition>(), row_count = 0u] () mutable {
|
||||
return repeat_until_value([this, cmp = std::move(cmp), partitions = std::vector<partition>(), row_count = 0u, partition_count = 0u] () mutable {
|
||||
std::experimental::optional<reconcilable_result> ret;
|
||||
|
||||
boost::range::pop_heap(_runs, cmp);
|
||||
@@ -3414,6 +3426,7 @@ public:
|
||||
partitions.push_back(p);
|
||||
row_count += p._row_count;
|
||||
}
|
||||
partition_count += p._row_count > 0;
|
||||
if (row_count < _cmd->row_limit) {
|
||||
next.advance();
|
||||
if (next.has_more()) {
|
||||
@@ -3422,7 +3435,7 @@ public:
|
||||
_runs.pop_back();
|
||||
}
|
||||
}
|
||||
if (_runs.empty() || row_count >= _cmd->row_limit) {
|
||||
if (_runs.empty() || row_count >= _cmd->row_limit || partition_count >= _cmd->partition_limit) {
|
||||
ret = reconcilable_result(row_count, std::move(partitions));
|
||||
}
|
||||
return make_ready_future<std::experimental::optional<reconcilable_result>>(std::move(ret));
|
||||
|
||||
@@ -297,6 +297,9 @@ public:
|
||||
*
|
||||
* Partitions for each range will be ordered according to decorated_key ordering. Results for
|
||||
* each range from "partition_ranges" may appear in any order.
|
||||
*
|
||||
* IMPORTANT: Not all fibers started by this method have to be done by the time it returns so no
|
||||
* parameter can be changed after being passed to this method.
|
||||
*/
|
||||
future<foreign_ptr<lw_shared_ptr<query::result>>> query(schema_ptr,
|
||||
lw_shared_ptr<query::read_command> cmd,
|
||||
|
||||
@@ -559,9 +559,9 @@ public:
|
||||
auto orig_map = get_range_to_address_map(keyspace, get_tokens_in_local_dc());
|
||||
std::unordered_map<range<token>, std::vector<inet_address>> filtered_map;
|
||||
for (auto entry : orig_map) {
|
||||
filtered_map[entry.first].reserve(entry.second.size());
|
||||
std::remove_copy_if(entry.second.begin(), entry.second.end(),
|
||||
filtered_map[entry.first].begin(), filter);
|
||||
auto& addresses = filtered_map[entry.first];
|
||||
addresses.reserve(entry.second.size());
|
||||
std::copy_if(entry.second.begin(), entry.second.end(), std::back_inserter(addresses), filter);
|
||||
}
|
||||
|
||||
return filtered_map;
|
||||
|
||||
@@ -40,67 +40,37 @@
|
||||
#pragma once
|
||||
|
||||
#include "core/sstring.hh"
|
||||
#include "schema.hh"
|
||||
#include "compound_compat.hh"
|
||||
#include <cmath>
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
|
||||
class column_name_helper {
|
||||
private:
|
||||
static void may_grow(std::vector<bytes>& v, size_t target_size) {
|
||||
static inline void may_grow(std::vector<bytes_opt>& v, size_t target_size) {
|
||||
if (target_size > v.size()) {
|
||||
v.resize(target_size);
|
||||
}
|
||||
}
|
||||
public:
|
||||
static void min_max_components(std::vector<bytes>& min_seen, std::vector<bytes>& max_seen, const std::vector<bytes_view>& column_names) {
|
||||
may_grow(min_seen, column_names.size());
|
||||
may_grow(max_seen, column_names.size());
|
||||
template <typename T>
|
||||
static void min_max_components(const schema& schema, std::vector<bytes_opt>& min_seen, std::vector<bytes_opt>& max_seen, T components) {
|
||||
may_grow(min_seen, schema.clustering_key_size());
|
||||
may_grow(max_seen, schema.clustering_key_size());
|
||||
|
||||
for (auto i = 0U; i < column_names.size(); i++) {
|
||||
auto& name = column_names[i];
|
||||
if (max_seen[i].size() == 0 || name > bytes_view(max_seen[i])) {
|
||||
max_seen[i] = bytes(name.data(), name.size());
|
||||
auto& types = schema.clustering_key_type()->types();
|
||||
auto i = 0U;
|
||||
for (auto& value : components) {
|
||||
auto& type = types[i];
|
||||
|
||||
if (!max_seen[i] || type->compare(value, max_seen[i].value()) > 0) {
|
||||
max_seen[i] = bytes(value.data(), value.size());
|
||||
}
|
||||
if (min_seen[i].size() == 0 || name < bytes_view(min_seen[i])) {
|
||||
min_seen[i] = bytes(name.data(), name.size());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void merge_max_components(std::vector<bytes>& to, std::vector<bytes>&& from) {
|
||||
if (to.empty()) {
|
||||
to = std::move(from);
|
||||
return;
|
||||
}
|
||||
|
||||
if (from.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
may_grow(to, from.size());
|
||||
|
||||
for (auto i = 0U; i < from.size(); i++) {
|
||||
if (to[i].size() == 0 || bytes_view(from[i]) > bytes_view(to[i])) {
|
||||
to[i] = std::move(from[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void merge_min_components(std::vector<bytes>& to, std::vector<bytes>&& from) {
|
||||
if (to.empty()) {
|
||||
to = std::move(from);
|
||||
}
|
||||
|
||||
if (from.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
may_grow(to, from.size());
|
||||
|
||||
for (auto i = 0U; i < from.size(); i++) {
|
||||
if (to[i].size() == 0 || bytes_view(from[i]) < bytes_view(to[i])) {
|
||||
to[i] = std::move(from[i]);
|
||||
if (!min_seen[i] || type->compare(value, min_seen[i].value()) < 0) {
|
||||
min_seen[i] = bytes(value.data(), value.size());
|
||||
}
|
||||
i++;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@@ -262,10 +262,10 @@ public:
|
||||
// We received more data than we actually care about, so process
|
||||
// the beginning of the buffer, and return the rest to the stream
|
||||
auto segment = data.share(0, _remain);
|
||||
process(segment);
|
||||
auto ret = process(segment);
|
||||
data.trim_front(_remain - segment.size());
|
||||
_remain -= (_remain - segment.size());
|
||||
if (_remain == 0) {
|
||||
if (_remain == 0 && ret == proceed::yes) {
|
||||
verify_end_state();
|
||||
}
|
||||
return make_ready_future<unconsumed_remainder>(std::move(data));
|
||||
|
||||
@@ -48,24 +48,93 @@
|
||||
#include <iterator>
|
||||
#include "sstables.hh"
|
||||
#include "compaction.hh"
|
||||
#include "timestamp.hh"
|
||||
#include "cql3/statements/property_definitions.hh"
|
||||
|
||||
static constexpr double DEFAULT_MAX_SSTABLE_AGE_DAYS = 365;
|
||||
static constexpr int64_t DEFAULT_BASE_TIME_SECONDS = 60;
|
||||
|
||||
struct duration_conversor {
|
||||
// Convert given duration to TargetDuration and return value as timestamp.
|
||||
template <typename TargetDuration, typename SourceDuration>
|
||||
static api::timestamp_type convert(SourceDuration d) {
|
||||
return std::chrono::duration_cast<TargetDuration>(d).count();
|
||||
}
|
||||
|
||||
// Convert given duration to duration that is represented by the string
|
||||
// target_duration, and return value as timestamp.
|
||||
template <typename SourceDuration>
|
||||
static api::timestamp_type convert(const sstring& target_duration, SourceDuration d) {
|
||||
if (target_duration == "HOURS") {
|
||||
return convert<std::chrono::hours>(d);
|
||||
} else if (target_duration == "MICROSECONDS") {
|
||||
return convert<std::chrono::microseconds>(d);
|
||||
} else if (target_duration == "MILLISECONDS") {
|
||||
return convert<std::chrono::milliseconds>(d);
|
||||
} else if (target_duration == "MINUTES") {
|
||||
return convert<std::chrono::minutes>(d);
|
||||
} else if (target_duration == "NANOSECONDS") {
|
||||
return convert<std::chrono::nanoseconds>(d);
|
||||
} else if (target_duration == "SECONDS") {
|
||||
return convert<std::chrono::seconds>(d);
|
||||
} else {
|
||||
throw std::runtime_error(sprint("target duration %s is not available", target_duration));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class date_tiered_compaction_strategy_options {
|
||||
const sstring DEFAULT_TIMESTAMP_RESOLUTION = "MICROSECONDS";
|
||||
const sstring TIMESTAMP_RESOLUTION_KEY = "timestamp_resolution";
|
||||
const sstring MAX_SSTABLE_AGE_KEY = "max_sstable_age_days";
|
||||
const sstring BASE_TIME_KEY = "base_time_seconds";
|
||||
|
||||
api::timestamp_type max_sstable_age;
|
||||
api::timestamp_type base_time;
|
||||
public:
|
||||
date_tiered_compaction_strategy_options(const std::map<sstring, sstring>& options) {
|
||||
using namespace cql3::statements;
|
||||
|
||||
auto tmp_value = get_value(options, TIMESTAMP_RESOLUTION_KEY);
|
||||
auto target_unit = tmp_value ? tmp_value.value() : DEFAULT_TIMESTAMP_RESOLUTION;
|
||||
|
||||
tmp_value = get_value(options, MAX_SSTABLE_AGE_KEY);
|
||||
auto fractional_days = property_definitions::to_double(MAX_SSTABLE_AGE_KEY, tmp_value, DEFAULT_MAX_SSTABLE_AGE_DAYS);
|
||||
int64_t max_sstable_age_in_hours = std::lround(fractional_days * 24);
|
||||
max_sstable_age = duration_conversor::convert(target_unit, std::chrono::hours(max_sstable_age_in_hours));
|
||||
|
||||
tmp_value = get_value(options, BASE_TIME_KEY);
|
||||
auto base_time_seconds = property_definitions::to_long(BASE_TIME_KEY, tmp_value, DEFAULT_BASE_TIME_SECONDS);
|
||||
base_time = duration_conversor::convert(target_unit, std::chrono::seconds(base_time_seconds));
|
||||
}
|
||||
|
||||
date_tiered_compaction_strategy_options() {
|
||||
auto max_sstable_age_in_hours = int64_t(DEFAULT_MAX_SSTABLE_AGE_DAYS * 24);
|
||||
max_sstable_age = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::hours(max_sstable_age_in_hours)).count();
|
||||
base_time = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::seconds(DEFAULT_BASE_TIME_SECONDS)).count();
|
||||
}
|
||||
private:
|
||||
static std::experimental::optional<sstring> get_value(const std::map<sstring, sstring>& options, const sstring& name) {
|
||||
auto it = options.find(name);
|
||||
if (it == options.end()) {
|
||||
return std::experimental::nullopt;
|
||||
}
|
||||
return it->second;
|
||||
}
|
||||
|
||||
friend class date_tiered_manifest;
|
||||
};
|
||||
|
||||
class date_tiered_manifest {
|
||||
static logging::logger logger;
|
||||
|
||||
// TODO: implement date_tiered_compaction_strategy_options.
|
||||
db_clock::duration _max_sstable_age;
|
||||
db_clock::duration _base_time;
|
||||
date_tiered_compaction_strategy_options _options;
|
||||
public:
|
||||
date_tiered_manifest() = delete;
|
||||
|
||||
date_tiered_manifest(const std::map<sstring, sstring>& options) {
|
||||
auto max_sstable_age_in_hours = int64_t(DEFAULT_MAX_SSTABLE_AGE_DAYS * 24);
|
||||
_max_sstable_age = std::chrono::duration_cast<db_clock::duration>(std::chrono::hours(max_sstable_age_in_hours));
|
||||
_base_time = std::chrono::duration_cast<db_clock::duration>(std::chrono::seconds(DEFAULT_BASE_TIME_SECONDS));
|
||||
|
||||
date_tiered_manifest(const std::map<sstring, sstring>& options)
|
||||
: _options(options)
|
||||
{
|
||||
// FIXME: implement option to disable tombstone compaction.
|
||||
#if 0
|
||||
if (!options.containsKey(AbstractCompactionStrategy.TOMBSTONE_COMPACTION_INTERVAL_OPTION) && !options.containsKey(AbstractCompactionStrategy.TOMBSTONE_THRESHOLD_OPTION))
|
||||
@@ -119,8 +188,8 @@ public:
|
||||
for (auto& entry : *cf.get_sstables()) {
|
||||
sstables.push_back(entry);
|
||||
}
|
||||
auto candidates = filter_old_sstables(sstables, _max_sstable_age, now);
|
||||
auto buckets = get_buckets(create_sst_and_min_timestamp_pairs(candidates), _base_time, base, now);
|
||||
auto candidates = filter_old_sstables(sstables, _options.max_sstable_age, now);
|
||||
auto buckets = get_buckets(create_sst_and_min_timestamp_pairs(candidates), _options.base_time, base, now);
|
||||
|
||||
for (auto& bucket : buckets) {
|
||||
if (bucket.size() >= size_t(cf.schema()->min_compaction_threshold())) {
|
||||
@@ -161,11 +230,11 @@ private:
|
||||
get_compaction_candidates(column_family& cf, std::vector<sstables::shared_sstable> candidate_sstables, int64_t now, int base) {
|
||||
int min_threshold = cf.schema()->min_compaction_threshold();
|
||||
int max_threshold = cf.schema()->max_compaction_threshold();
|
||||
auto candidates = filter_old_sstables(candidate_sstables, _max_sstable_age, now);
|
||||
auto candidates = filter_old_sstables(candidate_sstables, _options.max_sstable_age, now);
|
||||
|
||||
auto buckets = get_buckets(create_sst_and_min_timestamp_pairs(candidates), _base_time, base, now);
|
||||
auto buckets = get_buckets(create_sst_and_min_timestamp_pairs(candidates), _options.base_time, base, now);
|
||||
|
||||
return newest_bucket(buckets, min_threshold, max_threshold, now, _base_time);
|
||||
return newest_bucket(buckets, min_threshold, max_threshold, now, _options.base_time);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -186,12 +255,11 @@ private:
|
||||
* @return a list of sstables with the oldest sstables excluded
|
||||
*/
|
||||
static std::vector<sstables::shared_sstable>
|
||||
filter_old_sstables(std::vector<sstables::shared_sstable> sstables, db_clock::duration max_sstable_age, int64_t now) {
|
||||
int64_t max_sstable_age_count = std::chrono::duration_cast<std::chrono::microseconds>(max_sstable_age).count();
|
||||
if (max_sstable_age_count == 0) {
|
||||
filter_old_sstables(std::vector<sstables::shared_sstable> sstables, api::timestamp_type max_sstable_age, int64_t now) {
|
||||
if (max_sstable_age == 0) {
|
||||
return sstables;
|
||||
}
|
||||
int64_t cutoff = now - max_sstable_age_count;
|
||||
int64_t cutoff = now - max_sstable_age;
|
||||
|
||||
sstables.erase(std::remove_if(sstables.begin(), sstables.end(), [cutoff] (auto& sst) {
|
||||
return sst->get_stats_metadata().max_timestamp < cutoff;
|
||||
@@ -275,14 +343,14 @@ private:
|
||||
* Each bucket is also a list of files ordered from newest to oldest.
|
||||
*/
|
||||
std::vector<std::vector<sstables::shared_sstable>>
|
||||
get_buckets(std::vector<std::pair<sstables::shared_sstable,int64_t>>&& files, db_clock::duration time_unit, int base, int64_t now) const {
|
||||
get_buckets(std::vector<std::pair<sstables::shared_sstable,int64_t>>&& files, api::timestamp_type time_unit, int base, int64_t now) const {
|
||||
// Sort files by age. Newest first.
|
||||
std::sort(files.begin(), files.end(), [] (auto& i, auto& j) {
|
||||
return i.second > j.second;
|
||||
});
|
||||
|
||||
std::vector<std::vector<sstables::shared_sstable>> buckets;
|
||||
auto target = get_initial_target(now, std::chrono::duration_cast<std::chrono::microseconds>(time_unit).count());
|
||||
auto target = get_initial_target(now, time_unit);
|
||||
auto it = files.begin();
|
||||
|
||||
while (it != files.end()) {
|
||||
@@ -329,12 +397,12 @@ private:
|
||||
*/
|
||||
std::vector<sstables::shared_sstable>
|
||||
newest_bucket(std::vector<std::vector<sstables::shared_sstable>>& buckets, int min_threshold, int max_threshold,
|
||||
int64_t now, db_clock::duration base_time) {
|
||||
int64_t now, api::timestamp_type base_time) {
|
||||
|
||||
// If the "incoming window" has at least minThreshold SSTables, choose that one.
|
||||
// For any other bucket, at least 2 SSTables is enough.
|
||||
// In any case, limit to maxThreshold SSTables.
|
||||
target incoming_window = get_initial_target(now, std::chrono::duration_cast<std::chrono::microseconds>(base_time).count());
|
||||
target incoming_window = get_initial_target(now, base_time);
|
||||
for (auto& bucket : buckets) {
|
||||
auto min_timestamp = bucket.front()->get_stats_metadata().min_timestamp;
|
||||
if (bucket.size() >= size_t(min_threshold) ||
|
||||
|
||||
@@ -155,10 +155,6 @@ struct column_stats {
|
||||
/** histogram of tombstone drop time */
|
||||
streaming_histogram tombstone_histogram;
|
||||
|
||||
/** max and min column names according to comparator */
|
||||
std::vector<bytes> min_column_names;
|
||||
std::vector<bytes> max_column_names;
|
||||
|
||||
bool has_legacy_counter_shards;
|
||||
|
||||
column_stats() :
|
||||
@@ -211,8 +207,8 @@ private:
|
||||
std::set<int> _ancestors;
|
||||
streaming_histogram _estimated_tombstone_drop_time{TOMBSTONE_HISTOGRAM_BIN_SIZE};
|
||||
int _sstable_level = 0;
|
||||
std::vector<bytes> _min_column_names;
|
||||
std::vector<bytes> _max_column_names;
|
||||
std::vector<bytes_opt> _min_column_names;
|
||||
std::vector<bytes_opt> _max_column_names;
|
||||
bool _has_legacy_counter_shards = false;
|
||||
|
||||
/**
|
||||
@@ -226,10 +222,14 @@ private:
|
||||
/*
|
||||
* Convert a vector of bytes into a disk array of disk_string<uint16_t>.
|
||||
*/
|
||||
static void convert(disk_array<uint32_t, disk_string<uint16_t>>&to, std::vector<bytes>&& from) {
|
||||
to.elements.resize(from.size());
|
||||
static void convert(disk_array<uint32_t, disk_string<uint16_t>>&to, std::vector<bytes_opt>&& from) {
|
||||
for (auto i = 0U; i < from.size(); i++) {
|
||||
to.elements[i].value = std::move(from[i]);
|
||||
if (!from[i]) {
|
||||
break;
|
||||
}
|
||||
disk_string<uint16_t> s;
|
||||
s.value = std::move(from[i].value());
|
||||
to.elements.push_back(std::move(s));
|
||||
}
|
||||
}
|
||||
public:
|
||||
@@ -286,31 +286,25 @@ public:
|
||||
_sstable_level = sstable_level;
|
||||
}
|
||||
|
||||
void update_min_column_names(std::vector<bytes>&& min_column_names) {
|
||||
if (min_column_names.size() > 0) {
|
||||
column_name_helper::merge_min_components(_min_column_names, std::move(min_column_names));
|
||||
}
|
||||
std::vector<bytes_opt>& min_column_names() {
|
||||
return _min_column_names;
|
||||
}
|
||||
|
||||
void update_max_column_names(std::vector<bytes>&& max_column_names) {
|
||||
if (max_column_names.size() > 0) {
|
||||
column_name_helper::merge_max_components(_max_column_names, std::move(max_column_names));
|
||||
}
|
||||
std::vector<bytes_opt>& max_column_names() {
|
||||
return _max_column_names;
|
||||
}
|
||||
|
||||
void update_has_legacy_counter_shards(bool has_legacy_counter_shards) {
|
||||
_has_legacy_counter_shards = _has_legacy_counter_shards || has_legacy_counter_shards;
|
||||
}
|
||||
|
||||
void update(column_stats&& stats) {
|
||||
void update(const schema& s, column_stats&& stats) {
|
||||
update_min_timestamp(stats.min_timestamp.get());
|
||||
update_max_timestamp(stats.max_timestamp.get());
|
||||
update_max_local_deletion_time(stats.max_local_deletion_time.get());
|
||||
add_row_size(stats.row_size);
|
||||
add_column_count(stats.column_count);
|
||||
merge_tombstone_histogram(stats.tombstone_histogram);
|
||||
update_min_column_names(std::move(stats.min_column_names));
|
||||
update_max_column_names(std::move(stats.max_column_names));
|
||||
update_has_legacy_counter_shards(stats.has_legacy_counter_shards);
|
||||
}
|
||||
|
||||
|
||||
@@ -28,6 +28,7 @@
|
||||
#include "unimplemented.hh"
|
||||
#include "utils/move.hh"
|
||||
#include "dht/i_partitioner.hh"
|
||||
#include <seastar/core/byteorder.hh>
|
||||
|
||||
namespace sstables {
|
||||
|
||||
@@ -107,10 +108,12 @@ private:
|
||||
key_view _key;
|
||||
const io_priority_class* _pc = nullptr;
|
||||
query::clustering_key_filtering_context _ck_filtering;
|
||||
query::clustering_key_filter _filter;
|
||||
bool _in_current_ck_range = false;
|
||||
query::clustering_row_ranges::const_iterator _current_ck_range;
|
||||
query::clustering_row_ranges::const_iterator _ck_range_end;
|
||||
|
||||
bool _skip_partition;
|
||||
bool _skip_clustering_row;
|
||||
bool _skip_partition = false;
|
||||
bool _skip_clustering_row = false;
|
||||
|
||||
// We don't have "end of clustering row" markers. So we know that the current
|
||||
// row has ended once we get something (e.g. a live cell) that belongs to another
|
||||
@@ -123,8 +126,9 @@ private:
|
||||
mutation_fragment_opt _ready;
|
||||
|
||||
stdx::optional<new_mutation> _mutation;
|
||||
bool _is_mutation_end;
|
||||
bool _is_mutation_end = false;
|
||||
|
||||
public:
|
||||
struct column {
|
||||
bool is_static;
|
||||
bytes_view col_name;
|
||||
@@ -134,6 +138,7 @@ private:
|
||||
bytes collection_extra_data;
|
||||
bytes cell;
|
||||
const column_definition *cdef;
|
||||
bool is_present;
|
||||
|
||||
static constexpr size_t static_size = 2;
|
||||
|
||||
@@ -156,36 +161,32 @@ private:
|
||||
throw malformed_sstable_exception(sprint("Found %d clustering elements in column name. Was not expecting that!", clustering.size()));
|
||||
}
|
||||
|
||||
bool is_present(api::timestamp_type timestamp) {
|
||||
return cdef && timestamp > cdef->dropped_at();
|
||||
static bool check_static(const schema& schema, bytes_view col) {
|
||||
return composite_view(col, schema.is_compound()).is_static();
|
||||
}
|
||||
|
||||
static bool check_static(bytes_view col) {
|
||||
static bytes static_row(static_size, 0xff);
|
||||
return col.compare(0, static_size, static_row) == 0;
|
||||
static bytes_view fix_static_name(const schema& schema, bytes_view col) {
|
||||
return fix_static_name(col, check_static(schema, col));
|
||||
}
|
||||
|
||||
static bytes_view fix_static_name(bytes_view col) {
|
||||
if (check_static(col)) {
|
||||
static bytes_view fix_static_name(bytes_view col, bool is_static) {
|
||||
if(is_static) {
|
||||
col.remove_prefix(static_size);
|
||||
}
|
||||
return col;
|
||||
}
|
||||
|
||||
std::vector<bytes> extract_clustering_key(const schema& schema) {
|
||||
if (!schema.is_compound()) {
|
||||
return { to_bytes(col_name) };
|
||||
} else {
|
||||
return composite_view(col_name).explode();
|
||||
}
|
||||
return composite_view(col_name, schema.is_compound()).explode();
|
||||
}
|
||||
column(const schema& schema, bytes_view col)
|
||||
: is_static(check_static(col))
|
||||
, col_name(fix_static_name(col))
|
||||
column(const schema& schema, bytes_view col, api::timestamp_type timestamp)
|
||||
: is_static(check_static(schema, col))
|
||||
, col_name(fix_static_name(col, is_static))
|
||||
, clustering(extract_clustering_key(schema))
|
||||
, collection_extra_data(is_collection(schema) ? pop_back(clustering) : bytes()) // collections are not supported with COMPACT STORAGE, so this is fine
|
||||
, cell(!schema.is_dense() ? pop_back(clustering) : (*(schema.regular_begin())).name()) // dense: cell name is not provided. It is the only regular column
|
||||
, cdef(schema.get_column_definition(cell))
|
||||
, is_present(cdef && timestamp > cdef->dropped_at())
|
||||
{
|
||||
|
||||
if (is_static) {
|
||||
@@ -195,9 +196,15 @@ private:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (is_present && is_static != cdef->is_static()) {
|
||||
throw malformed_sstable_exception(seastar::format("Mismatch between {} cell and {} column definition",
|
||||
is_static ? "static" : "non-static", cdef->is_static() ? "static" : "non-static"));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
private:
|
||||
// Notes for collection mutation:
|
||||
//
|
||||
// While we could in theory generate the mutation for the elements as they
|
||||
@@ -266,6 +273,45 @@ private:
|
||||
}
|
||||
}
|
||||
|
||||
// We rely on the fact that the first 'S' in SSTables stands for 'sorted'
|
||||
// and the clustering row keys are always in an ascending order.
|
||||
bool is_in_range(const clustering_key_prefix& ck) {
|
||||
// This is a wrong comparator to use here, but at the moment the correct
|
||||
// one has a very serious disadvantage of not existing (see #1446).
|
||||
clustering_key_prefix::prefix_equality_less_compare cmp(*_schema);
|
||||
|
||||
while (_current_ck_range != _ck_range_end) {
|
||||
if (!_in_current_ck_range && _current_ck_range->start()) {
|
||||
auto& start = *_current_ck_range->start();
|
||||
if ((start.is_inclusive() && cmp(ck, start.value())) || (!start.is_inclusive() && !cmp(start.value(), ck))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
// All subsequent clustering keys are larger than the start of this
|
||||
// range so there is no need to check that again.
|
||||
_in_current_ck_range = true;
|
||||
|
||||
if (!_current_ck_range->end()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
auto& end = *_current_ck_range->end();
|
||||
if ((!end.is_inclusive() && cmp(ck, end.value())) || (end.is_inclusive() && !cmp(end.value(), ck))) {
|
||||
return true;
|
||||
}
|
||||
|
||||
++_current_ck_range;
|
||||
_in_current_ck_range = false;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void set_up_ck_ranges(const partition_key& pk) {
|
||||
auto& range = _ck_filtering.get_ranges(pk);
|
||||
_current_ck_range = range.begin();
|
||||
_ck_range_end = range.end();
|
||||
_in_current_ck_range = false;
|
||||
}
|
||||
public:
|
||||
mutation_opt mut;
|
||||
|
||||
@@ -277,8 +323,9 @@ public:
|
||||
, _key(key_view(key))
|
||||
, _pc(&pc)
|
||||
, _ck_filtering(ck_filtering)
|
||||
, _filter(_ck_filtering.get_filter_for_sorted(partition_key::from_exploded(*_schema, key.explode(*_schema))))
|
||||
{ }
|
||||
{
|
||||
set_up_ck_ranges(partition_key::from_exploded(*_schema, key.explode(*_schema)));
|
||||
}
|
||||
|
||||
mp_row_consumer(const key& key,
|
||||
const schema_ptr schema,
|
||||
@@ -305,7 +352,7 @@ public:
|
||||
_is_mutation_end = false;
|
||||
_skip_partition = false;
|
||||
_skip_clustering_row = false;
|
||||
_filter = _ck_filtering.get_filter_for_sorted(_mutation->key);
|
||||
set_up_ck_ranges(_mutation->key);
|
||||
return proceed::no;
|
||||
} else {
|
||||
throw malformed_sstable_exception(sprint("Key mismatch. Got %s while processing %s", to_hex(bytes_view(key)).c_str(), to_hex(bytes_view(_key)).c_str()));
|
||||
@@ -344,7 +391,7 @@ public:
|
||||
flush();
|
||||
}
|
||||
if (!_in_progress) {
|
||||
_skip_clustering_row = !is_static && !_filter(pos.key());
|
||||
_skip_clustering_row = !is_static && !is_in_range(pos.key());
|
||||
if (is_static) {
|
||||
_in_progress = mutation_fragment(static_row());
|
||||
} else {
|
||||
@@ -384,7 +431,7 @@ public:
|
||||
return proceed::yes;
|
||||
}
|
||||
|
||||
struct column col(*_schema, col_name);
|
||||
struct column col(*_schema, col_name, timestamp);
|
||||
|
||||
auto clustering_prefix = exploded_clustering_prefix(std::move(col.clustering));
|
||||
auto ret = flush_if_needed(col.is_static, clustering_prefix);
|
||||
@@ -398,7 +445,7 @@ public:
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (!col.is_present(timestamp)) {
|
||||
if (!col.is_present) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -426,10 +473,11 @@ public:
|
||||
return proceed::yes;
|
||||
}
|
||||
|
||||
struct column col(*_schema, col_name);
|
||||
auto timestamp = deltime.marked_for_delete_at;
|
||||
struct column col(*_schema, col_name, timestamp);
|
||||
gc_clock::duration secs(deltime.local_deletion_time);
|
||||
|
||||
return consume_deleted_cell(col, deltime.marked_for_delete_at, gc_clock::time_point(secs));
|
||||
return consume_deleted_cell(col, timestamp, gc_clock::time_point(secs));
|
||||
}
|
||||
|
||||
proceed consume_deleted_cell(column &col, int64_t timestamp, gc_clock::time_point ttl) {
|
||||
@@ -444,7 +492,7 @@ public:
|
||||
_in_progress->as_clustering_row().apply(rm);
|
||||
return ret;
|
||||
}
|
||||
if (!col.is_present(timestamp)) {
|
||||
if (!col.is_present) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -510,7 +558,7 @@ public:
|
||||
return proceed::yes;
|
||||
}
|
||||
|
||||
auto start = composite_view(column::fix_static_name(start_col)).explode();
|
||||
auto start = composite_view(column::fix_static_name(*_schema, start_col)).explode();
|
||||
|
||||
// Note how this is slightly different from the check in is_collection. Collection tombstones
|
||||
// do not have extra data.
|
||||
@@ -520,7 +568,7 @@ public:
|
||||
if (start.size() <= _schema->clustering_key_size()) {
|
||||
auto start_ck = clustering_key_prefix::from_exploded(std::move(start));
|
||||
auto start_kind = start_marker_to_bound_kind(start_col);
|
||||
auto end = clustering_key_prefix::from_exploded(composite_view(column::fix_static_name(end_col)).explode());
|
||||
auto end = clustering_key_prefix::from_exploded(composite_view(column::fix_static_name(*_schema, end_col)).explode());
|
||||
auto end_kind = end_marker_to_bound_kind(end_col);
|
||||
if (range_tombstone::is_single_clustering_row_tombstone(*_schema, start_ck, start_kind, end, end_kind)) {
|
||||
auto ret = flush_if_needed(std::move(start_ck));
|
||||
@@ -555,8 +603,8 @@ public:
|
||||
return *_pc;
|
||||
}
|
||||
|
||||
bool is_mutation_end() const {
|
||||
return _is_mutation_end;
|
||||
bool get_and_reset_is_mutation_end() {
|
||||
return std::exchange(_is_mutation_end, false);
|
||||
}
|
||||
|
||||
stdx::optional<new_mutation> get_mutation() {
|
||||
@@ -576,47 +624,96 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
struct sstable_data_source {
|
||||
shared_sstable _sst;
|
||||
mp_row_consumer _consumer;
|
||||
data_consume_context _context;
|
||||
|
||||
sstable_data_source(shared_sstable sst, mp_row_consumer&& consumer)
|
||||
: _sst(std::move(sst))
|
||||
, _consumer(std::move(consumer))
|
||||
, _context(_sst->data_consume_rows(_consumer))
|
||||
{ }
|
||||
|
||||
sstable_data_source(shared_sstable sst, mp_row_consumer&& consumer, sstable::disk_read_range toread)
|
||||
: _sst(std::move(sst))
|
||||
, _consumer(std::move(consumer))
|
||||
, _context(_sst->data_consume_rows(_consumer, std::move(toread)))
|
||||
{ }
|
||||
|
||||
sstable_data_source(schema_ptr s, shared_sstable sst, const sstables::key& k, const io_priority_class& pc,
|
||||
query::clustering_key_filtering_context ck_filtering, sstable::disk_read_range toread)
|
||||
: _sst(std::move(sst))
|
||||
, _consumer(k, s, ck_filtering, pc)
|
||||
, _context(_sst->data_consume_rows(_consumer, std::move(toread)))
|
||||
{ }
|
||||
};
|
||||
|
||||
class sstable_streamed_mutation : public streamed_mutation::impl {
|
||||
data_consume_context& _context;
|
||||
mp_row_consumer& _consumer;
|
||||
lw_shared_ptr<sstable_data_source> _ds;
|
||||
tombstone _t;
|
||||
bool _finished = false;
|
||||
range_tombstone_stream _range_tombstones;
|
||||
mutation_fragment_opt _current_candidate;
|
||||
mutation_fragment_opt _next_candidate;
|
||||
stdx::optional<position_in_partition> _last_position;
|
||||
position_in_partition::less_compare _cmp;
|
||||
position_in_partition::equal_compare _eq;
|
||||
private:
|
||||
future<mutation_fragment_opt> read_next() {
|
||||
future<stdx::optional<mutation_fragment_opt>> read_next() {
|
||||
// Because of #1203 we may encounter sstables with range tombstones
|
||||
// placed earlier than expected.
|
||||
if (_next_candidate) {
|
||||
auto mf = _range_tombstones.get_next(*_next_candidate);
|
||||
if (_next_candidate || (_current_candidate && _finished)) {
|
||||
assert(_current_candidate);
|
||||
auto mf = _range_tombstones.get_next(*_current_candidate);
|
||||
if (!mf) {
|
||||
mf = move_and_disengage(_next_candidate);
|
||||
mf = move_and_disengage(_current_candidate);
|
||||
_current_candidate = move_and_disengage(_next_candidate);
|
||||
}
|
||||
return make_ready_future<mutation_fragment_opt>(std::move(mf));
|
||||
return make_ready_future<stdx::optional<mutation_fragment_opt>>(std::move(mf));
|
||||
}
|
||||
if (_finished) {
|
||||
return make_ready_future<mutation_fragment_opt>(_range_tombstones.get_next());
|
||||
// No need to update _last_position here. We've already read everything from the sstable.
|
||||
return make_ready_future<stdx::optional<mutation_fragment_opt>>(_range_tombstones.get_next());
|
||||
}
|
||||
return _context.read().then([this] {
|
||||
if (_consumer.is_mutation_end()) {
|
||||
_finished = true;
|
||||
return _ds->_context.read().then([this] {
|
||||
_finished = _ds->_consumer.get_and_reset_is_mutation_end();
|
||||
auto mf = _ds->_consumer.get_mutation_fragment();
|
||||
if (mf) {
|
||||
if (mf->is_range_tombstone()) {
|
||||
// If sstable uses promoted index it will repeat relevant range tombstones in
|
||||
// each block. Do not emit these duplicates as they will break the guarantee
|
||||
// that mutation fragments are produced in ascending order.
|
||||
if (!_last_position || !_cmp(*mf, *_last_position)) {
|
||||
_last_position = mf->position();
|
||||
_range_tombstones.apply(std::move(mf->as_range_tombstone()));
|
||||
}
|
||||
} else {
|
||||
// mp_row_consumer may produce mutation_fragments in parts if they are
|
||||
// interrupted by range tombstone duplicate. Make sure they are merged
|
||||
// before emitting them.
|
||||
_last_position = mf->position();
|
||||
if (!_current_candidate) {
|
||||
_current_candidate = std::move(mf);
|
||||
} else if (_current_candidate && _eq(*_current_candidate, *mf)) {
|
||||
_current_candidate->apply(*_schema, std::move(*mf));
|
||||
} else {
|
||||
_next_candidate = std::move(mf);
|
||||
}
|
||||
}
|
||||
}
|
||||
auto mf = _consumer.get_mutation_fragment();
|
||||
if (mf && mf->is_range_tombstone()) {
|
||||
_range_tombstones.apply(std::move(mf->as_range_tombstone()));
|
||||
} else {
|
||||
_next_candidate = std::move(mf);
|
||||
}
|
||||
return read_next();
|
||||
return stdx::optional<mutation_fragment_opt>();
|
||||
});
|
||||
}
|
||||
public:
|
||||
sstable_streamed_mutation(schema_ptr s, dht::decorated_key dk, data_consume_context& context, mp_row_consumer& consumer, tombstone t)
|
||||
: streamed_mutation::impl(s, std::move(dk), t), _context(context), _consumer(consumer), _t(t), _range_tombstones(*s) { }
|
||||
sstable_streamed_mutation(schema_ptr s, dht::decorated_key dk, tombstone t, lw_shared_ptr<sstable_data_source> ds)
|
||||
: streamed_mutation::impl(s, std::move(dk), t), _ds(std::move(ds)), _t(t), _range_tombstones(*s), _cmp(*s), _eq(*s) { }
|
||||
|
||||
virtual future<> fill_buffer() final override {
|
||||
return do_until([this] { return is_end_of_stream() || is_buffer_full(); }, [this] {
|
||||
return read_next().then([this] (mutation_fragment_opt&& mfopt) {
|
||||
return repeat_until_value([this] {
|
||||
return read_next();
|
||||
}).then([this] (mutation_fragment_opt&& mfopt) {
|
||||
if (!mfopt) {
|
||||
_end_of_stream = true;
|
||||
} else {
|
||||
@@ -625,38 +722,17 @@ public:
|
||||
});
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
class sstable_single_streamed_mutation final : public sstable_streamed_mutation {
|
||||
struct data_source {
|
||||
mp_row_consumer _consumer;
|
||||
data_consume_context _context;
|
||||
|
||||
data_source(schema_ptr s, sstable& sst, const sstables::key& k, const io_priority_class& pc,
|
||||
query::clustering_key_filtering_context ck_filtering, uint64_t start, uint64_t end)
|
||||
: _consumer(k, s, ck_filtering, pc)
|
||||
, _context(sst.data_consume_rows(_consumer, start, end))
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
lw_shared_ptr<data_source> _data_source;
|
||||
public:
|
||||
sstable_single_streamed_mutation(schema_ptr s, dht::decorated_key dk, tombstone t, lw_shared_ptr<data_source> ds)
|
||||
: sstable_streamed_mutation(std::move(s), std::move(dk), ds->_context, ds->_consumer, t)
|
||||
, _data_source(ds)
|
||||
{ }
|
||||
|
||||
static future<streamed_mutation> create(schema_ptr s, sstable& sst, const sstables::key& k,
|
||||
static future<streamed_mutation> create(schema_ptr s, shared_sstable sst, const sstables::key& k,
|
||||
query::clustering_key_filtering_context ck_filtering,
|
||||
const io_priority_class& pc, uint64_t start, uint64_t end)
|
||||
const io_priority_class& pc, sstable::disk_read_range toread)
|
||||
{
|
||||
auto ds = make_lw_shared<data_source>(s, sst, k, pc, ck_filtering, start, end);
|
||||
auto ds = make_lw_shared<sstable_data_source>(s, sst, k, pc, ck_filtering, std::move(toread));
|
||||
return ds->_context.read().then([s, ds] {
|
||||
auto mut = ds->_consumer.get_mutation();
|
||||
assert(mut);
|
||||
auto dk = dht::global_partitioner().decorate_key(*s, std::move(mut->key));
|
||||
return make_streamed_mutation<sstable_single_streamed_mutation>(s, std::move(dk), mut->tomb, ds);
|
||||
return make_streamed_mutation<sstable_streamed_mutation>(s, std::move(dk), mut->tomb, ds);
|
||||
});
|
||||
}
|
||||
};
|
||||
@@ -704,37 +780,198 @@ sstables::sstable::read_row(schema_ptr schema,
|
||||
if (!filter_has_key(key)) {
|
||||
return make_ready_future<streamed_mutation_opt>();
|
||||
}
|
||||
return find_disk_ranges(schema, key, ck_filtering, pc).then([this, &key, ck_filtering, &pc, schema] (disk_read_range toread) {
|
||||
if (!toread.found_row()) {
|
||||
_filter_tracker.add_false_positive();
|
||||
}
|
||||
if (!toread) {
|
||||
return make_ready_future<streamed_mutation_opt>();
|
||||
}
|
||||
_filter_tracker.add_true_positive();
|
||||
return sstable_streamed_mutation::create(schema, this->shared_from_this(), key, ck_filtering, pc, std::move(toread)).then([] (auto sm) {
|
||||
return streamed_mutation_opt(std::move(sm));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
// Convenience overload of the global read_be<T>() for buffers of
// `signed char`: reinterprets the pointer as `const char*` and forwards.
template <typename T>
static inline T read_be(const signed char* p) {
    const char* raw = reinterpret_cast<const char*>(p);
    return ::read_be<T>(raw);
}
|
||||
|
||||
template<typename T>
|
||||
static inline T consume_be(bytes_view& p) {
|
||||
T i = read_be<T>(p.data());
|
||||
p.remove_prefix(sizeof(T));
|
||||
return i;
|
||||
}
|
||||
|
||||
// Splits the first `len` bytes off the front of `p`: returns a view over
// them and advances `p` past them.
// No length check is done here; the caller must ensure len <= p.size().
static inline bytes_view consume_bytes(bytes_view& p, size_t len) {
    bytes_view head(p.data(), len);
    p.remove_prefix(len);
    return head;
}
|
||||
|
||||
// Extracts the clustering-key part of a serialized column name by parsing it
// with mp_row_consumer::column (using api::max_timestamp as the timestamp
// argument) and returning the resulting `clustering` member.
static inline clustering_key_prefix get_clustering_key(
        const schema& schema, bytes_view col_name) {
    mp_row_consumer::column parsed(schema, std::move(col_name), api::max_timestamp);
    // `clustering` is a member of a local, so it has to be moved out explicitly.
    return std::move(parsed.clustering);
}
|
||||
|
||||
static bool has_static_columns(const schema& schema, index_entry &ie) {
|
||||
// We can easily check if there are any static columns in this partition,
|
||||
// because the static columns always come first, so the first promoted
|
||||
// index block will start with one, if there are any. The name of a static
|
||||
// column is a composite beginning with a special marker (0xffff).
|
||||
// But we can only assume the column name is composite if the schema is
|
||||
// compound - if it isn't, we cannot have any static columns anyway.
|
||||
//
|
||||
// The first 18 bytes are deletion times (4+8), num blocks (4), and
|
||||
// length of start column (2). Then come the actual column name bytes.
|
||||
// See also composite::is_static().
|
||||
auto data = ie.get_promoted_index_bytes();
|
||||
return schema.is_compound() && data.size() >= 20 && data[18] == -1 && data[19] == -1;
|
||||
}
|
||||
|
||||
future<sstable::disk_read_range>
|
||||
sstables::sstable::find_disk_ranges(
|
||||
schema_ptr schema, const sstables::key& key,
|
||||
query::clustering_key_filtering_context ck_filtering,
|
||||
const io_priority_class& pc) {
|
||||
auto& partitioner = dht::global_partitioner();
|
||||
auto token = partitioner.get_token(key_view(key));
|
||||
|
||||
auto& summary = _summary;
|
||||
|
||||
if (token < partitioner.get_token(key_view(summary.first_key.value))
|
||||
|| token > partitioner.get_token(key_view(summary.last_key.value))) {
|
||||
_filter_tracker.add_false_positive();
|
||||
return make_ready_future<streamed_mutation_opt>();
|
||||
if (token < partitioner.get_token(key_view(_summary.first_key.value))
|
||||
|| token > partitioner.get_token(key_view(_summary.last_key.value))) {
|
||||
return make_ready_future<disk_read_range>();
|
||||
}
|
||||
|
||||
auto summary_idx = adjust_binary_search_index(binary_search(summary.entries, key, token));
|
||||
auto summary_idx = adjust_binary_search_index(binary_search(_summary.entries, key, token));
|
||||
if (summary_idx < 0) {
|
||||
_filter_tracker.add_false_positive();
|
||||
return make_ready_future<streamed_mutation_opt>();
|
||||
return make_ready_future<disk_read_range>();
|
||||
}
|
||||
|
||||
return read_indexes(summary_idx, pc).then([this, schema, ck_filtering, &key, token, summary_idx, &pc] (auto index_list) {
|
||||
auto index_idx = this->binary_search(index_list, key, token);
|
||||
if (index_idx < 0) {
|
||||
_filter_tracker.add_false_positive();
|
||||
return make_ready_future<streamed_mutation_opt>();
|
||||
return make_ready_future<disk_read_range>();
|
||||
}
|
||||
_filter_tracker.add_true_positive();
|
||||
index_entry& ie = index_list[index_idx];
|
||||
if (ie.get_promoted_index_bytes().size() >= 16) {
|
||||
auto&& pkey = partition_key::from_exploded(*schema, key.explode(*schema));
|
||||
auto& ck_ranges = ck_filtering.get_ranges(pkey);
|
||||
if (ck_ranges.size() == 1 && ck_ranges[0].is_full()) {
|
||||
// When no clustering filter is given to sstable::read_row(),
|
||||
// we get here one range unbounded on both sides. This is fine
|
||||
// (the code below will work with an unbounded range), but
|
||||
// let's drop this range to revert to the classic behavior of
|
||||
// reading entire sstable row without using the promoted index
|
||||
} else if (ck_filtering.want_static_columns(pkey) && has_static_columns(*schema, ie)) {
|
||||
// FIXME: If we need to read the static columns and also a
|
||||
// non-full clustering key range, we need to return two byte
|
||||
// ranges in the returned disk_read_range. We don't support
|
||||
// this yet so for now let's fall back to reading the entire
|
||||
// partition which is wasteful but at least correct.
|
||||
// This case should be replaced by correctly adding the static
|
||||
// column's blocks to the return.
|
||||
} else if (ck_ranges.size() == 1) {
|
||||
auto data = ie.get_promoted_index_bytes();
|
||||
// note we already verified above that data.size >= 16
|
||||
sstables::deletion_time deltime;
|
||||
deltime.local_deletion_time = consume_be<uint32_t>(data);
|
||||
deltime.marked_for_delete_at = consume_be<uint64_t>(data);
|
||||
uint32_t num_blocks = consume_be<uint32_t>(data);
|
||||
// We do a linear search on the promoted index. If we were to
|
||||
// look in the same promoted index several times it might have
|
||||
// made sense to build an array of key starts so we can do a
|
||||
// binary search. We could do this once we have a key cache.
|
||||
auto& range_start = ck_ranges[0].start();
|
||||
bool found_range_start = false;
|
||||
uint64_t range_start_pos;
|
||||
auto& range_end = ck_ranges[0].end();
|
||||
|
||||
auto position = index_list[index_idx].position();
|
||||
return this->data_end_position(summary_idx, index_idx, index_list, pc).then([&key, schema, ck_filtering, this, position, &pc] (uint64_t end) {
|
||||
return sstable_single_streamed_mutation::create(schema, *this, key, ck_filtering, pc, position, end).then([] (auto sm) {
|
||||
return streamed_mutation_opt(std::move(sm));
|
||||
});
|
||||
auto cmp = clustering_key_prefix::tri_compare(*schema);
|
||||
while (num_blocks--) {
|
||||
if (data.size() < 2) {
|
||||
// When we break out of this loop, we give up on
|
||||
// using the promoted index, and fall back to
|
||||
// reading the entire partition.
|
||||
// FIXME: this and all other "break" cases below,
|
||||
// are errors. Log them (with rate limit) and count.
|
||||
break;
|
||||
}
|
||||
uint16_t len = consume_be<uint16_t>(data);
|
||||
if (data.size() < len) {
|
||||
break;
|
||||
}
|
||||
// The promoted index contains ranges of full column
|
||||
// names, which may include a clustering key and column.
|
||||
// But we only need to match the clustering key, because
|
||||
// we got a clustering key range to search for.
|
||||
auto start_ck = get_clustering_key(*schema,
|
||||
consume_bytes(data, len));
|
||||
if (data.size() < 2) {
|
||||
break;
|
||||
}
|
||||
len = consume_be<uint16_t>(data);
|
||||
if (data.size() < len) {
|
||||
break;
|
||||
}
|
||||
auto end_ck = get_clustering_key(*schema,
|
||||
consume_bytes(data, len));
|
||||
if (data.size() < 16) {
|
||||
break;
|
||||
}
|
||||
uint64_t offset = consume_be<uint64_t>(data);
|
||||
uint64_t width = consume_be<uint64_t>(data);
|
||||
if (!found_range_start) {
|
||||
if (!range_start || cmp(range_start->value(), end_ck) <= 0) {
|
||||
range_start_pos = ie.position() + offset;
|
||||
found_range_start = true;
|
||||
}
|
||||
}
|
||||
bool found_range_end = false;
|
||||
uint64_t range_end_pos;
|
||||
if (range_end) {
|
||||
if (cmp(range_end->value(), start_ck) < 0) {
|
||||
// this block is already past the range_end
|
||||
found_range_end = true;
|
||||
range_end_pos = ie.position() + offset;
|
||||
} else if (cmp(range_end->value(), end_ck) < 0 || num_blocks == 0) {
|
||||
// range_end is in the middle of this block.
|
||||
// Note the strict inequality above is important:
|
||||
// if range_end==end_ck the next block may contain
|
||||
// still more items matching range_end.
|
||||
found_range_end = true;
|
||||
range_end_pos = ie.position() + offset + width;
|
||||
}
|
||||
} else if (num_blocks == 0) {
|
||||
// When !range_end, read until the last block.
|
||||
// In this case we could have also found the end of
|
||||
// the partition using the index.
|
||||
found_range_end = true;
|
||||
range_end_pos = ie.position() + offset + width;
|
||||
}
|
||||
if (found_range_end) {
|
||||
if (!found_range_start) {
|
||||
// return empty range
|
||||
range_start_pos = range_end_pos = 0;
|
||||
}
|
||||
return make_ready_future<disk_read_range>(
|
||||
disk_read_range(range_start_pos, range_end_pos,
|
||||
key, deltime));
|
||||
}
|
||||
}
|
||||
}
|
||||
// Else, if more than one clustering-key range needs to be read,
|
||||
// fall back to reading the entire partition.
|
||||
// FIXME: support multiple ranges, and do not fall back to reading
|
||||
// the entire partition.
|
||||
}
|
||||
// If we're still here there is no promoted index, or we had problems
|
||||
// using it, so just find the entire partition's range.
|
||||
auto start = ie.position();
|
||||
return this->data_end_position(summary_idx, index_idx, index_list, pc).then([start] (uint64_t end) {
|
||||
return disk_read_range(start, end);
|
||||
});
|
||||
});
|
||||
}
|
||||
@@ -742,25 +979,31 @@ sstables::sstable::read_row(schema_ptr schema,
|
||||
class mutation_reader::impl {
|
||||
private:
|
||||
schema_ptr _schema;
|
||||
lw_shared_ptr<sstable_data_source> _ds;
|
||||
// For some reason std::function requires functors to be copyable and that's
|
||||
// why we cannot store mp_row_consumer in _get_data_source captured values.
|
||||
// Instead we have this _consumer field here which is moved away by
|
||||
// _get_data_source().
|
||||
mp_row_consumer _consumer;
|
||||
std::experimental::optional<data_consume_context> _context;
|
||||
std::function<future<data_consume_context> ()> _get_context;
|
||||
std::function<future<lw_shared_ptr<sstable_data_source>> ()> _get_data_source;
|
||||
public:
|
||||
impl(sstable& sst, schema_ptr schema, uint64_t start, uint64_t end,
|
||||
impl(shared_sstable sst, schema_ptr schema, sstable::disk_read_range toread,
|
||||
const io_priority_class &pc)
|
||||
: _schema(schema)
|
||||
, _consumer(schema, query::no_clustering_key_filtering, pc)
|
||||
, _get_context([&sst, this, start, end] {
|
||||
return make_ready_future<data_consume_context>(sst.data_consume_rows(_consumer, start, end));
|
||||
, _get_data_source([this, sst = std::move(sst), toread] {
|
||||
auto ds = make_lw_shared<sstable_data_source>(std::move(sst), std::move(_consumer), std::move(toread));
|
||||
return make_ready_future<lw_shared_ptr<sstable_data_source>>(std::move(ds));
|
||||
}) { }
|
||||
impl(sstable& sst, schema_ptr schema,
|
||||
impl(shared_sstable sst, schema_ptr schema,
|
||||
const io_priority_class &pc)
|
||||
: _schema(schema)
|
||||
, _consumer(schema, query::no_clustering_key_filtering, pc)
|
||||
, _get_context([this, &sst] {
|
||||
return make_ready_future<data_consume_context>(sst.data_consume_rows(_consumer));
|
||||
, _get_data_source([this, sst = std::move(sst)] {
|
||||
auto ds = make_lw_shared<sstable_data_source>(std::move(sst), std::move(_consumer));
|
||||
return make_ready_future<lw_shared_ptr<sstable_data_source>>(std::move(ds));
|
||||
}) { }
|
||||
impl(sstable& sst,
|
||||
impl(shared_sstable sst,
|
||||
schema_ptr schema,
|
||||
std::function<future<uint64_t>()> start,
|
||||
std::function<future<uint64_t>()> end,
|
||||
@@ -768,48 +1011,49 @@ public:
|
||||
const io_priority_class& pc)
|
||||
: _schema(schema)
|
||||
, _consumer(schema, ck_filtering, pc)
|
||||
, _get_context([this, &sst, start = std::move(start), end = std::move(end)] () {
|
||||
return start().then([this, &sst, end = std::move(end)] (uint64_t start) {
|
||||
return end().then([this, &sst, start] (uint64_t end) {
|
||||
return make_ready_future<data_consume_context>(sst.data_consume_rows(_consumer, start, end));
|
||||
, _get_data_source([this, sst = std::move(sst), start = std::move(start), end = std::move(end)] () mutable {
|
||||
return start().then([this, sst = std::move(sst), end = std::move(end)] (uint64_t start) mutable {
|
||||
return end().then([this, sst = std::move(sst), start] (uint64_t end) mutable {
|
||||
return make_lw_shared<sstable_data_source>(std::move(sst), std::move(_consumer), sstable::disk_read_range{start, end});
|
||||
});
|
||||
});
|
||||
}) { }
|
||||
impl() : _consumer(), _get_context() { }
|
||||
impl() : _get_data_source() { }
|
||||
|
||||
// Reference to _consumer is passed to data_consume_rows() in the constructor so we must not allow move/copy
|
||||
impl(impl&&) = delete;
|
||||
impl(const impl&) = delete;
|
||||
|
||||
future<streamed_mutation_opt> read() {
|
||||
if (!_get_context) {
|
||||
if (!_get_data_source) {
|
||||
// empty mutation reader returns EOF immediately
|
||||
return make_ready_future<streamed_mutation_opt>();
|
||||
}
|
||||
|
||||
if (_context) {
|
||||
if (_ds) {
|
||||
return do_read();
|
||||
}
|
||||
return (_get_context)().then([this] (data_consume_context context) {
|
||||
_context = std::move(context);
|
||||
return (_get_data_source)().then([this] (lw_shared_ptr<sstable_data_source> ds) {
|
||||
_ds = std::move(ds);
|
||||
return do_read();
|
||||
});
|
||||
}
|
||||
private:
|
||||
future<streamed_mutation_opt> do_read() {
|
||||
return _context->read().then([this] {
|
||||
auto mut = _consumer.get_mutation();
|
||||
return _ds->_context.read().then([this] {
|
||||
auto& consumer = _ds->_consumer;
|
||||
auto mut = consumer.get_mutation();
|
||||
if (!mut) {
|
||||
if (_consumer.get_mutation_fragment()) {
|
||||
if (consumer.get_mutation_fragment() || consumer.get_and_reset_is_mutation_end()) {
|
||||
// We are still in the middle of the previous mutation.
|
||||
_consumer.skip_partition();
|
||||
consumer.skip_partition();
|
||||
return do_read();
|
||||
} else {
|
||||
return make_ready_future<streamed_mutation_opt>();
|
||||
}
|
||||
}
|
||||
auto dk = dht::global_partitioner().decorate_key(*_schema, std::move(mut->key));
|
||||
auto sm = make_streamed_mutation<sstable_streamed_mutation>(_schema, std::move(dk), *_context, _consumer, mut->tomb);
|
||||
auto sm = make_streamed_mutation<sstable_streamed_mutation>(_schema, std::move(dk), mut->tomb, _ds);
|
||||
return make_ready_future<streamed_mutation_opt>(std::move(sm));
|
||||
});
|
||||
}
|
||||
@@ -825,7 +1069,7 @@ future<streamed_mutation_opt> mutation_reader::read() {
|
||||
}
|
||||
|
||||
mutation_reader sstable::read_rows(schema_ptr schema, const io_priority_class& pc) {
|
||||
return std::make_unique<mutation_reader::impl>(*this, schema, pc);
|
||||
return std::make_unique<mutation_reader::impl>(shared_from_this(), schema, pc);
|
||||
}
|
||||
|
||||
// Less-comparator for lookups in the partition index.
|
||||
@@ -938,7 +1182,7 @@ sstable::read_range_rows(schema_ptr schema,
|
||||
};
|
||||
|
||||
return std::make_unique<mutation_reader::impl>(
|
||||
*this, std::move(schema), std::move(start), std::move(end), ck_filtering, pc);
|
||||
shared_from_this(), std::move(schema), std::move(start), std::move(end), ck_filtering, pc);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -51,6 +51,7 @@ private:
|
||||
RANGE_TOMBSTONE_3,
|
||||
RANGE_TOMBSTONE_4,
|
||||
RANGE_TOMBSTONE_5,
|
||||
STOP_THEN_ATOM_START,
|
||||
} _state = state::ROW_START;
|
||||
|
||||
row_consumer& _consumer;
|
||||
@@ -62,6 +63,7 @@ private:
|
||||
bool _deleted;
|
||||
uint32_t _ttl, _expiration;
|
||||
|
||||
bool _read_partial_row = false;
|
||||
|
||||
public:
|
||||
bool non_consuming() const {
|
||||
@@ -69,6 +71,7 @@ public:
|
||||
|| (_state == state::CELL_VALUE_BYTES_2)
|
||||
|| (_state == state::ATOM_START_2)
|
||||
|| (_state == state::ATOM_MASK_2)
|
||||
|| (_state == state::STOP_THEN_ATOM_START)
|
||||
|| (_state == state::EXPIRING_CELL_3)) && (_prestate == prestate::NONE));
|
||||
}
|
||||
|
||||
@@ -319,6 +322,9 @@ public:
|
||||
}
|
||||
break;
|
||||
}
|
||||
case state::STOP_THEN_ATOM_START:
|
||||
_state = state::ATOM_START;
|
||||
return row_consumer::proceed::no;
|
||||
default:
|
||||
throw malformed_sstable_exception("unknown state");
|
||||
}
|
||||
@@ -327,12 +333,42 @@ public:
|
||||
}
|
||||
|
||||
data_consume_rows_context(row_consumer& consumer,
|
||||
input_stream<char> && input, uint64_t maxlen) :
|
||||
continuous_data_consumer(std::move(input), maxlen)
|
||||
, _consumer(consumer) {
|
||||
input_stream<char> && input, uint64_t maxlen,
|
||||
std::experimental::optional<sstable::disk_read_range::row_info> ri = {})
|
||||
: continuous_data_consumer(std::move(input), maxlen)
|
||||
, _consumer(consumer) {
|
||||
// If the "ri" option is given, we are reading a partition from the
|
||||
// middle (in the beginning of an atom), as would happen when we use
|
||||
// the "promoted index" to skip closer to where a particular column
|
||||
// starts. When we start in the middle of the partition, we will not
|
||||
// read the key nor the tombstone from the disk, so the caller needs
|
||||
// to provide them (the tombstone is provided in the promoted index
|
||||
// exactly for that reason).
|
||||
if (ri) {
|
||||
_read_partial_row = true;
|
||||
auto ret = _consumer.consume_row_start(ri->k, ri->deltime);
|
||||
if (ret == row_consumer::proceed::yes) {
|
||||
_state = state::ATOM_START;
|
||||
} else {
|
||||
// If we were asked to stop parsing after consuming the row
|
||||
// start, we can't go to ATOM_START, need to use a new state
|
||||
// which stops parsing, and continues at ATOM_START later.
|
||||
_state = state::STOP_THEN_ATOM_START;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void verify_end_state() {
|
||||
if (_read_partial_row) {
|
||||
// If reading a partial row (i.e., when we have a clustering row
|
||||
// filter and using a promoted index), we may be in ATOM_START
|
||||
// state instead of ROW_START. In that case we did not read the
|
||||
// end-of-row marker and consume_row_end() was never called.
|
||||
if (_state == state::ATOM_START) {
|
||||
_consumer.consume_row_end();
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (_state != state::ROW_START || _prestate != prestate::NONE) {
|
||||
throw malformed_sstable_exception("end of input, but not end of row");
|
||||
}
|
||||
@@ -346,15 +382,18 @@ public:
|
||||
// memory in the same time (they are delivered to the consumer one by one).
|
||||
class data_consume_context::impl {
|
||||
private:
|
||||
shared_sstable _sst;
|
||||
std::unique_ptr<data_consume_rows_context> _ctx;
|
||||
public:
|
||||
impl(row_consumer& consumer,
|
||||
input_stream<char>&& input, uint64_t maxlen) :
|
||||
_ctx(new data_consume_rows_context(consumer, std::move(input), maxlen)) { }
|
||||
impl(shared_sstable sst, row_consumer& consumer, input_stream<char>&& input, uint64_t maxlen,
|
||||
std::experimental::optional<sstable::disk_read_range::row_info> ri)
|
||||
: _sst(std::move(sst))
|
||||
, _ctx(new data_consume_rows_context(consumer, std::move(input), maxlen, ri))
|
||||
{ }
|
||||
~impl() {
|
||||
if (_ctx) {
|
||||
auto f = _ctx->close();
|
||||
f.handle_exception([ctx = std::move(_ctx)] (auto) { });
|
||||
f.handle_exception([ctx = std::move(_ctx), sst = std::move(_sst)] (auto) { });
|
||||
}
|
||||
}
|
||||
future<> read() {
|
||||
@@ -376,18 +415,19 @@ future<> data_consume_context::read() {
|
||||
}
|
||||
|
||||
data_consume_context sstable::data_consume_rows(
|
||||
row_consumer& consumer, uint64_t start, uint64_t end) {
|
||||
row_consumer& consumer, sstable::disk_read_range toread) {
|
||||
// TODO: The second "end - start" below is redundant: The first one tells
|
||||
// data_stream() to stop at the "end" byte, which allows optimal read-
|
||||
// ahead and avoiding over-read at the end. The second one tells the
|
||||
// consumer to stop at exactly the same place, and forces the consumer
|
||||
// to maintain its own byte count.
|
||||
return std::make_unique<data_consume_context::impl>(
|
||||
consumer, data_stream(start, end - start, consumer.io_priority()), end - start);
|
||||
return std::make_unique<data_consume_context::impl>(shared_from_this(),
|
||||
consumer, data_stream(toread.start, toread.end - toread.start,
|
||||
consumer.io_priority()), toread.end - toread.start, toread.ri);
|
||||
}
|
||||
|
||||
data_consume_context sstable::data_consume_rows(row_consumer& consumer) {
|
||||
return data_consume_rows(consumer, 0, data_size());
|
||||
return data_consume_rows(consumer, {0, data_size()});
|
||||
}
|
||||
|
||||
future<> sstable::data_consume_rows_at_once(row_consumer& consumer,
|
||||
|
||||
@@ -31,6 +31,7 @@
|
||||
#include "core/do_with.hh"
|
||||
#include "core/thread.hh"
|
||||
#include <seastar/core/shared_future.hh>
|
||||
#include <seastar/core/byteorder.hh>
|
||||
#include <iterator>
|
||||
|
||||
#include "types.hh"
|
||||
@@ -56,6 +57,7 @@
|
||||
|
||||
#include "checked-file-impl.hh"
|
||||
#include "disk-error-handler.hh"
|
||||
#include "service/storage_service.hh"
|
||||
|
||||
thread_local disk_error_signal_type sstable_read_error;
|
||||
thread_local disk_error_signal_type sstable_write_error;
|
||||
@@ -296,6 +298,12 @@ inline void write(file_writer& out, bytes_view s) {
|
||||
out.write(reinterpret_cast<const char*>(s.data()), s.size()).get();
|
||||
}
|
||||
|
||||
inline void write(file_writer& out, bytes_ostream s) {
|
||||
for (bytes_view fragment : s) {
|
||||
write(out, fragment);
|
||||
}
|
||||
}
|
||||
|
||||
// All composite parsers must come after this
|
||||
template<typename First, typename... Rest>
|
||||
future<> parse(random_access_reader& in, First& first, Rest&&... rest) {
|
||||
@@ -1066,14 +1074,111 @@ future<> sstable::load() {
|
||||
});
|
||||
}
|
||||
|
||||
static void output_promoted_index_entry(bytes_ostream& promoted_index,
|
||||
const bytes& first_col,
|
||||
const bytes& last_col,
|
||||
uint64_t offset, uint64_t width) {
|
||||
char s[2];
|
||||
write_be(s, uint16_t(first_col.size()));
|
||||
promoted_index.write(s, 2);
|
||||
promoted_index.write(first_col);
|
||||
write_be(s, uint16_t(last_col.size()));
|
||||
promoted_index.write(s, 2);
|
||||
promoted_index.write(last_col);
|
||||
char q[8];
|
||||
write_be(q, uint64_t(offset));
|
||||
promoted_index.write(q, 8);
|
||||
write_be(q, uint64_t(width));
|
||||
promoted_index.write(q, 8);
|
||||
}
|
||||
|
||||
// FIXME: use this in write_column_name() instead of repeating the code
|
||||
static bytes serialize_colname(const composite& clustering_key,
|
||||
const std::vector<bytes_view>& column_names, composite::eoc marker) {
|
||||
auto c = composite::from_exploded(column_names, marker);
|
||||
auto ck_bview = bytes_view(clustering_key);
|
||||
// The marker is not a component, so if the last component is empty (IOW,
|
||||
// only serializes to the marker), then we just replace the key's last byte
|
||||
// with the marker. If the component, however, is not empty, then the
|
||||
// marker should be in the end of it, and we just join them together as we
|
||||
// do for any normal component
|
||||
if (c.size() == 1) {
|
||||
ck_bview.remove_suffix(1);
|
||||
}
|
||||
size_t sz = ck_bview.size() + c.size();
|
||||
if (sz > std::numeric_limits<uint16_t>::max()) {
|
||||
throw std::runtime_error(sprint("Column name too large (%d > %d)", sz, std::numeric_limits<uint16_t>::max()));
|
||||
}
|
||||
bytes colname(bytes::initialized_later(), sz);
|
||||
std::copy(ck_bview.begin(), ck_bview.end(), colname.begin());
|
||||
std::copy(c.get_bytes().begin(), c.get_bytes().end(), colname.begin() + ck_bview.size());
|
||||
return colname;
|
||||
}
|
||||
|
||||
// Call maybe_flush_pi_block() before writing the given sstable atom to the
|
||||
// output. This may start a new promoted-index block depending on how much
|
||||
// data we've already written since the start of the current block. Starting
|
||||
// a new block involves both outputting the range of the old block to the
|
||||
// index file, and outputting again the currently-open range tombstones to
|
||||
// the data file.
|
||||
// TODO: currently, maybe_flush_pi_block serializes the column name on every
|
||||
// call, saving it in _pi_write.block_last_colname which we need for closing
|
||||
// each block, as well as for closing the last block. We could instead save
|
||||
// just the unprocessed arguments, and serialize them only when needed at the
|
||||
// end of the block. For this we would need this function to take rvalue
|
||||
// references (so data is moved in), and must not use a vector of bytes_view
|
||||
// (which might be gone later).
|
||||
void sstable::maybe_flush_pi_block(file_writer& out,
|
||||
const composite& clustering_key,
|
||||
const std::vector<bytes_view>& column_names) {
|
||||
bytes colname = serialize_colname(clustering_key, column_names, composite::eoc::none);
|
||||
if (_pi_write.block_first_colname.empty()) {
|
||||
// This is the first column in the partition, or first column since we
|
||||
// closed a promoted-index block. Remember its name and position -
|
||||
// we'll need to write it to the promoted index.
|
||||
_pi_write.block_start_offset = out.offset();
|
||||
_pi_write.block_next_start_offset = out.offset() + _pi_write.desired_block_size;
|
||||
_pi_write.block_first_colname = colname;
|
||||
_pi_write.block_last_colname = std::move(colname);
|
||||
} else if (out.offset() >= _pi_write.block_next_start_offset) {
|
||||
// If we wrote enough bytes to the partition since we output a sample
|
||||
// to the promoted index, output one now and start a new one.
|
||||
output_promoted_index_entry(_pi_write.data,
|
||||
_pi_write.block_first_colname,
|
||||
_pi_write.block_last_colname,
|
||||
_pi_write.block_start_offset - _c_stats.start_offset,
|
||||
out.offset() - _pi_write.block_start_offset);
|
||||
_pi_write.numblocks++;
|
||||
_pi_write.block_start_offset = out.offset();
|
||||
// Because the new block can be read without the previous blocks, we
|
||||
// need to repeat the range tombstones which are still open.
|
||||
// Note that block_start_offset is before outputting those (so the new
|
||||
// block includes them), but we set block_next_start_offset after - so
|
||||
// even if we wrote a lot of open tombstones, we still get a full
|
||||
// block size of new data.
|
||||
if (!clustering_key.empty()) {
|
||||
auto& rts = _pi_write.tombstone_accumulator->range_tombstones_for_row(
|
||||
clustering_key_prefix(clustering_key.values()));
|
||||
for (const auto& rt : rts) {
|
||||
auto start = composite::from_clustering_element(*_pi_write.schemap, rt.start);
|
||||
auto end = composite::from_clustering_element(*_pi_write.schemap, rt.end);
|
||||
write_range_tombstone(out,
|
||||
start, rt.start_kind, end, rt.end_kind, {}, rt.tomb);
|
||||
}
|
||||
}
|
||||
_pi_write.block_next_start_offset = out.offset() + _pi_write.desired_block_size;
|
||||
_pi_write.block_first_colname = colname;
|
||||
_pi_write.block_last_colname = std::move(colname);
|
||||
} else {
|
||||
// Keep track of the last column in the partition - we'll need it to close
|
||||
// the last block in the promoted index, unfortunately.
|
||||
_pi_write.block_last_colname = std::move(colname);
|
||||
}
|
||||
}
|
||||
|
||||
// @clustering_key: it's expected that clustering key is already in its composite form.
|
||||
// NOTE: empty clustering key means that there is no clustering key.
|
||||
void sstable::write_column_name(file_writer& out, const composite& clustering_key, const std::vector<bytes_view>& column_names, composite::eoc marker) {
|
||||
// FIXME: min_components and max_components also keep track of clustering
|
||||
// prefix, so we must merge clustering_key and column_names somehow and
|
||||
// pass the result to the functions below.
|
||||
column_name_helper::min_max_components(_c_stats.min_column_names, _c_stats.max_column_names, column_names);
|
||||
|
||||
// was defined in the schema, for example.
|
||||
auto c = composite::from_exploded(column_names, marker);
|
||||
auto ck_bview = bytes_view(clustering_key);
|
||||
@@ -1095,8 +1200,6 @@ void sstable::write_column_name(file_writer& out, const composite& clustering_ke
|
||||
}
|
||||
|
||||
void sstable::write_column_name(file_writer& out, bytes_view column_names) {
|
||||
column_name_helper::min_max_components(_c_stats.min_column_names, _c_stats.max_column_names, { column_names });
|
||||
|
||||
size_t sz = column_names.size();
|
||||
if (sz > std::numeric_limits<uint16_t>::max()) {
|
||||
throw std::runtime_error(sprint("Column name too large (%d > %d)", sz, std::numeric_limits<uint16_t>::max()));
|
||||
@@ -1223,6 +1326,7 @@ void sstable::write_collection(file_writer& out, const composite& clustering_key
|
||||
const bytes& column_name = cdef.name();
|
||||
write_range_tombstone(out, clustering_key, clustering_key, { bytes_view(column_name) }, mview.tomb);
|
||||
for (auto& cp: mview.cells) {
|
||||
maybe_flush_pi_block(out, clustering_key, { column_name, cp.first });
|
||||
write_column_name(out, clustering_key, { column_name, cp.first });
|
||||
write_cell(out, cp.second);
|
||||
}
|
||||
@@ -1234,11 +1338,27 @@ void sstable::write_clustered_row(file_writer& out, const schema& schema, const
|
||||
auto clustering_key = composite::from_clustering_element(schema, clustered_row.key());
|
||||
|
||||
if (schema.is_compound() && !schema.is_dense()) {
|
||||
maybe_flush_pi_block(out, clustering_key, { bytes_view() });
|
||||
write_row_marker(out, clustered_row.marker(), clustering_key);
|
||||
}
|
||||
// Before writing cells, range tombstone must be written if the row has any (deletable_row::t).
|
||||
if (clustered_row.tomb()) {
|
||||
maybe_flush_pi_block(out, clustering_key, {});
|
||||
write_range_tombstone(out, clustering_key, clustering_key, {}, clustered_row.tomb());
|
||||
// Because we currently may break a partition to promoted-index blocks
|
||||
// in the middle of a clustered row, we also need to track the current
|
||||
// row's tombstone - not just range tombstones - which may affect the
|
||||
// beginning of a new block.
|
||||
// TODO: consider starting a new block only between rows, so the
|
||||
// following code can be dropped:
|
||||
_pi_write.tombstone_accumulator->apply(range_tombstone(
|
||||
clustered_row.key(), bound_kind::incl_start,
|
||||
clustered_row.key(), bound_kind::incl_end, clustered_row.tomb()));
|
||||
}
|
||||
|
||||
if (schema.clustering_key_size()) {
|
||||
column_name_helper::min_max_components(schema, _collector.min_column_names(), _collector.max_column_names(),
|
||||
clustered_row.key().components());
|
||||
}
|
||||
|
||||
// Write all cells of a partition's row.
|
||||
@@ -1256,14 +1376,18 @@ void sstable::write_clustered_row(file_writer& out, const schema& schema, const
|
||||
|
||||
if (schema.is_compound()) {
|
||||
if (schema.is_dense()) {
|
||||
maybe_flush_pi_block(out, composite(), { bytes_view(clustering_key) });
|
||||
write_column_name(out, bytes_view(clustering_key));
|
||||
} else {
|
||||
maybe_flush_pi_block(out, clustering_key, { bytes_view(column_name) });
|
||||
write_column_name(out, clustering_key, { bytes_view(column_name) });
|
||||
}
|
||||
} else {
|
||||
if (schema.is_dense()) {
|
||||
maybe_flush_pi_block(out, composite(), { bytes_view(clustered_row.key().get_component(schema, 0)) });
|
||||
write_column_name(out, bytes_view(clustered_row.key().get_component(schema, 0)));
|
||||
} else {
|
||||
maybe_flush_pi_block(out, composite(), { bytes_view(column_name) });
|
||||
write_column_name(out, bytes_view(column_name));
|
||||
}
|
||||
}
|
||||
@@ -1282,16 +1406,25 @@ void sstable::write_static_row(file_writer& out, const schema& schema, const row
|
||||
assert(column_definition.is_static());
|
||||
atomic_cell_view cell = c.as_atomic_cell();
|
||||
auto sp = composite::static_prefix(schema);
|
||||
maybe_flush_pi_block(out, sp, { bytes_view(column_definition.name()) });
|
||||
write_column_name(out, sp, { bytes_view(column_definition.name()) });
|
||||
write_cell(out, cell);
|
||||
});
|
||||
}
|
||||
|
||||
static void write_index_entry(file_writer& out, disk_string_view<uint16_t>& key, uint64_t pos) {
|
||||
// FIXME: support promoted indexes.
|
||||
uint32_t promoted_index_size = 0;
|
||||
static void write_index_header(file_writer& out, disk_string_view<uint16_t>& key, uint64_t pos) {
|
||||
write(out, key, pos);
|
||||
}
|
||||
|
||||
write(out, key, pos, promoted_index_size);
|
||||
static void write_index_promoted(file_writer& out, bytes_ostream& promoted_index,
|
||||
deletion_time deltime, uint32_t numblocks) {
|
||||
uint32_t promoted_index_size = promoted_index.size();
|
||||
if (promoted_index_size) {
|
||||
promoted_index_size += 16 /* deltime + numblocks */;
|
||||
write(out, promoted_index_size, deltime, numblocks, promoted_index);
|
||||
} else {
|
||||
write(out, promoted_index_size);
|
||||
}
|
||||
}
|
||||
|
||||
static void prepare_summary(summary& s, uint64_t expected_partition_count, uint32_t min_index_interval) {
|
||||
@@ -1405,6 +1538,18 @@ file_writer components_writer::index_file_writer(sstable& sst, const io_priority
|
||||
return file_writer(sst._index_file, std::move(options));
|
||||
}
|
||||
|
||||
// Get the currently loaded configuration, or the default configuration in
|
||||
// case none has been loaded (this happens, for example, in unit tests).
|
||||
static const db::config& get_config() {
|
||||
if (service::get_storage_service().local_is_initialized() &&
|
||||
service::get_local_storage_service().db().local_is_initialized()) {
|
||||
return service::get_local_storage_service().db().local().get_config();
|
||||
} else {
|
||||
static db::config default_config;
|
||||
return default_config;
|
||||
}
|
||||
}
|
||||
|
||||
components_writer::components_writer(sstable& sst, const schema& s, file_writer& out,
|
||||
uint64_t estimated_partitions, uint64_t max_sstable_size,
|
||||
const io_priority_class& pc)
|
||||
@@ -1413,8 +1558,10 @@ components_writer::components_writer(sstable& sst, const schema& s, file_writer&
|
||||
, _out(out)
|
||||
, _index(index_file_writer(sst, pc))
|
||||
, _max_sstable_size(max_sstable_size)
|
||||
, _tombstone_written(false)
|
||||
{
|
||||
_sst._filter = utils::i_filter::get_filter(estimated_partitions, _schema.bloom_filter_fp_chance());
|
||||
_sst._pi_write.desired_block_size = get_config().column_index_size_in_kb() * 1024;
|
||||
|
||||
prepare_summary(_sst._summary, estimated_partitions, _schema.min_index_interval());
|
||||
|
||||
@@ -1435,7 +1582,17 @@ void components_writer::consume_new_partition(const dht::decorated_key& dk) {
|
||||
p_key.value = bytes_view(*_partition_key);
|
||||
|
||||
// Write index file entry from partition key into index file.
|
||||
write_index_entry(_index, p_key, _out.offset());
|
||||
// Write an index entry minus the "promoted index" (sample of columns)
|
||||
// part. We can only write that after processing the entire partition
|
||||
// and collecting the sample of columns.
|
||||
write_index_header(_index, p_key, _out.offset());
|
||||
_sst._pi_write.data = {};
|
||||
_sst._pi_write.numblocks = 0;
|
||||
_sst._pi_write.deltime.local_deletion_time = std::numeric_limits<int32_t>::max();
|
||||
_sst._pi_write.deltime.marked_for_delete_at = std::numeric_limits<int64_t>::min();
|
||||
_sst._pi_write.block_start_offset = _out.offset();
|
||||
_sst._pi_write.tombstone_accumulator = range_tombstone_accumulator(_schema, false);
|
||||
_sst._pi_write.schemap = &_schema; // sadly we need this
|
||||
|
||||
// Write partition key into data file.
|
||||
write(_out, p_key);
|
||||
@@ -1461,6 +1618,8 @@ void components_writer::consume(tombstone t) {
|
||||
}
|
||||
write(_out, d);
|
||||
_tombstone_written = true;
|
||||
// TODO: need to verify we don't do this twice?
|
||||
_sst._pi_write.deltime = d;
|
||||
}
|
||||
|
||||
stop_iteration components_writer::consume(static_row&& sr) {
|
||||
@@ -1477,13 +1636,35 @@ stop_iteration components_writer::consume(clustering_row&& cr) {
|
||||
|
||||
stop_iteration components_writer::consume(range_tombstone&& rt) {
|
||||
ensure_tombstone_is_written();
|
||||
// Remember the range tombstone so when we need to open a new promoted
|
||||
// index block, we can figure out which ranges are still open and need
|
||||
// to be repeated in the data file. Note that apply() also drops ranges
|
||||
// already closed by rt.start, so the accumulator doesn't grow without bound.
|
||||
_sst._pi_write.tombstone_accumulator->apply(rt);
|
||||
auto start = composite::from_clustering_element(_schema, std::move(rt.start));
|
||||
auto end = composite::from_clustering_element(_schema, std::move(rt.end));
|
||||
_sst.maybe_flush_pi_block(_out, start, {});
|
||||
_sst.write_range_tombstone(_out, std::move(start), rt.start_kind, std::move(end), rt.end_kind, {}, rt.tomb);
|
||||
return stop_iteration::no;
|
||||
}
|
||||
|
||||
stop_iteration components_writer::consume_end_of_partition() {
|
||||
// If there is an incomplete block in the promoted index, write it too.
|
||||
// However, if the _promoted_index is still empty, don't add a single
|
||||
// chunk - better not output a promoted index at all in this case.
|
||||
if (!_sst._pi_write.data.empty() && !_sst._pi_write.block_first_colname.empty()) {
|
||||
output_promoted_index_entry(_sst._pi_write.data,
|
||||
_sst._pi_write.block_first_colname,
|
||||
_sst._pi_write.block_last_colname,
|
||||
_sst._pi_write.block_start_offset - _sst._c_stats.start_offset,
|
||||
_out.offset() - _sst._pi_write.block_start_offset);
|
||||
_sst._pi_write.numblocks++;
|
||||
}
|
||||
write_index_promoted(_index, _sst._pi_write.data, _sst._pi_write.deltime,
|
||||
_sst._pi_write.numblocks);
|
||||
_sst._pi_write.data = {};
|
||||
_sst._pi_write.block_first_colname = {};
|
||||
|
||||
ensure_tombstone_is_written();
|
||||
int16_t end_of_row = 0;
|
||||
write(_out, end_of_row);
|
||||
@@ -1491,7 +1672,7 @@ stop_iteration components_writer::consume_end_of_partition() {
|
||||
// compute size of the current row.
|
||||
_sst._c_stats.row_size = _out.offset() - _sst._c_stats.start_offset;
|
||||
// update is about merging column_stats with the data being stored by collector.
|
||||
_sst._collector.update(std::move(_sst._c_stats));
|
||||
_sst._collector.update(_schema, std::move(_sst._c_stats));
|
||||
_sst._c_stats.reset();
|
||||
|
||||
if (!_first_key) {
|
||||
|
||||
@@ -113,7 +113,7 @@ class sstable_writer;
|
||||
|
||||
using index_list = std::vector<index_entry>;
|
||||
|
||||
class sstable {
|
||||
class sstable : public enable_lw_shared_from_this<sstable> {
|
||||
public:
|
||||
enum class component_type {
|
||||
Index,
|
||||
@@ -155,6 +155,41 @@ public:
|
||||
// object lives until then (e.g., using the do_with() idiom).
|
||||
future<> data_consume_rows_at_once(row_consumer& consumer, uint64_t pos, uint64_t end);
|
||||
|
||||
// disk_read_range describes a byte range covering part of an sstable
|
||||
// row that we need to read from disk. Usually this is the whole byte
|
||||
// range covering a single sstable row, but in very large rows we might
|
||||
// want to only read a subset of the atoms which we know contains the
|
||||
// columns we are looking for. When the range to be read does NOT include
|
||||
// the entire row, the caller needs to supply the optional "row_info"
|
||||
// containing information about the entire row (key and deletion time)
|
||||
// which is normally read from the beginning of the row.
|
||||
struct disk_read_range {
|
||||
// TODO: this should become a vector of ranges
|
||||
uint64_t start;
|
||||
uint64_t end;
|
||||
// When the range above does not cover the beginning of the sstable
|
||||
// row, we need to supply information which is only available at the
|
||||
// beginning of the row - the row's key and its tombstone if any.
|
||||
struct row_info {
|
||||
key k;
|
||||
deletion_time deltime;
|
||||
};
|
||||
std::experimental::optional<row_info> ri;
|
||||
disk_read_range() : start(0), end(0) {}
|
||||
disk_read_range(uint64_t start, uint64_t end) :
|
||||
start(start), end(end) { }
|
||||
disk_read_range(uint64_t start, uint64_t end, const key& key, const deletion_time& deltime) :
|
||||
start(start), end(end), ri(row_info{key, deltime}) { }
|
||||
explicit operator bool() const {
|
||||
return start != end;
|
||||
}
|
||||
// found_row() is true if the row was found. This is not the same as
|
||||
// operator bool(): found_row() can be true even though the promoted
|
||||
// index ruled out anything to read (in this case "ri" was set).
|
||||
bool found_row() const {
|
||||
return start != end || ri;
|
||||
}
|
||||
};
|
||||
|
||||
// data_consume_rows() iterates over rows in the data file from
|
||||
// a particular range, feeding them into the consumer. The iteration is
|
||||
@@ -172,7 +207,7 @@ public:
|
||||
// The caller must ensure (e.g., using do_with()) that the context object,
|
||||
// as well as the sstable, remains alive as long as a read() is in
|
||||
// progress (i.e., returned a future which hasn't completed yet).
|
||||
data_consume_context data_consume_rows(row_consumer& consumer, uint64_t start, uint64_t end);
|
||||
data_consume_context data_consume_rows(row_consumer& consumer, disk_read_range toread);
|
||||
|
||||
// Like data_consume_rows() with bounds, but iterates over whole range
|
||||
data_consume_context data_consume_rows(row_consumer& consumer);
|
||||
@@ -196,6 +231,11 @@ public:
|
||||
return _generation;
|
||||
}
|
||||
|
||||
// read_row() reads the entire sstable row (partition) at a given
|
||||
// partition key k, or a subset of this row. The subset is defined by
|
||||
// a filter on the clustering keys which we want to read, which
|
||||
// additionally determines whether all the static columns will be
|
||||
// returned in the result.
|
||||
future<streamed_mutation_opt> read_row(
|
||||
schema_ptr schema,
|
||||
const key& k,
|
||||
@@ -372,6 +412,27 @@ private:
|
||||
uint64_t _filter_file_size = 0;
|
||||
uint64_t _bytes_on_disk = 0;
|
||||
|
||||
// _pi_write is used temporarily for building the promoted
|
||||
// index (column sample) of one partition when writing a new sstable.
|
||||
struct {
|
||||
// Unfortunately we cannot output the promoted index directly to the
|
||||
// index file because it needs to be prepended by its size.
|
||||
bytes_ostream data;
|
||||
uint32_t numblocks;
|
||||
deletion_time deltime;
|
||||
uint64_t block_start_offset;
|
||||
uint64_t block_next_start_offset;
|
||||
bytes block_first_colname;
|
||||
bytes block_last_colname;
|
||||
std::experimental::optional<range_tombstone_accumulator> tombstone_accumulator;
|
||||
const schema* schemap;
|
||||
size_t desired_block_size;
|
||||
} _pi_write;
|
||||
|
||||
void maybe_flush_pi_block(file_writer& out,
|
||||
const composite& clustering_key,
|
||||
const std::vector<bytes_view>& column_names);
|
||||
|
||||
sstring _ks;
|
||||
sstring _cf;
|
||||
sstring _dir;
|
||||
@@ -471,6 +532,19 @@ private:
|
||||
// The ring_position doesn't have to survive deferring.
|
||||
future<uint64_t> upper_bound(schema_ptr, const dht::ring_position&, const io_priority_class& pc);
|
||||
|
||||
// find_disk_ranges finds the ranges of bytes we need to read from the
|
||||
// sstable to read the desired columns out of the given key. This range
|
||||
// may be the entire byte range of the given partition - as found using
|
||||
// the summary and index files - but if the index contains a "promoted
|
||||
// index" (a sample of column positions for each key) it may be a smaller
|
||||
// range. The returned range may contain columns beyond those requested
|
||||
// in ck_filtering, so it is the reader's duty to use ck_filtering again
|
||||
// when parsing the data read from the returned range.
|
||||
future<disk_read_range> find_disk_ranges(schema_ptr schema,
|
||||
const sstables::key& key,
|
||||
query::clustering_key_filtering_context ck_filtering,
|
||||
const io_priority_class& pc);
|
||||
|
||||
future<summary_entry&> read_summary_entry(size_t i);
|
||||
|
||||
// FIXME: pending on Bloom filter implementation
|
||||
|
||||
@@ -82,6 +82,10 @@ public:
|
||||
return _position;
|
||||
}
|
||||
|
||||
bytes_view get_promoted_index_bytes() const {
|
||||
return to_bytes_view(_promoted_index);
|
||||
}
|
||||
|
||||
index_entry(temporary_buffer<char>&& key, uint64_t position, temporary_buffer<char>&& promoted_index)
|
||||
: _key(std::move(key)), _position(position), _promoted_index(std::move(promoted_index)) {}
|
||||
|
||||
|
||||
@@ -21,6 +21,7 @@
|
||||
|
||||
#include <stack>
|
||||
#include <boost/range/algorithm/heap_algorithm.hpp>
|
||||
#include <seastar/util/defer.hh>
|
||||
|
||||
#include "mutation.hh"
|
||||
#include "streamed_mutation.hh"
|
||||
@@ -116,6 +117,16 @@ std::ostream& operator<<(std::ostream& os, const streamed_mutation& sm) {
|
||||
return os;
|
||||
}
|
||||
|
||||
std::ostream& operator<<(std::ostream& os, mutation_fragment::kind k)
|
||||
{
|
||||
switch (k) {
|
||||
case mutation_fragment::kind::static_row: return os << "static row";
|
||||
case mutation_fragment::kind::clustering_row: return os << "clustering row";
|
||||
case mutation_fragment::kind::range_tombstone: return os << "range tombstone";
|
||||
}
|
||||
abort();
|
||||
}
|
||||
|
||||
streamed_mutation streamed_mutation_from_mutation(mutation m)
|
||||
{
|
||||
class reader final : public streamed_mutation::impl {
|
||||
@@ -129,16 +140,16 @@ streamed_mutation streamed_mutation_from_mutation(mutation m)
|
||||
auto& crs = _mutation.partition().clustered_rows();
|
||||
auto re = crs.unlink_leftmost_without_rebalance();
|
||||
if (re) {
|
||||
auto re_deleter = defer([re] { current_deleter<rows_entry>()(re); });
|
||||
_cr = mutation_fragment(std::move(*re));
|
||||
current_deleter<rows_entry>()(re);
|
||||
}
|
||||
}
|
||||
void prepare_next_range_tombstone() {
|
||||
auto& rts = _mutation.partition().row_tombstones().tombstones();
|
||||
auto rt = rts.unlink_leftmost_without_rebalance();
|
||||
if (rt) {
|
||||
auto rt_deleter = defer([rt] { current_deleter<range_tombstone>()(rt); });
|
||||
_rt = mutation_fragment(std::move(*rt));
|
||||
current_deleter<range_tombstone>()(rt);
|
||||
}
|
||||
}
|
||||
mutation_fragment_opt read_next() {
|
||||
@@ -182,6 +193,27 @@ streamed_mutation streamed_mutation_from_mutation(mutation m)
|
||||
do_fill_buffer();
|
||||
}
|
||||
|
||||
~reader() {
|
||||
// After unlink_leftmost_without_rebalance() was called on a bi::set
|
||||
// we need to complete destroying the tree using that function.
|
||||
// clear_and_dispose() used by mutation_partition destructor won't
|
||||
// work properly.
|
||||
|
||||
auto& crs = _mutation.partition().clustered_rows();
|
||||
auto re = crs.unlink_leftmost_without_rebalance();
|
||||
while (re) {
|
||||
current_deleter<rows_entry>()(re);
|
||||
re = crs.unlink_leftmost_without_rebalance();
|
||||
}
|
||||
|
||||
auto& rts = _mutation.partition().row_tombstones().tombstones();
|
||||
auto rt = rts.unlink_leftmost_without_rebalance();
|
||||
while (rt) {
|
||||
current_deleter<range_tombstone>()(rt);
|
||||
rt = rts.unlink_leftmost_without_rebalance();
|
||||
}
|
||||
}
|
||||
|
||||
virtual future<> fill_buffer() override {
|
||||
do_fill_buffer();
|
||||
return make_ready_future<>();
|
||||
@@ -401,4 +433,5 @@ streamed_mutation reverse_streamed_mutation(streamed_mutation sm) {
|
||||
};
|
||||
|
||||
return make_streamed_mutation<reversing_steamed_mutation>(std::move(sm));
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
@@ -249,6 +249,8 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
std::ostream& operator<<(std::ostream&, mutation_fragment::kind);
|
||||
|
||||
class position_in_partition {
|
||||
int _bound_weight = 0;
|
||||
stdx::optional<clustering_key_prefix> _ck;
|
||||
|
||||
@@ -43,7 +43,13 @@ SEASTAR_TEST_CASE(test_querying_with_limits) {
|
||||
auto& db = e.local_db();
|
||||
auto s = db.find_schema("ks", "cf");
|
||||
std::vector<query::partition_range> pranges;
|
||||
for (uint32_t i = 1; i <= 5; ++i) {
|
||||
for (uint32_t i = 1; i <= 3; ++i) {
|
||||
auto pkey = partition_key::from_single_value(*s, to_bytes(sprint("key%d", i)));
|
||||
mutation m(pkey, s);
|
||||
m.partition().apply(tombstone(api::timestamp_type(1), gc_clock::now()));
|
||||
db.apply(s, freeze(m)).get();
|
||||
}
|
||||
for (uint32_t i = 3; i <= 8; ++i) {
|
||||
auto pkey = partition_key::from_single_value(*s, to_bytes(sprint("key%d", i)));
|
||||
mutation m(pkey, s);
|
||||
m.set_clustered_cell(clustering_key_prefix::make_empty(), "v", data_value(bytes("v1")), 1);
|
||||
@@ -51,9 +57,22 @@ SEASTAR_TEST_CASE(test_querying_with_limits) {
|
||||
pranges.emplace_back(query::partition_range::make_singular(dht::global_partitioner().decorate_key(*s, std::move(pkey))));
|
||||
}
|
||||
|
||||
auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), 3);
|
||||
{
|
||||
auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), 3);
|
||||
auto result = db.query(s, cmd, query::result_request::only_result, pranges).get0();
|
||||
assert_that(query::result_set::from_raw_result(s, cmd.slice, *result)).has_size(3);
|
||||
}
|
||||
|
||||
{
|
||||
auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(),
|
||||
query::max_rows, gc_clock::now(), std::experimental::nullopt, 5);
|
||||
auto result = db.query(s, cmd, query::result_request::only_result, pranges).get0();
|
||||
assert_that(query::result_set::from_raw_result(s, cmd.slice, *result)).has_size(5);
|
||||
}
|
||||
|
||||
{
|
||||
auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(),
|
||||
query::max_rows, gc_clock::now(), std::experimental::nullopt, 3);
|
||||
auto result = db.query(s, cmd, query::result_request::only_result, pranges).get0();
|
||||
assert_that(query::result_set::from_raw_result(s, cmd.slice, *result)).has_size(3);
|
||||
}
|
||||
|
||||
@@ -59,7 +59,7 @@ int main(int argc, char** argv) {
|
||||
auto objects_in_batch = app.configuration()["batch"].as<unsigned>();
|
||||
|
||||
return seastar::async([obj_size, obj_count, objects_in_batch] {
|
||||
std::deque<managed_bytes> refs;
|
||||
chunked_fifo<managed_bytes> refs;
|
||||
logalloc::region r;
|
||||
|
||||
with_allocator(r.allocator(), [&] {
|
||||
|
||||
@@ -110,3 +110,55 @@ mutation_opt_assertions assert_that(streamed_mutation_opt smo) {
|
||||
return { std::move(mo) };
|
||||
}
|
||||
|
||||
class streamed_mutation_assertions {
|
||||
streamed_mutation _sm;
|
||||
clustering_key::equality _ck_eq;
|
||||
public:
|
||||
streamed_mutation_assertions(streamed_mutation sm)
|
||||
: _sm(std::move(sm)), _ck_eq(*_sm.schema()) { }
|
||||
|
||||
streamed_mutation_assertions& produces_static_row() {
|
||||
auto mfopt = _sm().get0();
|
||||
if (!mfopt) {
|
||||
BOOST_FAIL("Expected static row, got end of stream");
|
||||
}
|
||||
if (mfopt->mutation_fragment_kind() != mutation_fragment::kind::static_row) {
|
||||
BOOST_FAIL(sprint("Expected static row, got: %s", mfopt->mutation_fragment_kind()));
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
streamed_mutation_assertions& produces(mutation_fragment::kind k, std::vector<int> ck_elements) {
|
||||
std::vector<bytes> ck_bytes;
|
||||
for (auto&& e : ck_elements) {
|
||||
ck_bytes.emplace_back(int32_type->decompose(e));
|
||||
}
|
||||
auto ck = clustering_key_prefix::from_exploded(*_sm.schema(), std::move(ck_bytes));
|
||||
|
||||
auto mfopt = _sm().get0();
|
||||
if (!mfopt) {
|
||||
BOOST_FAIL(sprint("Expected mutation fragment %s, got end of stream", ck));
|
||||
}
|
||||
if (mfopt->mutation_fragment_kind() != k) {
|
||||
BOOST_FAIL(sprint("Expected mutation fragment kind %s, got: %s", k, mfopt->mutation_fragment_kind()));
|
||||
}
|
||||
if (!_ck_eq(mfopt->key(), ck)) {
|
||||
BOOST_FAIL(sprint("Expected key %s, got: %s", ck, mfopt->key()));
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
streamed_mutation_assertions& produces_end_of_stream() {
|
||||
auto mfopt = _sm().get0();
|
||||
BOOST_REQUIRE(!mfopt);
|
||||
if (mfopt) {
|
||||
BOOST_FAIL(sprint("Expected end of stream, got: %s", mfopt->mutation_fragment_kind()));
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
static inline streamed_mutation_assertions assert_that_stream(streamed_mutation sm)
|
||||
{
|
||||
return streamed_mutation_assertions(std::move(sm));
|
||||
}
|
||||
@@ -483,3 +483,49 @@ SEASTAR_TEST_CASE(test_result_row_count) {
|
||||
BOOST_REQUIRE_EQUAL(r.row_count().value(), 3);
|
||||
});
|
||||
}
|
||||
|
||||
// Runs mutation_query() over three partitions with two different values of
// the argument that follows query::max_rows (10 and 1) and checks which rows
// end up in the result set.
SEASTAR_TEST_CASE(test_partition_limit) {
    return seastar::async([] {
        storage_service_for_tests ssft;
        auto s = make_schema();
        auto now = gc_clock::now();

        // "key1" only receives a partition tombstone; the expected results
        // below contain no row for it.
        mutation deleted_partition(partition_key::from_single_value(*s, "key1"), s);
        deleted_partition.partition().apply(tombstone(api::timestamp_type(1), now));

        // "key2" and "key3" each carry a single live clustered cell.
        mutation partition_a(partition_key::from_single_value(*s, "key2"), s);
        partition_a.set_clustered_cell(clustering_key::from_single_value(*s, bytes("A")), "v1", data_value(bytes("A:v")), 1);

        mutation partition_b(partition_key::from_single_value(*s, "key3"), s);
        partition_b.set_clustered_cell(clustering_key::from_single_value(*s, bytes("B")), "v1", data_value(bytes("B:v")), 1);

        auto src = make_source({deleted_partition, partition_a, partition_b});
        auto slice = make_full_slice(*s);

        // With 10 passed after query::max_rows, both live rows are returned.
        {
            reconcilable_result result = mutation_query(s, src,
                query::full_partition_range, slice, query::max_rows, 10, now).get0();

            assert_that(to_result_set(result, s, slice))
                .has_size(2)
                .has(a_row()
                    .with_column("pk", data_value(bytes("key2")))
                    .with_column("ck", data_value(bytes("A")))
                    .with_column("v1", data_value(bytes("A:v"))))
                .has(a_row()
                    .with_column("pk", data_value(bytes("key3")))
                    .with_column("ck", data_value(bytes("B")))
                    .with_column("v1", data_value(bytes("B:v"))));
        }

        // With 1 passed after query::max_rows, only the row of "key2" is returned.
        {
            reconcilable_result result = mutation_query(s, src,
                query::full_partition_range, slice, query::max_rows, 1, now).get0();

            assert_that(to_result_set(result, s, slice))
                .has_size(1)
                .has(a_row()
                    .with_column("pk", data_value(bytes("key2")))
                    .with_column("ck", data_value(bytes("A")))
                    .with_column("v1", data_value(bytes("A:v"))));
        }
    });
}
|
||||
|
||||
@@ -112,6 +112,7 @@ SEASTAR_TEST_CASE(test_cache_delegates_to_underlying) {
|
||||
assert_that(cache.make_reader(s, query::full_partition_range))
|
||||
.produces(m)
|
||||
.produces_end_of_stream();
|
||||
assert(tracker.uncached_wide_partitions() == 0);
|
||||
});
|
||||
}
|
||||
|
||||
@@ -140,6 +141,58 @@ SEASTAR_TEST_CASE(test_cache_works_after_clearing) {
|
||||
});
|
||||
}
|
||||
|
||||
// Builds a row_cache whose last constructor argument is 0 on top of a source
// that always returns the same mutation, performs two full-range reads, and
// checks after each read how often the underlying source was invoked and what
// tracker.uncached_wide_partitions() reports.
SEASTAR_TEST_CASE(test_cache_delegates_to_underlying_only_once_for_wide_partition_full_range) {
    return seastar::async([] {
        auto s = make_schema();
        auto m = make_new_mutation(s);
        int secondary_calls_count = 0;
        cache_tracker tracker;

        // Underlying data source: counts every invocation.
        auto underlying = mutation_source([&secondary_calls_count, &m] (schema_ptr, const query::partition_range&) {
            ++secondary_calls_count;
            return make_reader_returning(m);
        });
        // Underlying key source: derived from a reader over the same mutation.
        auto keys = key_source([&m] (auto&&) {
            return make_key_from_mutation_reader(make_reader_returning(m));
        });
        row_cache cache(s, std::move(underlying), std::move(keys), tracker, 0);

        auto read_full_range = [&] {
            assert_that(cache.make_reader(s, query::full_partition_range))
                .produces(m)
                .produces_end_of_stream();
        };

        read_full_range();
        BOOST_REQUIRE_EQUAL(secondary_calls_count, 2);
        BOOST_REQUIRE_EQUAL(tracker.uncached_wide_partitions(), 1);

        read_full_range();
        BOOST_REQUIRE_EQUAL(secondary_calls_count, 3);
        BOOST_REQUIRE_EQUAL(tracker.uncached_wide_partitions(), 2);
    });
}
|
||||
|
||||
// Same scenario as the full-range variant of this test, but both reads use a
// singular partition range that targets the key of the only mutation.
SEASTAR_TEST_CASE(test_cache_delegates_to_underlying_only_once_for_wide_partition_single_partition) {
    return seastar::async([] {
        auto s = make_schema();
        auto m = make_new_mutation(s);
        int secondary_calls_count = 0;
        cache_tracker tracker;

        // Underlying data source: counts every invocation.
        auto underlying = mutation_source([&secondary_calls_count, &m] (schema_ptr, const query::partition_range&) {
            ++secondary_calls_count;
            return make_reader_returning(m);
        });
        // Underlying key source: derived from a reader over the same mutation.
        auto keys = key_source([&m] (auto&&) {
            return make_key_from_mutation_reader(make_reader_returning(m));
        });
        row_cache cache(s, std::move(underlying), std::move(keys), tracker, 0);

        auto read_single_partition = [&] {
            assert_that(cache.make_reader(s, query::partition_range::make_singular(query::ring_position(m.decorated_key()))))
                .produces(m)
                .produces_end_of_stream();
        };

        read_single_partition();
        BOOST_REQUIRE_EQUAL(secondary_calls_count, 2);
        BOOST_REQUIRE_EQUAL(tracker.uncached_wide_partitions(), 1);

        read_single_partition();
        BOOST_REQUIRE_EQUAL(secondary_calls_count, 3);
        BOOST_REQUIRE_EQUAL(tracker.uncached_wide_partitions(), 2);
    });
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_cache_delegates_to_underlying_only_once_empty_full_range) {
|
||||
return seastar::async([] {
|
||||
auto s = make_schema();
|
||||
@@ -154,10 +207,10 @@ SEASTAR_TEST_CASE(test_cache_delegates_to_underlying_only_once_empty_full_range)
|
||||
|
||||
assert_that(cache.make_reader(s, query::full_partition_range))
|
||||
.produces_end_of_stream();
|
||||
assert(secondary_calls_count.load() == 1);
|
||||
BOOST_REQUIRE_EQUAL(secondary_calls_count.load(), 1);
|
||||
assert_that(cache.make_reader(s, query::full_partition_range))
|
||||
.produces_end_of_stream();
|
||||
assert(secondary_calls_count.load() == 1);
|
||||
BOOST_REQUIRE_EQUAL(secondary_calls_count.load(), 1);
|
||||
});
|
||||
}
|
||||
|
||||
@@ -181,11 +234,11 @@ void test_cache_delegates_to_underlying_only_once_with_single_partition(schema_p
|
||||
assert_that(cache.make_reader(s, range))
|
||||
.produces(m)
|
||||
.produces_end_of_stream();
|
||||
assert(secondary_calls_count.load() == 1);
|
||||
BOOST_REQUIRE_EQUAL(secondary_calls_count.load(), 1);
|
||||
assert_that(cache.make_reader(s, range))
|
||||
.produces(m)
|
||||
.produces_end_of_stream();
|
||||
assert(secondary_calls_count.load() == 1);
|
||||
BOOST_REQUIRE_EQUAL(secondary_calls_count.load(), 1);
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_cache_delegates_to_underlying_only_once_single_key_range) {
|
||||
@@ -843,6 +896,60 @@ static key_source make_key_source(schema_ptr s, std::vector<lw_shared_ptr<memtab
|
||||
});
|
||||
}
|
||||
|
||||
// Interleaves full-range reads of a row_cache with invalidate() and clear()
// calls and checks that every reader, including ones that were started before
// the invalidation, still produces all four partitions in ring order.
SEASTAR_TEST_CASE(test_continuity_flag_and_invalidate_race) {
    return seastar::async([] {
        auto s = make_schema();
        lw_shared_ptr<memtable> mt = make_lw_shared<memtable>(s);

        cache_tracker tracker;
        row_cache cache(s, mt->as_data_source(), mt->as_key_source(), tracker);

        auto ring = make_ring(s, 4);
        for (auto&& partition : ring) {
            mt->apply(partition);
        }

        // Bring ring[2] and ring[3] to cache.
        assert_that(cache.make_reader(s, query::partition_range::make_starting_with({ ring[2].ring_position(), true })))
            .produces(ring[2])
            .produces(ring[3])
            .produces_end_of_stream();

        // Start reader with full range.
        auto rd = assert_that(cache.make_reader(s, query::full_partition_range));
        rd.produces(ring[0]);

        // Invalidate ring[2] and ring[3].
        cache.invalidate(query::partition_range::make_starting_with({ ring[2].ring_position(), true })).get();

        // Continue previous reader.
        rd.produces(ring[1])
          .produces(ring[2])
          .produces(ring[3])
          .produces_end_of_stream();

        // Start another reader with full range.
        rd = assert_that(cache.make_reader(s, query::full_partition_range));
        rd.produces(ring[0])
          .produces(ring[1])
          .produces(ring[2]);

        // Invalidate whole cache.
        cache.clear().get();

        // The reader started before clear() must still reach the end.
        rd.produces(ring[3])
          .produces_end_of_stream();

        // Start yet another reader with full range.
        assert_that(cache.make_reader(s, query::full_partition_range))
            .produces(ring[0])
            .produces(ring[1])
            .produces(ring[2])
            .produces(ring[3])
            .produces_end_of_stream();
    });
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_cache_population_and_update_race) {
|
||||
return seastar::async([] {
|
||||
auto s = make_schema();
|
||||
|
||||
@@ -41,6 +41,7 @@
|
||||
#include "range.hh"
|
||||
#include "partition_slice_builder.hh"
|
||||
#include "sstables/date_tiered_compaction_strategy.hh"
|
||||
#include "mutation_assertions.hh"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <ftw.h>
|
||||
@@ -2596,43 +2597,25 @@ SEASTAR_TEST_CASE(test_wrong_range_tombstone_order) {
|
||||
|
||||
auto smopt = reader().get0();
|
||||
BOOST_REQUIRE(smopt);
|
||||
auto& sm = *smopt;
|
||||
|
||||
using kind = mutation_fragment::kind;
|
||||
auto then_expect = [&] (kind k, std::vector<int> ck_elems) {
|
||||
std::vector<bytes> ck_bytes;
|
||||
for (auto&& e : ck_elems) {
|
||||
ck_bytes.emplace_back(int32_type->decompose(e));
|
||||
}
|
||||
auto ck = clustering_key_prefix::from_exploded(*s, std::move(ck_bytes));
|
||||
|
||||
auto mfopt = sm().get0();
|
||||
BOOST_REQUIRE(mfopt);
|
||||
if (mfopt->mutation_fragment_kind() != k) {
|
||||
abort();
|
||||
}
|
||||
BOOST_REQUIRE(mfopt->mutation_fragment_kind() == k);
|
||||
BOOST_REQUIRE(ck_eq(mfopt->key(), ck));
|
||||
};
|
||||
|
||||
then_expect(kind::range_tombstone, { 0 });
|
||||
then_expect(kind::clustering_row, { 1 });
|
||||
then_expect(kind::clustering_row, { 1, 1 });
|
||||
then_expect(kind::clustering_row, { 1, 2 });
|
||||
then_expect(kind::clustering_row, { 1, 2, 3 });
|
||||
then_expect(kind::range_tombstone, { 1, 3 });
|
||||
then_expect(kind::clustering_row, { 1, 3 });
|
||||
then_expect(kind::clustering_row, { 1, 3, 4 });
|
||||
then_expect(kind::clustering_row, { 1, 4 });
|
||||
then_expect(kind::clustering_row, { 1, 4, 0 });
|
||||
then_expect(kind::range_tombstone, { 2 });
|
||||
then_expect(kind::range_tombstone, { 2, 1 });
|
||||
then_expect(kind::range_tombstone, { 2, 1 });
|
||||
then_expect(kind::range_tombstone, { 2, 2 });
|
||||
then_expect(kind::range_tombstone, { 2, 2 });
|
||||
|
||||
auto mfopt = sm().get0();
|
||||
BOOST_REQUIRE(!mfopt);
|
||||
assert_that_stream(std::move(*smopt))
|
||||
.produces(kind::range_tombstone, { 0 })
|
||||
.produces(kind::clustering_row, { 1 })
|
||||
.produces(kind::clustering_row, { 1, 1 })
|
||||
.produces(kind::clustering_row, { 1, 2 })
|
||||
.produces(kind::clustering_row, { 1, 2, 3 })
|
||||
.produces(kind::range_tombstone, { 1, 3 })
|
||||
.produces(kind::clustering_row, { 1, 3 })
|
||||
.produces(kind::clustering_row, { 1, 3, 4 })
|
||||
.produces(kind::clustering_row, { 1, 4 })
|
||||
.produces(kind::clustering_row, { 1, 4, 0 })
|
||||
.produces(kind::range_tombstone, { 2 })
|
||||
.produces(kind::range_tombstone, { 2, 1 })
|
||||
.produces(kind::range_tombstone, { 2, 1 })
|
||||
.produces(kind::range_tombstone, { 2, 2 })
|
||||
.produces(kind::range_tombstone, { 2, 2 })
|
||||
.produces_end_of_stream();
|
||||
|
||||
smopt = reader().get0();
|
||||
BOOST_REQUIRE(!smopt);
|
||||
@@ -2790,3 +2773,253 @@ SEASTAR_TEST_CASE(basic_date_tiered_strategy_test) {
|
||||
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
// Feeds date_tiered_manifest::get_next_sstables() a set of candidate sstables
// whose timestamps either fall into one 1-hour window or outside of it, and
// checks which generations are selected.
SEASTAR_TEST_CASE(date_tiered_strategy_test_2) {
    auto s = make_lw_shared(schema({}, some_keyspace, some_column_family,
        {{"p1", utf8_type}}, {}, {}, {}, utf8_type));
    compaction_manager cm;
    column_family::config cfg;
    auto cf = make_lw_shared<column_family>(s, cfg, column_family::no_commitlog(), cm);

    // deterministic timestamp for Fri, 01 Jan 2016 00:00:00 GMT.
    auto tp = db_clock::from_time_t(1451606400);
    // Scales a time point's tick count by 1000 (microseconds, per the original note).
    auto to_timestamp = [] (auto time_point) -> int64_t {
        return time_point.time_since_epoch().count() * 1000;
    };

    std::vector<sstables::shared_sstable> candidates;
    int min_threshold = cf->schema()->min_compaction_threshold();

    // add sstables that belong to same time window until min threshold is satisfied.
    const int64_t base_timestamp = to_timestamp(tp);
    for (auto gen = 1; gen <= min_threshold; gen++) {
        candidates.push_back(add_sstable_for_overlapping_test(cf, /*gen*/gen, "a", "a",
            build_stats(base_timestamp, base_timestamp, std::numeric_limits<int32_t>::max())));
    }

    // belongs to the time window
    const int64_t inside_timestamp = to_timestamp(tp + std::chrono::seconds(1800));
    candidates.push_back(add_sstable_for_overlapping_test(cf, /*gen*/min_threshold + 1, "a", "a",
        build_stats(inside_timestamp, inside_timestamp, std::numeric_limits<int32_t>::max())));

    // doesn't belong to the time window above
    const int64_t outside_timestamp = to_timestamp(tp + std::chrono::seconds(4000));
    candidates.push_back(add_sstable_for_overlapping_test(cf, /*gen*/min_threshold + 2, "a", "a",
        build_stats(outside_timestamp, outside_timestamp, std::numeric_limits<int32_t>::max())));

    std::map<sstring, sstring> options;
    // Use a 1-hour time window.
    options.emplace(sstring("base_time_seconds"), sstring("3600"));

    date_tiered_manifest manifest(options);
    auto gc_before = gc_clock::time_point(std::chrono::seconds(0)); // disable gc before.
    auto sstables = manifest.get_next_sstables(*cf, candidates, gc_before);

    // Collect the generations that were picked.
    std::unordered_set<int64_t> gens;
    for (const auto& selected : sstables) {
        gens.insert(selected->generation());
    }

    BOOST_REQUIRE(sstables.size() == size_t(min_threshold + 1));
    BOOST_REQUIRE(gens.count(min_threshold + 1));
    BOOST_REQUIRE(!gens.count(min_threshold + 2));

    return make_ready_future<>();
}
|
||||
|
||||
// Loads the pre-built sstable under tests/sstables/promoted_index_read and
// checks the sequence of mutation fragments that reading it produces.
SEASTAR_TEST_CASE(test_promoted_index_read) {
    // Origin of the sstable, as recorded with the test:
    //
    // create table promoted_index_read (
    //        pk int,
    //        ck1 int,
    //        ck2 int,
    //        v int,
    //        primary key (pk, ck1, ck2)
    // );
    //
    // column_index_size_in_kb: 0
    //
    // delete from promoted_index_read where pk = 0 and ck1 = 0;
    // insert into promoted_index_read (pk, ck1, ck2, v) values (0, 0, 0, 0);
    // insert into promoted_index_read (pk, ck1, ck2, v) values (0, 0, 1, 1);
    //
    // SSTable:
    // [
    // {"key": "0",
    //  "cells": [["0:_","0:!",1468923292708929,"t",1468923292],
    //            ["0:_","0:!",1468923292708929,"t",1468923292],
    //            ["0:0:","",1468923308379491],
    //            ["0:_","0:!",1468923292708929,"t",1468923292],
    //            ["0:0:v","0",1468923308379491],
    //            ["0:_","0:!",1468923292708929,"t",1468923292],
    //            ["0:1:","",1468923311744298],
    //            ["0:_","0:!",1468923292708929,"t",1468923292],
    //            ["0:1:v","1",1468923311744298]]}
    // ]

    return seastar::async([] {
        auto s = schema_builder("ks", "promoted_index_read")
            .with_column("pk", int32_type, column_kind::partition_key)
            .with_column("ck1", int32_type, column_kind::clustering_key)
            .with_column("ck2", int32_type, column_kind::clustering_key)
            .with_column("v", int32_type)
            .build();

        auto sst = make_lw_shared<sstable>("ks", "promoted_index_read", "tests/sstables/promoted_index_read", 1, sstables::sstable::version_types::ka, big);
        sst->load().get0();

        // The first streamed mutation must exist.
        auto rd = sstable_reader(sst, s);
        auto smopt = rd().get0();
        BOOST_REQUIRE(smopt);

        // Expected fragments: one range tombstone, then two clustering rows.
        using kind = mutation_fragment::kind;
        assert_that_stream(std::move(*smopt))
            .produces(kind::range_tombstone, { 0 })
            .produces(kind::clustering_row, { 0, 0 })
            .produces(kind::clustering_row, { 0, 1 })
            .produces_end_of_stream();
    });
}
|
||||
|
||||
static void check_min_max_column_names(const sstable_ptr& sst, std::vector<bytes> min_components, std::vector<bytes> max_components) {
|
||||
const auto& st = sst->get_stats_metadata();
|
||||
BOOST_REQUIRE(st.min_column_names.elements.size() == min_components.size());
|
||||
BOOST_REQUIRE(st.min_column_names.elements.size() == st.max_column_names.elements.size());
|
||||
for (auto i = 0U; i < st.min_column_names.elements.size(); i++) {
|
||||
BOOST_REQUIRE(min_components[i] == st.min_column_names.elements[i].value);
|
||||
BOOST_REQUIRE(max_components[i] == st.max_column_names.elements[i].value);
|
||||
}
|
||||
}
|
||||
|
||||
// Writes one partition into a memtable, flushes it to an sstable in a
// temporary directory, reloads that sstable and checks its min/max column
// names.
//
// s              - schema of the table; must define a column named "r1".
// exploded_pk    - components of the partition key.
// exploded_cks   - clustering keys to write; when empty, a single row with an
//                  empty clustering key is inserted instead.
// min_components - expected min column name components.
// max_components - expected max column name components.
// remove         - when true, each clustering key is deleted (row tombstone)
//                  rather than inserted.
static void test_min_max_clustering_key(schema_ptr s, std::vector<bytes> exploded_pk, std::vector<std::vector<bytes>> exploded_cks,
        std::vector<bytes> min_components, std::vector<bytes> max_components, bool remove = false) {
    auto mt = make_lw_shared<memtable>(s);

    // Applies a mutation that sets r1 for the given clustering key.
    auto insert_data = [&mt, &s] (std::vector<bytes>& pk_parts, std::vector<bytes>&& ck_parts) {
        const column_definition& r1_col = *s->get_column_definition("r1");
        auto pkey = partition_key::from_exploded(*s, pk_parts);
        auto ckey = clustering_key::make_empty();
        if (!ck_parts.empty()) {
            ckey = clustering_key::from_exploded(*s, ck_parts);
        }
        mutation mut(pkey, s);
        mut.set_clustered_cell(ckey, r1_col, make_atomic_cell(int32_type->decompose(1)));
        mt->apply(std::move(mut));
    };

    // Applies a mutation that deletes the given clustering key.
    auto remove_data = [&mt, &s] (std::vector<bytes>& pk_parts, std::vector<bytes>&& ck_parts) {
        auto pkey = partition_key::from_exploded(*s, pk_parts);
        auto ckey = clustering_key::from_exploded(*s, ck_parts);
        mutation mut(pkey, s);
        tombstone tomb(api::new_timestamp(), gc_clock::now());
        mut.partition().apply_delete(*s, ckey, tomb);
        mt->apply(std::move(mut));
    };

    if (exploded_cks.empty()) {
        insert_data(exploded_pk, {});
    } else if (remove) {
        for (auto& ck_parts : exploded_cks) {
            remove_data(exploded_pk, std::move(ck_parts));
        }
    } else {
        for (auto& ck_parts : exploded_cks) {
            insert_data(exploded_pk, std::move(ck_parts));
        }
    }

    // Flush to disk, reload, and verify the recorded bounds.
    auto tmp = make_lw_shared<tmpdir>();
    auto sst = make_lw_shared<sstable>("ks", "cf", tmp->path, 1, la, big);
    sst->write_components(*mt).get();
    sst = reusable_sst(tmp->path, 1).get0();
    check_min_max_column_names(sst, std::move(min_components), std::move(max_components));
}
|
||||
|
||||
// Exercises test_min_max_clustering_key() against several schema shapes:
// two clustering columns (with and without compact storage), one clustering
// column (inserts and deletions), and no clustering column at all.
SEASTAR_TEST_CASE(min_max_clustering_key_test) {
    return seastar::async([] {
        // Two clustering columns.
        {
            auto two_ck_schema = schema_builder("ks", "cf")
                .with_column("pk", utf8_type, column_kind::partition_key)
                .with_column("ck1", utf8_type, column_kind::clustering_key)
                .with_column("ck2", utf8_type, column_kind::clustering_key)
                .with_column("r1", int32_type)
                .build();
            test_min_max_clustering_key(two_ck_schema, { "key1" }, { { "a", "b" }, { "a", "c" } }, { "a", "b" }, { "a", "c" });
        }
        // Two clustering columns, compact storage.
        {
            auto compact_schema = schema_builder("ks", "cf")
                .with(schema_builder::compact_storage::yes)
                .with_column("pk", utf8_type, column_kind::partition_key)
                .with_column("ck1", utf8_type, column_kind::clustering_key)
                .with_column("ck2", utf8_type, column_kind::clustering_key)
                .with_column("r1", int32_type)
                .build();
            test_min_max_clustering_key(compact_schema, { "key1" }, { { "a", "b" }, { "a", "c" } }, { "a", "b" }, { "a", "c" });
        }
        // One clustering column, rows inserted.
        {
            auto one_ck_schema = schema_builder("ks", "cf")
                .with_column("pk", utf8_type, column_kind::partition_key)
                .with_column("ck1", utf8_type, column_kind::clustering_key)
                .with_column("r1", int32_type)
                .build();
            test_min_max_clustering_key(one_ck_schema, { "key1" }, { { "a" }, { "z" } }, { "a" }, { "z" });
        }
        // One clustering column, rows deleted (remove = true).
        {
            auto one_ck_schema = schema_builder("ks", "cf")
                .with_column("pk", utf8_type, column_kind::partition_key)
                .with_column("ck1", utf8_type, column_kind::clustering_key)
                .with_column("r1", int32_type)
                .build();
            test_min_max_clustering_key(one_ck_schema, { "key1" }, { { "a" }, { "z" } }, { "a" }, { "z" }, true);
        }
        // No clustering column: empty expectations.
        {
            auto no_ck_schema = schema_builder("ks", "cf")
                .with_column("pk", utf8_type, column_kind::partition_key)
                .with_column("r1", int32_type)
                .build();
            test_min_max_clustering_key(no_ck_schema, { "key1" }, {}, {}, {});
        }
    });
}
|
||||
|
||||
// Writes two sstables with different clustering key ranges, checks the
// min/max column names of each, then compacts them into one sstable and
// checks that the result covers both ranges.
SEASTAR_TEST_CASE(min_max_clustering_key_test_2) {
    return seastar::async([] {
        auto s = schema_builder("ks", "cf")
            .with_column("pk", utf8_type, column_kind::partition_key)
            .with_column("ck1", utf8_type, column_kind::clustering_key)
            .with_column("r1", int32_type)
            .build();
        auto cm = make_lw_shared<compaction_manager>();
        auto cf = make_lw_shared<column_family>(s, column_family::config(), column_family::no_commitlog(), *cm);
        auto tmp = make_lw_shared<tmpdir>();
        auto mt = make_lw_shared<memtable>(s);
        const column_definition& r1_col = *s->get_column_definition("r1");

        // First sstable: partitions key0..key7, rows <p>ck100..<p>ck149 each.
        for (auto part = 0; part < 8; part++) {
            auto pkey = partition_key::from_exploded(*s, {to_bytes("key" + to_sstring(part))});
            mutation partition_mut(pkey, s);
            for (auto row = 100; row < 150; row++) {
                auto c_key = clustering_key::from_exploded(*s, {to_bytes(to_sstring(part) + "ck" + to_sstring(row))});
                partition_mut.set_clustered_cell(c_key, r1_col, make_atomic_cell(int32_type->decompose(1)));
            }
            mt->apply(std::move(partition_mut));
        }
        auto sst = make_lw_shared<sstable>("ks", "cf", tmp->path, 1, la, big);
        sst->write_components(*mt).get();
        sst = reusable_sst(tmp->path, 1).get0();
        check_min_max_column_names(sst, { "0ck100" }, { "7ck149" });

        // Second sstable: partition key9, rows 9ck101..9ck298.
        mt = make_lw_shared<memtable>(s);
        auto key = partition_key::from_exploded(*s, {to_bytes("key9")});
        mutation m(key, s);
        for (auto row = 101; row < 299; row++) {
            auto c_key = clustering_key::from_exploded(*s, {to_bytes(to_sstring(9) + "ck" + to_sstring(row))});
            m.set_clustered_cell(c_key, r1_col, make_atomic_cell(int32_type->decompose(1)));
        }
        mt->apply(std::move(m));
        auto sst2 = make_lw_shared<sstable>("ks", "cf", tmp->path, 2, la, big);
        sst2->write_components(*mt).get();
        sst2 = reusable_sst(tmp->path, 2).get0();
        check_min_max_column_names(sst2, { "9ck101" }, { "9ck298" });

        // Compact both into generation 3 and verify the combined bounds.
        auto creator = [tmp] { return make_lw_shared<sstables::sstable>("ks", "cf", tmp->path, 3, la, big); };
        auto new_sstables = sstables::compact_sstables({ sst, sst2 }, *cf, creator, std::numeric_limits<uint64_t>::max(), 0).get0();
        BOOST_REQUIRE(new_sstables.size() == 1);
        check_min_max_column_names(new_sstables.front(), { "0ck100" }, { "9ck298" });
    });
}
|
||||
|
||||
@@ -404,14 +404,14 @@ SEASTAR_TEST_CASE(test_sstable_can_write_and_read_range_tombstone) {
|
||||
auto mt = make_lw_shared<memtable>(s);
|
||||
mt->apply(std::move(m));
|
||||
|
||||
auto sst = sstables::sstable("ks", "cf",
|
||||
auto sst = make_lw_shared<sstables::sstable>("ks", "cf",
|
||||
dir->path,
|
||||
1 /* generation */,
|
||||
sstables::sstable::version_types::la,
|
||||
sstables::sstable::format_types::big);
|
||||
sst.write_components(*mt).get();
|
||||
sst.load().get();
|
||||
auto mr = sst.read_rows(s);
|
||||
sst->write_components(*mt).get();
|
||||
sst->load().get();
|
||||
auto mr = sst->read_rows(s);
|
||||
auto sm = mr.read().get0();
|
||||
auto mut = mutation_from_streamed_mutation(std::move(sm)).get0();
|
||||
BOOST_REQUIRE(bool(mut));
|
||||
@@ -793,3 +793,36 @@ SEASTAR_TEST_CASE(tombstone_in_tombstone2) {
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
// Writes a single row of a compact-storage table whose int32 clustering key
// is 0xffff0000 to an sstable, reads it back, and requires that a mutation
// is produced.
SEASTAR_TEST_CASE(test_non_compound_table_row_is_not_marked_as_static) {
    return seastar::async([] {
        auto dir = make_lw_shared<tmpdir>();

        schema_builder builder("ks", "cf");
        builder.with_column("p", utf8_type, column_kind::partition_key)
               .with_column("c", int32_type, column_kind::clustering_key)
               .with_column("v", int32_type);
        auto s = builder.build(schema_builder::compact_storage::yes);

        auto pkey = partition_key::from_exploded(*s, {to_bytes("key1")});
        auto ckey = clustering_key::from_exploded(*s, {int32_type->decompose(static_cast<int32_t>(0xffff0000))});

        // One live cell in column "v" for that clustering key.
        mutation row_mutation(pkey, s);
        auto cell = atomic_cell::make_live(1, int32_type->decompose(17), { });
        row_mutation.set_clustered_cell(ckey, *s->get_column_definition("v"), std::move(cell));

        auto mt = make_lw_shared<memtable>(s);
        mt->apply(std::move(row_mutation));

        // Round-trip through an on-disk sstable.
        auto sst = make_lw_shared<sstables::sstable>("ks", "cf",
                dir->path,
                1 /* generation */,
                sstables::sstable::version_types::ka,
                sstables::sstable::format_types::big);
        sst->write_components(*mt).get();
        sst->load().get();

        auto mr = sst->read_rows(s);
        auto sm = mr.read().get0();
        auto mut = mutation_from_streamed_mutation(std::move(sm)).get0();
        BOOST_REQUIRE(bool(mut));
    });
}
|
||||
|
||||
@@ -37,6 +37,8 @@
|
||||
#include <memory>
|
||||
#include "sstable_test.hh"
|
||||
#include "tmpdir.hh"
|
||||
#include "partition_slice_builder.hh"
|
||||
#include "tests/test_services.hh"
|
||||
|
||||
#include "disk-error-handler.hh"
|
||||
|
||||
@@ -447,7 +449,7 @@ SEASTAR_TEST_CASE(compressed_row_read_at_once) {
|
||||
SEASTAR_TEST_CASE(uncompressed_rows_read_one) {
|
||||
return reusable_sst("tests/sstables/uncompressed", 1).then([] (auto sstp) {
|
||||
return do_with(test_row_consumer(1418656871665302), [sstp] (auto& c) {
|
||||
auto context = sstp->data_consume_rows(c, 0, 95);
|
||||
auto context = sstp->data_consume_rows(c, {0, 95});
|
||||
auto fut = context.read();
|
||||
return fut.then([sstp, &c, context = std::move(context)] {
|
||||
BOOST_REQUIRE(c.count_row_start == 1);
|
||||
@@ -464,7 +466,7 @@ SEASTAR_TEST_CASE(uncompressed_rows_read_one) {
|
||||
SEASTAR_TEST_CASE(compressed_rows_read_one) {
|
||||
return reusable_sst("tests/sstables/compressed", 1).then([] (auto sstp) {
|
||||
return do_with(test_row_consumer(1418654707438005), [sstp] (auto& c) {
|
||||
auto context = sstp->data_consume_rows(c, 0, 95);
|
||||
auto context = sstp->data_consume_rows(c, {0, 95});
|
||||
auto fut = context.read();
|
||||
return fut.then([sstp, &c, context = std::move(context)] {
|
||||
BOOST_REQUIRE(c.count_row_start == 1);
|
||||
@@ -947,3 +949,274 @@ SEASTAR_TEST_CASE(statistics_rewrite) {
|
||||
});
|
||||
}, "tests/sstables/generation");
|
||||
}
|
||||
|
||||
// Tests for reading a large partition for which the index contains a
|
||||
// "promoted index", i.e., a sample of the column names inside the partition,
|
||||
// with which we can avoid reading the entire partition when we look only
|
||||
// for a specific subset of columns. The test sstable for the read test was
|
||||
// generated in Cassandra.
|
||||
|
||||
// Returns the schema of table "data" in keyspace "try1" (utf8 columns t1 as
// partition key, t2 as clustering key, t3 as regular column). The schema is
// built once per thread and the same pointer is handed out afterwards.
static schema_ptr large_partition_schema() {
    static thread_local auto cached_schema = [] {
        auto base = make_lw_shared(schema(
                generate_legacy_id("try1", "data"), "try1", "data",
                // partition key
                {{"t1", utf8_type}},
                // clustering key
                {{"t2", utf8_type}},
                // regular columns
                {{"t3", utf8_type}},
                // static columns
                {},
                // regular column name type
                utf8_type,
                // comment
                ""
        ));
        schema_builder builder(base);
        return builder.build(schema_builder::compact_storage::no);
    }();
    return cached_schema;
}
|
||||
|
||||
// Creates the sstable object for tests/sstables/large_partition (generation
// 3, version ka), starts loading it, and returns a future that resolves to
// the sstable pointer once load() has completed.
static future<lw_shared_ptr<sstable>> load_large_partition_sst() {
    auto sst = make_lw_shared<sstable>(
            "try1", "data", "tests/sstables/large_partition", 3,
            sstables::sstable::version_types::ka, big);
    // Start the load before the pointer is captured by the continuation.
    auto loaded = sst->load();
    return std::move(loaded).then([sst] {
        return sst;
    });
}
|
||||
|
||||
// This is a rudimentary test that reads an sstable exported from Cassandra
|
||||
// which contains a promoted index. It just checks that the promoted index
|
||||
// is read from disk, as an unparsed array, and doesn't actually use it to
|
||||
// search for anything.
|
||||
SEASTAR_TEST_CASE(promoted_index_read) {
|
||||
return load_large_partition_sst().then([] (auto sstp) {
|
||||
schema_ptr s = large_partition_schema();
|
||||
return sstables::test(sstp).read_indexes(0).then([sstp] (index_list vec) {
|
||||
BOOST_REQUIRE(vec.size() == 1);
|
||||
index_entry &e = vec[0];
|
||||
BOOST_REQUIRE(e.get_promoted_index_bytes().size() == 468);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
// Use an empty string for ck1, ck2, or both, for unbounded ranges.
|
||||
static query::partition_slice make_partition_slice(const schema& s, sstring ck1, sstring ck2) {
|
||||
std::experimental::optional<query::clustering_range::bound> b1;
|
||||
if (!ck1.empty()) {
|
||||
b1.emplace(clustering_key_prefix::from_single_value(
|
||||
s, utf8_type->decompose(ck1)));
|
||||
}
|
||||
std::experimental::optional<query::clustering_range::bound> b2;
|
||||
if (!ck2.empty()) {
|
||||
b2.emplace(clustering_key_prefix::from_single_value(
|
||||
s, utf8_type->decompose(ck2)));
|
||||
}
|
||||
return partition_slice_builder(s).
|
||||
with_range(query::clustering_range(b1, b2)).build();
|
||||
}
|
||||
|
||||
// Count the number of CQL rows in one partition between clustering key
|
||||
// prefix ck1 to ck2.
|
||||
static future<int> count_rows(sstable_ptr sstp, schema_ptr s, sstring key, sstring ck1, sstring ck2) {
|
||||
return seastar::async([sstp, s, key, ck1, ck2] () mutable {
|
||||
auto ps = make_partition_slice(*s, ck1, ck2);
|
||||
auto row = sstp->read_row(s, sstables::key(key.c_str()),
|
||||
query::clustering_key_filtering_context::create(s, ps)).get0();
|
||||
if (!row) {
|
||||
return 0;
|
||||
}
|
||||
int nrows = 0;
|
||||
auto mfopt = (*row)().get0();
|
||||
while (mfopt) {
|
||||
if (mfopt->is_clustering_row()) {
|
||||
nrows++;
|
||||
}
|
||||
mfopt = (*row)().get0();
|
||||
}
|
||||
return nrows;
|
||||
});
|
||||
}
|
||||
|
||||
// Count the number of CQL rows in one partition
|
||||
static future<int> count_rows(sstable_ptr sstp, schema_ptr s, sstring key) {
|
||||
return seastar::async([sstp, s, key] () mutable {
|
||||
auto row = sstp->read_row(s, sstables::key(key.c_str())).get0();
|
||||
if (!row) {
|
||||
return 0;
|
||||
}
|
||||
int nrows = 0;
|
||||
auto mfopt = (*row)().get0();
|
||||
while (mfopt) {
|
||||
if (mfopt->is_clustering_row()) {
|
||||
nrows++;
|
||||
}
|
||||
mfopt = (*row)().get0();
|
||||
}
|
||||
return nrows;
|
||||
});
|
||||
}
|
||||
|
||||
// Count the number of CQL rows between clustering key prefix ck1 to ck2
|
||||
// in all partitions in the sstable (using sstable::read_range_rows).
|
||||
static future<int> count_rows(sstable_ptr sstp, schema_ptr s, sstring ck1, sstring ck2) {
|
||||
return seastar::async([sstp, s, ck1, ck2] () mutable {
|
||||
auto ps = make_partition_slice(*s, ck1, ck2);
|
||||
auto reader = sstp->read_range_rows(s, query::full_partition_range,
|
||||
query::clustering_key_filtering_context::create(s, ps));
|
||||
int nrows = 0;
|
||||
auto smopt = reader.read().get0();
|
||||
while (smopt) {
|
||||
auto mfopt = (*smopt)().get0();
|
||||
while (mfopt) {
|
||||
if (mfopt->is_clustering_row()) {
|
||||
nrows++;
|
||||
}
|
||||
mfopt = (*smopt)().get0();
|
||||
}
|
||||
smopt = reader.read().get0();
|
||||
}
|
||||
return nrows;
|
||||
});
|
||||
}
|
||||
|
||||
// This test reads, using sstable::read_row(), a slice (a range of clustering
|
||||
// rows) from one large partition in an sstable written in Cassandra.
|
||||
// This large partition includes 13520 clustering rows, and spans about
|
||||
// 700 KB on disk. When we ask to read only a part of it, the promoted index
|
||||
// (included in this sstable) may be used to allow reading only a part of the
|
||||
// partition from disk. This test doesn't directly verify that the promoted
|
||||
// index is actually used - and can work even without a promoted index
|
||||
// support - but can be used to check that adding promoted index read support
|
||||
// did not break anything.
|
||||
// To verify that the promoted index was actually used to reduce the size
|
||||
// of read from disk, add printouts to the row reading code.
|
||||
SEASTAR_TEST_CASE(sub_partition_read) {
|
||||
schema_ptr s = large_partition_schema();
|
||||
return load_large_partition_sst().then([s] (auto sstp) {
|
||||
return count_rows(sstp, s, "v1", "18wX", "18xB").then([] (int nrows) {
|
||||
// there should be 5 rows (out of 13520 = 20*26*26) in this range:
|
||||
// 18wX, 18wY, 18wZ, 18xA, 18xB.
|
||||
BOOST_REQUIRE(nrows == 5);
|
||||
}).then([sstp, s] () {
|
||||
return count_rows(sstp, s, "v1", "13aB", "15aA").then([] (int nrows) {
|
||||
// There should be 26*26*2 rows in this range. It spans two
|
||||
// promoted-index blocks, so we get to test that case.
|
||||
BOOST_REQUIRE(nrows == 2*26*26);
|
||||
});
|
||||
}).then([sstp, s] () {
|
||||
return count_rows(sstp, s, "v1", "10aB", "19aA").then([] (int nrows) {
|
||||
// There should be 26*26*9 rows in this range. It spans many
|
||||
// promoted-index blocks.
|
||||
BOOST_REQUIRE(nrows == 9*26*26);
|
||||
});
|
||||
}).then([sstp, s] () {
|
||||
return count_rows(sstp, s, "v1", "0", "z").then([] (int nrows) {
|
||||
// All rows, 20*26*26 of them, are in this range. It spans all
|
||||
// the promoted-index blocks, but the range is still bounded
|
||||
// on both sides
|
||||
BOOST_REQUIRE(nrows == 20*26*26);
|
||||
});
|
||||
}).then([sstp, s] () {
|
||||
// range that is outside (after) the actual range of the data.
|
||||
// No rows should match.
|
||||
return count_rows(sstp, s, "v1", "y", "z").then([] (int nrows) {
|
||||
BOOST_REQUIRE(nrows == 0);
|
||||
});
|
||||
}).then([sstp, s] () {
|
||||
// range that is outside (before) the actual range of the data.
|
||||
// No rows should match.
|
||||
return count_rows(sstp, s, "v1", "_a", "_b").then([] (int nrows) {
|
||||
BOOST_REQUIRE(nrows == 0);
|
||||
});
|
||||
}).then([sstp, s] () {
|
||||
// half-infinite range
|
||||
return count_rows(sstp, s, "v1", "", "10aA").then([] (int nrows) {
|
||||
BOOST_REQUIRE(nrows == (1*26*26 + 1));
|
||||
});
|
||||
}).then([sstp, s] () {
|
||||
// half-infinite range
|
||||
return count_rows(sstp, s, "v1", "10aA", "").then([] (int nrows) {
|
||||
BOOST_REQUIRE(nrows == 19*26*26);
|
||||
});
|
||||
}).then([sstp, s] () {
|
||||
// count all rows, but giving an explicit all-encompasing filter
|
||||
return count_rows(sstp, s, "v1", "", "").then([] (int nrows) {
|
||||
BOOST_REQUIRE(nrows == 20*26*26);
|
||||
});
|
||||
}).then([sstp, s] () {
|
||||
// count all rows, without a filter
|
||||
return count_rows(sstp, s, "v1").then([] (int nrows) {
|
||||
BOOST_REQUIRE(nrows == 20*26*26);
|
||||
});
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
// Same as previous test, just using read_range_rows instead of read_row
|
||||
// to read parts of potentially more than one partition (in this particular
|
||||
// sstable, there is actually just one partition).
|
||||
SEASTAR_TEST_CASE(sub_partitions_read) {
|
||||
schema_ptr s = large_partition_schema();
|
||||
return load_large_partition_sst().then([s] (auto sstp) {
|
||||
return count_rows(sstp, s, "18wX", "18xB").then([] (int nrows) {
|
||||
BOOST_REQUIRE(nrows == 5);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
// A silly, inefficient but effective, way to compare two files by reading
|
||||
// them entirely into memory.
|
||||
static future<> compare_files(sstring file1, sstring file2) {
|
||||
return read_file(file1).then([file2] (auto in1) {
|
||||
return read_file(file2).then([in1 = std::move(in1)] (auto in2) {
|
||||
// assert that both files have the same size.
|
||||
BOOST_REQUIRE(in1.second == in2.second);
|
||||
// assert that both files have the same content.
|
||||
BOOST_REQUIRE(::memcmp(in1.first.get(), in2.first.get(), in1.second) == 0);
|
||||
});
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
// This test creates the same data as we previously created with Cassandra
|
||||
// in the tests/sstables/large_partition directory (which we read in the
|
||||
// promoted_index_read test above). The index file in both sstables - which
|
||||
// includes the promoted index - should be bit-for-bit identical, otherwise
|
||||
// we have a problem in our promoted index writing code (or in the data
|
||||
// writing code, because the promoted index points to offsets in the data).
|
||||
SEASTAR_TEST_CASE(promoted_index_write) {
|
||||
return test_setup::do_with_test_directory([] {
|
||||
auto s = large_partition_schema();
|
||||
auto mtp = make_lw_shared<memtable>(s);
|
||||
auto key = partition_key::from_exploded(*s, {to_bytes("v1")});
|
||||
mutation m(key, s);
|
||||
auto col = s->get_column_definition("t3");
|
||||
BOOST_REQUIRE(col && !col->is_static());
|
||||
for (char i = 'a'; i <= 'z'; i++) {
|
||||
for (char j = 'A'; j <= 'Z'; j++) {
|
||||
for (int k = 0; k < 20; k++) {
|
||||
auto& row = m.partition().clustered_row(
|
||||
clustering_key::from_exploded(
|
||||
*s, {to_bytes(sprint("%d%c%c", k, i, j))}));
|
||||
row.cells().apply(*col,
|
||||
atomic_cell::make_live(2345,
|
||||
col->type->decompose(sstring(sprint("z%c",i)))));
|
||||
row.apply(row_marker(1234));
|
||||
}
|
||||
}
|
||||
}
|
||||
mtp->apply(std::move(m));
|
||||
auto sst = make_lw_shared<sstable>("try1", "data",
|
||||
"tests/sstables/tests-temporary", 100,
|
||||
sstables::sstable::version_types::ka, big);
|
||||
return sst->write_components(*mtp).then([s] {
|
||||
return compare_files(
|
||||
"tests/sstables/large_partition/try1-data-ka-3-Index.db",
|
||||
"tests/sstables/tests-temporary/try1-data-ka-100-Index.db");
|
||||
}).then([sst, mtp] {});
|
||||
});
|
||||
}
|
||||
|
||||
BIN
tests/sstables/large_partition/try1-data-ka-3-CompressionInfo.db
Normal file
BIN
tests/sstables/large_partition/try1-data-ka-3-CompressionInfo.db
Normal file
Binary file not shown.
BIN
tests/sstables/large_partition/try1-data-ka-3-Data.db
Normal file
BIN
tests/sstables/large_partition/try1-data-ka-3-Data.db
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 263 KiB |
@@ -0,0 +1 @@
|
||||
2833048369
|
||||
BIN
tests/sstables/large_partition/try1-data-ka-3-Filter.db
Normal file
BIN
tests/sstables/large_partition/try1-data-ka-3-Filter.db
Normal file
Binary file not shown.
BIN
tests/sstables/large_partition/try1-data-ka-3-Index.db
Normal file
BIN
tests/sstables/large_partition/try1-data-ka-3-Index.db
Normal file
Binary file not shown.
BIN
tests/sstables/large_partition/try1-data-ka-3-Statistics.db
Normal file
BIN
tests/sstables/large_partition/try1-data-ka-3-Statistics.db
Normal file
Binary file not shown.
BIN
tests/sstables/large_partition/try1-data-ka-3-Summary.db
Normal file
BIN
tests/sstables/large_partition/try1-data-ka-3-Summary.db
Normal file
Binary file not shown.
8
tests/sstables/large_partition/try1-data-ka-3-TOC.txt
Normal file
8
tests/sstables/large_partition/try1-data-ka-3-TOC.txt
Normal file
@@ -0,0 +1,8 @@
|
||||
Data.db
|
||||
CompressionInfo.db
|
||||
Index.db
|
||||
Summary.db
|
||||
Statistics.db
|
||||
Digest.sha1
|
||||
TOC.txt
|
||||
Filter.db
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user