Files
scylladb/mutation/frozen_mutation.cc
Benny Halevy 3feb759943 everywhere: use utils::chunked_vector for list of mutations
Currently, we use std::vector<*mutation> to keep
a list of mutations for processing.
This can lead to large allocations, e.g. when the vector
size is a function of the number of tables.

Use a chunked vector instead to prevent oversized allocations.

`perf-simple-query --smp 1` results obtained for fixed 400MHz frequency
and PGO disabled:

Before (read path):
```
enable-cache=1
Running test with config: {partitions=10000, concurrency=100, mode=read, query_single_key=no, counters=no}
Disabling auto compaction
Creating 10000 partitions...

89055.97 tps ( 66.1 allocs/op,   0.0 logallocs/op,  14.2 tasks/op,   39417 insns/op,   18003 cycles/op,        0 errors)
103372.72 tps ( 66.1 allocs/op,   0.0 logallocs/op,  14.2 tasks/op,   39380 insns/op,   17300 cycles/op,        0 errors)
98942.27 tps ( 66.1 allocs/op,   0.0 logallocs/op,  14.2 tasks/op,   39413 insns/op,   17336 cycles/op,        0 errors)
103752.93 tps ( 66.1 allocs/op,   0.0 logallocs/op,  14.2 tasks/op,   39407 insns/op,   17252 cycles/op,        0 errors)
102516.77 tps ( 66.1 allocs/op,   0.0 logallocs/op,  14.2 tasks/op,   39403 insns/op,   17288 cycles/op,        0 errors)
throughput:
	mean=   99528.13 standard-deviation=6155.71
	median= 102516.77 median-absolute-deviation=3844.59
	maximum=103752.93 minimum=89055.97
instructions_per_op:
	mean=   39403.99 standard-deviation=14.25
	median= 39406.75 median-absolute-deviation=9.30
	maximum=39416.63 minimum=39380.39
cpu_cycles_per_op:
	mean=   17435.81 standard-deviation=318.24
	median= 17300.40 median-absolute-deviation=147.59
	maximum=18002.53 minimum=17251.75
```

After (read path)
```
enable-cache=1
Running test with config: {partitions=10000, concurrency=100, mode=read, query_single_key=no, counters=no}
Disabling auto compaction
Creating 10000 partitions...
59755.04 tps ( 66.2 allocs/op,   0.0 logallocs/op,  14.2 tasks/op,   39466 insns/op,   22834 cycles/op,        0 errors)
71854.16 tps ( 66.1 allocs/op,   0.0 logallocs/op,  14.2 tasks/op,   39417 insns/op,   17883 cycles/op,        0 errors)
82149.45 tps ( 66.1 allocs/op,   0.0 logallocs/op,  14.2 tasks/op,   39411 insns/op,   17409 cycles/op,        0 errors)
49640.04 tps ( 66.1 allocs/op,   0.0 logallocs/op,  14.3 tasks/op,   39474 insns/op,   19975 cycles/op,        0 errors)
54963.22 tps ( 66.1 allocs/op,   0.0 logallocs/op,  14.3 tasks/op,   39474 insns/op,   18235 cycles/op,        0 errors)
throughput:
	mean=   63672.38 standard-deviation=13195.12
	median= 59755.04 median-absolute-deviation=8709.16
	maximum=82149.45 minimum=49640.04
instructions_per_op:
	mean=   39448.38 standard-deviation=31.60
	median= 39466.17 median-absolute-deviation=25.75
	maximum=39474.12 minimum=39411.42
cpu_cycles_per_op:
	mean=   19267.01 standard-deviation=2217.03
	median= 18234.80 median-absolute-deviation=1384.25
	maximum=22834.26 minimum=17408.67
```

`perf-simple-query --smp 1 --write` results obtained for fixed 400MHz frequency
and PGO disabled:

Before (write path):
```
enable-cache=1
Running test with config: {partitions=10000, concurrency=100, mode=write, query_single_key=no, counters=no}
Disabling auto compaction
63736.96 tps ( 59.4 allocs/op,  16.4 logallocs/op,  14.3 tasks/op,   49667 insns/op,   19924 cycles/op,        0 errors)
64109.41 tps ( 59.3 allocs/op,  16.0 logallocs/op,  14.3 tasks/op,   49992 insns/op,   20084 cycles/op,        0 errors)
56950.47 tps ( 59.3 allocs/op,  16.0 logallocs/op,  14.3 tasks/op,   50005 insns/op,   20501 cycles/op,        0 errors)
44858.42 tps ( 59.3 allocs/op,  16.0 logallocs/op,  14.3 tasks/op,   50014 insns/op,   21947 cycles/op,        0 errors)
28592.87 tps ( 59.3 allocs/op,  16.0 logallocs/op,  14.3 tasks/op,   50027 insns/op,   27659 cycles/op,        0 errors)
throughput:
	mean=   51649.63 standard-deviation=15059.74
	median= 56950.47 median-absolute-deviation=12087.33
	maximum=64109.41 minimum=28592.87
instructions_per_op:
	mean=   49941.18 standard-deviation=153.76
	median= 50005.24 median-absolute-deviation=73.01
	maximum=50027.07 minimum=49667.05
cpu_cycles_per_op:
	mean=   22023.01 standard-deviation=3249.92
	median= 20500.74 median-absolute-deviation=1938.76
	maximum=27658.75 minimum=19924.32
```

After (write path)
```
enable-cache=1
Running test with config: {partitions=10000, concurrency=100, mode=write, query_single_key=no, counters=no}
Disabling auto compaction
53395.93 tps ( 59.4 allocs/op,  16.5 logallocs/op,  14.3 tasks/op,   50326 insns/op,   21252 cycles/op,        0 errors)
46527.83 tps ( 59.3 allocs/op,  16.0 logallocs/op,  14.3 tasks/op,   50704 insns/op,   21555 cycles/op,        0 errors)
55846.30 tps ( 59.3 allocs/op,  16.0 logallocs/op,  14.3 tasks/op,   50731 insns/op,   21060 cycles/op,        0 errors)
55669.30 tps ( 59.3 allocs/op,  16.0 logallocs/op,  14.3 tasks/op,   50735 insns/op,   21521 cycles/op,        0 errors)
52130.17 tps ( 59.3 allocs/op,  16.0 logallocs/op,  14.3 tasks/op,   50757 insns/op,   21334 cycles/op,        0 errors)
throughput:
	mean=   52713.91 standard-deviation=3795.38
	median= 53395.93 median-absolute-deviation=2955.40
	maximum=55846.30 minimum=46527.83
instructions_per_op:
	mean=   50650.57 standard-deviation=182.46
	median= 50731.38 median-absolute-deviation=84.09
	maximum=50756.62 minimum=50325.87
cpu_cycles_per_op:
	mean=   21344.42 standard-deviation=202.86
	median= 21334.00 median-absolute-deviation=176.37
	maximum=21554.61 minimum=21060.24
```

Fixes #24815

Improvement for rare corner cases. No backport required.

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>

Closes scylladb/scylladb#24919
2025-07-13 19:13:11 +03:00

279 lines
8.8 KiB
C++

/*
* Copyright (C) 2015-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#include <seastar/core/coroutine.hh>
#include "frozen_mutation.hh"
#include "schema/schema_registry.hh"
#include "mutation_partition.hh"
#include "mutation.hh"
#include "counters.hh"
#include "partition_builder.hh"
#include "mutation_partition_serializer.hh"
#include "query-result-set.hh"
#include "idl/mutation.dist.hh"
#include "idl/mutation.dist.impl.hh"
#include "readers/mutation_reader.hh"
#include "converting_mutation_partition_applier.hh"
#include "mutation_partition_view.hh"
//
// Representation layout:
//
// <mutation> ::= <column-family-id> <schema-version> <partition-key> <partition>
//
using namespace db;
// Deserializes a ser::mutation_view from an input stream opened over _bytes.
// A fresh stream is created on every call.
ser::mutation_view frozen_mutation::mutation_view() const {
    auto stream = ser::as_input_stream(_bytes);
    return ser::deserialize(stream, std::type_identity<ser::mutation_view>{});
}
// Returns the table id read back from the serialized mutation view.
table_id frozen_mutation::column_family_id() const {
    auto view = mutation_view();
    return view.table_id();
}
// Returns the schema version read back from the serialized mutation view.
table_schema_version frozen_mutation::schema_version() const {
    auto view = mutation_view();
    return view.schema_version();
}
// Returns a view of the cached partition key (_pk); the serialized bytes
// are not consulted here.
partition_key_view frozen_mutation::key() const {
    return _pk;
}
// Decorates this mutation's partition key using schema `s`
// (delegates to dht::decorate_key()).
dht::decorated_key frozen_mutation::decorated_key(const schema& s) const {
    return dht::decorate_key(s, key());
}
// Extracts the partition key from the serialized representation.
// Used to populate _pk when only the raw bytes are available.
partition_key frozen_mutation::deserialize_key() const {
    auto view = mutation_view();
    return view.key();
}
// Takes ownership of an already-serialized mutation and derives the
// partition key from it.
//
// NOTE(review): deserialize_key() reads _bytes, so this relies on _bytes
// being declared (and therefore initialized) before _pk — confirm in the header.
frozen_mutation::frozen_mutation(bytes_ostream&& b)
    : _bytes(std::move(b))
    , _pk(deserialize_key())
{
    _bytes.reduce_chunk_count();
}
// Takes ownership of an already-serialized mutation together with its
// partition key, so the key does not have to be deserialized from `b`.
frozen_mutation::frozen_mutation(bytes_ostream&& b, partition_key pk)
    : _bytes(std::move(b))
    , _pk(std::move(pk))
{
    _bytes.reduce_chunk_count();
}
// Serializes mutation `m` into _bytes in the order
// table id, schema version, partition key, partition,
// and caches a copy of the partition key in _pk.
frozen_mutation::frozen_mutation(const mutation& m)
    : _pk(m.key())
{
    // `m` keeps its schema alive for the duration of this constructor.
    const auto& sch = *m.schema();
    mutation_partition_serializer serializer(sch, m.partition());
    ser::writer_of_mutation<bytes_ostream> writer(_bytes);
    std::move(writer).write_table_id(sch.id())
            .write_schema_version(sch.version())
            .write_key(m.key())
            .partition([&] (auto partition_writer) {
                serializer.write(std::move(partition_writer));
            }).end_mutation();
    _bytes.reduce_chunk_count();
}
// Rebuilds a `mutation` from the serialized form using `schema`.
//
// check_schema_version() is called first with the recorded schema version and
// `schema` (presumably it rejects a mismatch — defined elsewhere, confirm).
// Any exception raised while replaying the partition is rethrown nested inside
// a std::runtime_error that names the key and table.
mutation frozen_mutation::unfreeze(schema_ptr schema) const {
    check_schema_version(schema_version(), *schema);

    mutation result(schema, key());
    partition_builder builder(*schema, result.partition());
    try {
        partition().accept(*schema, builder);
    } catch (...) {
        std::throw_with_nested(std::runtime_error(format(
                "frozen_mutation::unfreeze(): failed unfreezing mutation {} of {}.{}",
                key(), schema->ks_name(), schema->cf_name())));
    }
    return result;
}
// Rebuilds a `mutation` for `schema` from data that was serialized with the
// column mapping `cm`, converting through converting_mutation_partition_applier.
// Unlike unfreeze(), no schema version check is performed here.
// Any exception raised while replaying the partition is rethrown nested inside
// a std::runtime_error that names the key and table.
mutation frozen_mutation::unfreeze_upgrading(schema_ptr schema, const column_mapping& cm) const {
    mutation result(schema, key());
    converting_mutation_partition_applier applier(cm, *schema, result.partition());
    try {
        partition().accept(cm, applier);
    } catch (...) {
        std::throw_with_nested(std::runtime_error(format(
                "frozen_mutation::unfreeze_upgrading(): failed unfreezing mutation {} of {}.{}",
                key(), schema->ks_name(), schema->cf_name())));
    }
    return result;
}
// Convenience wrapper: serializes a single mutation into a frozen_mutation.
frozen_mutation freeze(const mutation& m) {
    return frozen_mutation(m);
}
// Freezes every mutation in `muts`, preserving order, and collects the
// results into a chunked vector.
utils::chunked_vector<frozen_mutation> freeze(const utils::chunked_vector<mutation>& muts) {
    const auto freeze_one = [] (const mutation& m) {
        return freeze(m);
    };
    return std::ranges::to<utils::chunked_vector<frozen_mutation>>(
            muts | std::views::transform(freeze_one));
}
// Unfreezes every frozen mutation in `muts`, preserving order. The schema for
// each element is looked up in the local schema registry by the schema
// version recorded in that element.
utils::chunked_vector<mutation> unfreeze(const utils::chunked_vector<frozen_mutation>& muts) {
    const auto unfreeze_one = [] (const frozen_mutation& fm) {
        return fm.unfreeze(local_schema_registry().get(fm.schema_version()));
    };
    return std::ranges::to<utils::chunked_vector<mutation>>(
            muts | std::views::transform(unfreeze_one));
}
// Returns a mutation_partition_view built from the partition part of the
// serialized mutation.
mutation_partition_view frozen_mutation::partition() const {
    auto view = mutation_view();
    return mutation_partition_view::from_view(view.partition());
}
// Creates a printer bound to this frozen mutation and schema `s`.
// The printer is given `*this`, so (presumably) it must not outlive this
// object — confirm against the printer definition in the header.
frozen_mutation::printer frozen_mutation::pretty_printer(schema_ptr s) const {
    return printer{ *this, std::move(s) };
}
// Records the partition tombstone; always asks the producer to continue.
stop_iteration streamed_mutation_freezer::consume(tombstone pt) {
    _partition_tombstone = pt;
    return stop_iteration::no;
}
// Stores the static row (replacing any previously stored one);
// always asks the producer to continue.
stop_iteration streamed_mutation_freezer::consume(static_row&& sr) {
    _sr = std::move(sr);
    return stop_iteration::no;
}
// Appends the clustering row to the buffered rows;
// always asks the producer to continue.
stop_iteration streamed_mutation_freezer::consume(clustering_row&& cr) {
    _crs.emplace_back(std::move(cr));
    return stop_iteration::no;
}
// Applies the range tombstone to the buffered range tombstone list;
// always asks the producer to continue.
stop_iteration streamed_mutation_freezer::consume(range_tombstone&& rt) {
    _rts.apply(_schema, std::move(rt));
    return stop_iteration::no;
}
// Serializes everything buffered so far (partition tombstone, static row,
// range tombstones, clustering rows) into a single frozen_mutation.
// The buffered fragments and _key are moved from, so the freezer's buffered
// state is consumed by this call.
frozen_mutation streamed_mutation_freezer::consume_end_of_stream() {
    bytes_ostream buf;
    ser::writer_of_mutation<bytes_ostream> writer(buf);
    std::move(writer).write_table_id(_schema.id())
            .write_schema_version(_schema.version())
            .write_key(_key)
            .partition([&] (auto partition_writer) {
                serialize_mutation_fragments(_schema, _partition_tombstone,
                        std::move(_sr), std::move(_rts),
                        std::move(_crs), std::move(partition_writer));
            }).end_mutation();
    return frozen_mutation(std::move(buf), std::move(_key));
}
// Mutation-fragment consumer that buffers the fragments of a partition and
// hands them to `_consumer` as frozen_mutation objects. A frozen mutation is
// emitted whenever the running size estimate (_dirty_size) reaches
// _fragment_size, and once more at partition end if anything is still
// accounted for. The second argument passed to the consumer is _fragmented,
// which becomes true once a size-triggered flush happened for the current
// partition.
class fragmenting_mutation_freezer {
    const schema& _schema;
    // Key of the partition currently being consumed; set on partition_start.
    std::optional<partition_key> _key;
    tombstone _partition_tombstone;
    // Fragments buffered since the last flush.
    std::optional<static_row> _sr;
    std::deque<clustering_row> _crs;
    range_tombstone_list _rts;
    frozen_mutation_consumer_fn _consumer;
    // True once the current partition has been split by a size-triggered flush.
    bool _fragmented = false;
    // Running size estimate of the buffered data; reset by flush().
    size_t _dirty_size = 0;
    // Threshold on _dirty_size that triggers a flush.
    size_t _fragment_size;
    // Most recently seen range tombstone change; its tombstone (if any) is
    // the one active until the next change arrives.
    range_tombstone_change _current_rtc;

    // Serializes the buffered fragments into a frozen_mutation, clears the
    // buffers and the size estimate, then passes the result to the consumer.
    // The partition tombstone and key are kept, so later fragments of the same
    // partition carry them again.
    future<stop_iteration> flush() {
        bytes_ostream buf;
        ser::writer_of_mutation<bytes_ostream> writer(buf);
        std::move(writer).write_table_id(_schema.id())
                .write_schema_version(_schema.version())
                .write_key(*_key)
                .partition([&] (auto partition_writer) {
                    serialize_mutation_fragments(_schema, _partition_tombstone,
                            std::move(_sr), std::move(_rts),
                            std::move(_crs), std::move(partition_writer));
                }).end_mutation();
        // The buffers were moved from above; bring them back to a known-empty state.
        _sr.reset();
        _rts.clear();
        _crs.clear();
        _dirty_size = 0;
        return _consumer(frozen_mutation(std::move(buf), *_key), _fragmented);
    }

    // Flushes (marking the partition as fragmented) once the size estimate
    // reaches the threshold; otherwise asks the producer to continue.
    future<stop_iteration> maybe_flush() {
        if (_dirty_size < _fragment_size) {
            return make_ready_future<stop_iteration>(stop_iteration::no);
        }
        _fragmented = true;
        return flush();
    }

public:
    fragmenting_mutation_freezer(const schema& s, frozen_mutation_consumer_fn c, size_t fragment_size)
        : _schema(s)
        , _rts(s)
        , _consumer(c)
        , _fragment_size(fragment_size)
        , _current_rtc(position_in_partition::before_all_clustered_rows(), {})
    { }

    // Starts a new partition: remembers its key and tombstone, clears the
    // fragmented flag and accounts for the tombstone in the size estimate.
    future<stop_iteration> consume(partition_start&& ps) {
        _key = std::move(ps.key().key());
        _fragmented = false;
        _dirty_size += sizeof(tombstone);
        _partition_tombstone = ps.partition_tombstone();
        return make_ready_future<stop_iteration>(stop_iteration::no);
    }

    // Buffers the static row and flushes if the threshold is reached.
    future<stop_iteration> consume(static_row&& sr) {
        _sr = std::move(sr);
        _dirty_size += _sr->memory_usage(_schema);
        return maybe_flush();
    }

    // Buffers a clustering row and flushes if the threshold is reached.
    future<stop_iteration> consume(clustering_row&& cr) {
        _dirty_size += cr.memory_usage(_schema);
        _crs.emplace_back(std::move(cr));
        return maybe_flush();
    }

    // Handles a range tombstone change. If the previous change carried a
    // tombstone, the range from the previous position to this one is turned
    // into a range_tombstone and buffered (possibly triggering a flush).
    // In all cases `rtc` becomes the current change afterwards.
    future<stop_iteration> consume(range_tombstone_change&& rtc) {
        if (!_current_rtc.tombstone()) {
            _current_rtc = std::move(rtc);
            return make_ready_future<stop_iteration>(stop_iteration::no);
        }
        range_tombstone closed_range(_current_rtc.position(), rtc.position(), _current_rtc.tombstone());
        _dirty_size += closed_range.memory_usage(_schema);
        _rts.apply(_schema, std::move(closed_range));
        auto flushed = maybe_flush();
        _current_rtc = std::move(rtc);
        return flushed;
    }

    // Ends the partition: flushes whatever is still accounted for. Nothing is
    // emitted when the size estimate is zero (i.e. right after a flush).
    future<stop_iteration> consume(partition_end&&) {
        if (_dirty_size == 0) {
            return make_ready_future<stop_iteration>(stop_iteration::no);
        }
        return flush();
    }
};
// Reads all fragments from `mr` and feeds them to a fragmenting freezer that
// calls `c` with frozen mutations built using `fragment_size` as the flush
// threshold. Reading stops at end of stream or as soon as consuming a
// fragment yields something other than stop_iteration::no.
//
// The reader is always closed before returning. Because co_await is not
// permitted inside a catch handler, a failure is stashed in `ex` and
// rethrown only after mr.close() has completed.
future<> fragment_and_freeze(mutation_reader mr, frozen_mutation_consumer_fn c, size_t fragment_size)
{
    std::exception_ptr ex;
    try {
        fragmenting_mutation_freezer freezer(*mr.schema(), c, fragment_size);
        mutation_fragment_v2_opt mfopt;
        for (;;) {
            mfopt = co_await mr();
            if (!mfopt) {
                break;
            }
            const bool keep_going = (co_await std::move(*mfopt).consume(freezer) == stop_iteration::no);
            if (!keep_going) {
                break;
            }
        }
    } catch (...) {
        ex = std::current_exception();
    }
    co_await mr.close();
    if (ex) {
        std::rethrow_exception(std::move(ex));
    }
}