Refs #18161 Yet another approach to dealing with large commitlog submissions. We handle an oversize single mutation by adding yet another entry type: fragmented. In this case we only add a fragment (aha) of the data that needs storing into each entry, along with metadata to correlate and reconstruct the full entry on replay. Because these fragmented entries are spread over N segments, we also need to add references from the first segment in a chain to the subsequent ones. These are released once we clear the relevant cf_id count in the base. * This approach has the downside that, due to how serialization etc. works w.r.t. mutations, we need to create an intermediate buffer to hold the full serialized target entry. This is then incrementally written into entries of < max_mutation_size, successively requesting more segments. On replay, when encountering a fragment chain, the fragment is added to a "state", i.e. a mapping of currently processing fragment chains. Once we've found all fragments and concatenated the buffers into a single fragmented one, we can issue a replay callback as usual. Note that a replay caller will need to create and provide such a state object. The old-signature replay function remains for tests and such. This approach bumps the file format (docs to come). To ensure "atomicity" we both force synchronization and, should the whole op fail, restore segment state (rewinding), thus discarding all the data we wrote. v2: * Improve some bookkeeping; ensure we keep track of segments and flush properly, to get counters correct
142 lines
3.9 KiB
C++
/*
 * Copyright 2016-present ScyllaDB
 */

/*
 * SPDX-License-Identifier: AGPL-3.0-or-later
 */

#pragma once

#include <limits>
#include <optional>
#include <utility>
#include <vector>

#include "utils/assert.hh"

#include "commitlog_types.hh"
#include "mutation/frozen_mutation.hh"
#include "schema/schema_fwd.hh"
#include "replay_position.hh"
|
namespace detail {
|
|
|
|
using buffer_type = fragmented_temporary_buffer;
|
|
using base_iterator = typename std::vector<temporary_buffer<char>>::const_iterator;
|
|
|
|
static constexpr auto sector_overhead_size = sizeof(uint32_t) + sizeof(db::segment_id_type);
|
|
|
|
// iterator adaptor to enable splitting normal
|
|
// frag-buffer temporary buffer objects into
|
|
// sub-disk-page sized chunks.
|
|
class sector_split_iterator {
|
|
base_iterator _iter, _end;
|
|
char* _ptr;
|
|
size_t _size;
|
|
size_t _sector_size;
|
|
public:
|
|
sector_split_iterator(const sector_split_iterator&) noexcept;
|
|
sector_split_iterator(base_iterator i, base_iterator e, size_t sector_size);
|
|
sector_split_iterator(base_iterator i, base_iterator e, size_t sector_size, size_t overhead);
|
|
sector_split_iterator();
|
|
|
|
char* get_write() const {
|
|
return _ptr;
|
|
}
|
|
size_t size() const {
|
|
return _size;
|
|
}
|
|
char* begin() {
|
|
return _ptr;
|
|
}
|
|
char* end() {
|
|
return _ptr + _size;
|
|
}
|
|
const char* begin() const {
|
|
return _ptr;
|
|
}
|
|
const char* end() const {
|
|
return _ptr + _size;
|
|
}
|
|
|
|
bool operator==(const sector_split_iterator& rhs) const {
|
|
return _iter == rhs._iter && _ptr == rhs._ptr;
|
|
}
|
|
|
|
auto& operator*() const {
|
|
return *this;
|
|
}
|
|
auto* operator->() const {
|
|
return this;
|
|
}
|
|
|
|
sector_split_iterator& operator++();
|
|
sector_split_iterator operator++(int);
|
|
};
|
|
}
|
|
|
|
class commitlog_entry {
|
|
std::optional<column_mapping> _mapping;
|
|
frozen_mutation _mutation;
|
|
public:
|
|
commitlog_entry(std::optional<column_mapping> mapping, frozen_mutation&& mutation)
|
|
: _mapping(std::move(mapping)), _mutation(std::move(mutation)) { }
|
|
const std::optional<column_mapping>& mapping() const { return _mapping; }
|
|
const frozen_mutation& mutation() const & { return _mutation; }
|
|
frozen_mutation&& mutation() && { return std::move(_mutation); }
|
|
};
|
|
|
|
class commitlog_entry_writer {
|
|
public:
|
|
using force_sync = db::commitlog_force_sync;
|
|
private:
|
|
schema_ptr _schema;
|
|
const frozen_mutation& _mutation;
|
|
bool _with_schema = true;
|
|
size_t _size = std::numeric_limits<size_t>::max();
|
|
force_sync _sync;
|
|
private:
|
|
template<typename Output>
|
|
void serialize(Output&) const;
|
|
void compute_size();
|
|
public:
|
|
commitlog_entry_writer(schema_ptr s, const frozen_mutation& fm, force_sync sync)
|
|
: _schema(std::move(s)), _mutation(fm), _sync(sync)
|
|
{}
|
|
|
|
void set_with_schema(bool value) {
|
|
if (std::exchange(_with_schema, value) != value || _size == std::numeric_limits<size_t>::max()) {
|
|
compute_size();
|
|
}
|
|
}
|
|
bool with_schema() const {
|
|
return _with_schema;
|
|
}
|
|
schema_ptr schema() const {
|
|
return _schema;
|
|
}
|
|
|
|
size_t size() const {
|
|
SCYLLA_ASSERT(_size != std::numeric_limits<size_t>::max());
|
|
return _size;
|
|
}
|
|
|
|
size_t mutation_size() const {
|
|
return _mutation.representation().size();
|
|
}
|
|
force_sync sync() const {
|
|
return _sync;
|
|
}
|
|
|
|
using ostream = typename seastar::memory_output_stream<detail::sector_split_iterator>;
|
|
|
|
void write(ostream& out) const;
|
|
};
|
|
|
|
class commitlog_entry_reader {
|
|
commitlog_entry _ce;
|
|
public:
|
|
commitlog_entry_reader(const fragmented_temporary_buffer& buffer);
|
|
|
|
const std::optional<column_mapping>& get_column_mapping() const { return _ce.mapping(); }
|
|
const frozen_mutation& mutation() const & { return _ce.mutation(); }
|
|
frozen_mutation&& mutation() && { return std::move(_ce).mutation(); }
|
|
};
|