mirror of
https://github.com/scylladb/scylladb.git
synced 2026-05-22 15:52:13 +00:00
Split the `log_record` to `log_record_header` type that has the record
metadata fields and the mutation as a separate field which is the actual
record data:
struct log_record {
log_record_header header;
canonical_mutation mut;
};
Both the header and mutation have variable serialized size. When a
record is serialized in a write_buffer, we first put a small
`record_header` that has the header size and data size, then the
serialized header and data follow. The `log_location` of a record points
to the beginning of the `record_header`, and the size includes the
`record_header`.
This allows us to read a record header without reading the data when
it's not needed and avoid deserializing it:
* on recovery, when scanning all segments, we read only the record
headers.
* on compaction, we read the record header first to determine if the
record is alive, if yes then we read the data.
Closes scylladb/scylladb#29457
153 lines
4.4 KiB
C++
153 lines
4.4 KiB
C++
/*
|
|
* Copyright (C) 2026-present ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
|
*/
|
|
#pragma once
|
|
|
|
#include <cstdint>
|
|
#include <filesystem>
|
|
#include <seastar/core/shared_future.hh>
|
|
#include <seastar/core/file.hh>
|
|
#include <seastar/core/rwlock.hh>
|
|
#include <seastar/core/fstream.hh>
|
|
#include <seastar/core/gate.hh>
|
|
#include <seastar/core/queue.hh>
|
|
#include <seastar/core/shared_ptr.hh>
|
|
#include <seastar/util/bool_class.hh>
|
|
#include "bytes_fwd.hh"
|
|
#include "mutation_writer/token_group_based_splitting_writer.hh"
|
|
#include "replica/logstor/write_buffer.hh"
|
|
#include "replica/logstor/compaction.hh"
|
|
#include "types.hh"
|
|
#include "utils/updateable_value.hh"
|
|
|
|
namespace replica {
|
|
|
|
class database;
|
|
class table;
|
|
|
|
namespace logstor {
|
|
|
|
using want_data = seastar::bool_class<class want_data_tag>;
|
|
|
|
class compaction_manager;
|
|
class segment_set;
|
|
class primary_index;
|
|
|
|
static constexpr size_t default_segment_size = 128 * 1024;
|
|
static constexpr size_t default_file_size = 32 * 1024 * 1024;
|
|
|
|
/// Configuration for the segment manager
|
|
struct segment_manager_config {
|
|
std::filesystem::path base_dir;
|
|
size_t segment_size = default_segment_size;
|
|
size_t file_size = default_file_size;
|
|
size_t disk_size;
|
|
bool compaction_enabled = true;
|
|
size_t max_segments_per_compaction = 8;
|
|
seastar::scheduling_group compaction_sg;
|
|
utils::updateable_value<float> compaction_static_shares;
|
|
seastar::scheduling_group separator_sg;
|
|
uint32_t separator_delay_limit_ms;
|
|
size_t max_separator_memory = 1 * 1024 * 1024;
|
|
};
|
|
|
|
struct table_segment_histogram_bucket {
|
|
size_t count;
|
|
size_t max_data_size;
|
|
|
|
table_segment_histogram_bucket& operator+=(table_segment_histogram_bucket& other) {
|
|
count += other.count;
|
|
max_data_size = std::max(max_data_size, other.max_data_size);
|
|
return *this;
|
|
}
|
|
};
|
|
|
|
struct table_segment_stats {
|
|
size_t compaction_group_count{0};
|
|
size_t segment_count{0};
|
|
std::vector<table_segment_histogram_bucket> histogram;
|
|
|
|
table_segment_stats& operator+=(table_segment_stats& other) {
|
|
compaction_group_count += other.compaction_group_count;
|
|
segment_count += other.segment_count;
|
|
histogram.resize(std::max(histogram.size(), other.histogram.size()));
|
|
for (size_t i = 0; i < other.histogram.size(); i++) {
|
|
histogram[i] += other.histogram[i];
|
|
}
|
|
return *this;
|
|
}
|
|
};
|
|
|
|
struct segment_snapshot {
|
|
log_segment_id segment_id;
|
|
segment_ref seg_ref;
|
|
noncopyable_function<future<seastar::input_stream<char>>(const file_input_stream_options&)> source;
|
|
};
|
|
|
|
class segment_stream_sink {
|
|
public:
|
|
virtual ~segment_stream_sink() = default;
|
|
virtual log_segment_id segment_id() const noexcept = 0;
|
|
virtual future<output_stream<char>> output() = 0;
|
|
virtual future<> close() = 0;
|
|
virtual future<> abort() = 0;
|
|
};
|
|
|
|
class segment_manager_impl;
|
|
class log_index;
|
|
|
|
class segment_manager {
|
|
std::unique_ptr<segment_manager_impl> _impl;
|
|
private:
|
|
segment_manager_impl& get_impl() noexcept;
|
|
const segment_manager_impl& get_impl() const noexcept;
|
|
public:
|
|
static constexpr size_t block_alignment = 4096;
|
|
|
|
explicit segment_manager(segment_manager_config config);
|
|
~segment_manager();
|
|
|
|
segment_manager(const segment_manager&) = delete;
|
|
segment_manager& operator=(const segment_manager&) = delete;
|
|
|
|
future<> do_recovery(replica::database&);
|
|
|
|
future<> start();
|
|
future<> stop();
|
|
|
|
future<> write(write_buffer& wb);
|
|
|
|
future<log_record> read(log_location location);
|
|
|
|
void free_record(log_location location);
|
|
|
|
compaction_manager& get_compaction_manager() noexcept;
|
|
const compaction_manager& get_compaction_manager() const noexcept;
|
|
|
|
void set_trigger_compaction_hook(std::function<void()> fn);
|
|
void set_trigger_separator_flush_hook(std::function<void(size_t)> fn);
|
|
|
|
size_t get_segment_size() const noexcept;
|
|
|
|
future<> discard_segments(segment_set&);
|
|
|
|
size_t get_memory_usage() const;
|
|
|
|
future<> await_pending_writes();
|
|
|
|
future<utils::chunked_vector<segment_snapshot>> make_snapshot(compaction_group& cg);
|
|
|
|
// Create an output stream to write a segment (for receiving from remote node)
|
|
// Allocates a new local segment and returns an output stream for writing to the segment.
|
|
future<std::unique_ptr<segment_stream_sink>> create_segment_output_stream(replica::database&);
|
|
|
|
friend class segment_manager_impl;
|
|
|
|
};
|
|
|
|
}
|
|
} |