/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* Modified by ScyllaDB
* Copyright (C) 2015 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
// Note: exact header list inferred from usage in this file.
#include <cinttypes>
#include <stdexcept>
#include <string>
#include <exception>
#include <regex>
#include <unordered_map>
#include <unordered_set>
#include <malloc.h>
#include <seastar/core/align.hh>
#include <seastar/core/reactor.hh>
#include <seastar/core/metrics.hh>
#include <seastar/core/future-util.hh>
#include <seastar/core/file.hh>
#include <seastar/core/fstream.hh>
#include <seastar/core/rwlock.hh>
#include <seastar/core/gate.hh>
#include <seastar/core/semaphore.hh>
#include <seastar/core/shared_future.hh>
#include <seastar/core/queue.hh>
#include <seastar/core/sleep.hh>
#include <seastar/net/byteorder.hh>
#include "seastarx.hh"
#include "commitlog.hh"
#include "rp_set.hh"
#include "db/config.hh"
#include "db/extensions.hh"
#include "utils/data_input.hh"
#include "utils/crc.hh"
#include "utils/runtime.hh"
#include "utils/flush_queue.hh"
#include "log.hh"
#include "commitlog_entry.hh"
#include "commitlog_extensions.hh"
#include "service/priority_manager.hh"
#include <boost/range/adaptor/map.hpp>
#include <boost/range/adaptor/reversed.hpp>
#include "checked-file-impl.hh"
#include "disk-error-handler.hh"
static logging::logger clogger("commitlog");
using namespace std::chrono_literals;
class crc32_nbo {
utils::crc32 _c;
public:
template <typename T>
void process(T t) {
_c.process_be(t);
}
uint32_t checksum() const {
return _c.get();
}
void process_bytes(const uint8_t* data, size_t size) {
return _c.process(data, size);
}
void process_bytes(const int8_t* data, size_t size) {
return _c.process(reinterpret_cast<const uint8_t*>(data), size);
}
void process_bytes(const char* data, size_t size) {
return _c.process(reinterpret_cast<const uint8_t*>(data), size);
}
template <typename FragmentedBuffer>
GCC6_CONCEPT(requires FragmentRange<FragmentedBuffer>)
void process_fragmented(const FragmentedBuffer& buffer) {
return _c.process_fragmented(buffer);
}
};
class db::cf_holder {
public:
virtual ~cf_holder() {};
virtual void release_cf_count(const cf_id_type&) = 0;
};
db::commitlog::config db::commitlog::config::from_db_config(const db::config& cfg, size_t shard_available_memory) {
config c;
c.commit_log_location = cfg.commitlog_directory();
c.metrics_category_name = "commitlog";
c.commitlog_total_space_in_mb = cfg.commitlog_total_space_in_mb() >= 0 ? cfg.commitlog_total_space_in_mb() : (shard_available_memory * smp::count) >> 20;
c.commitlog_segment_size_in_mb = cfg.commitlog_segment_size_in_mb();
c.commitlog_sync_period_in_ms = cfg.commitlog_sync_period_in_ms();
c.mode = cfg.commitlog_sync() == "batch" ? sync_mode::BATCH : sync_mode::PERIODIC;
c.extensions = &cfg.extensions();
return c;
}
db::commitlog::descriptor::descriptor(segment_id_type i, const std::string& fname_prefix, uint32_t v)
: id(i), ver(v), filename_prefix(fname_prefix) {
}
db::commitlog::descriptor::descriptor(replay_position p, const std::string& fname_prefix)
: descriptor(p.id, fname_prefix) {
}
db::commitlog::descriptor::descriptor(std::pair<uint64_t, uint32_t> p, const std::string& fname_prefix)
: descriptor(p.first, fname_prefix, p.second) {
}
db::commitlog::descriptor::descriptor(const sstring& filename, const std::string& fname_prefix)
: descriptor([&filename, &fname_prefix]() {
std::smatch m;
// match both legacy and new version of commitlogs Ex: CommitLog-12345.log and CommitLog-4-12345.log.
std::regex rx("(?:.*/)?" + fname_prefix + "((\\d+)(" + SEPARATOR + "\\d+)?)" + FILENAME_EXTENSION);
std::string sfilename = filename;
if (!std::regex_match(sfilename, m, rx)) {
throw std::domain_error("Cannot parse the version of the file: " + filename);
}
if (m[3].length() == 0) {
// CMH. Can most likely ignore this
throw std::domain_error("Commitlog segment is too old to open; upgrade to 1.2.5+ first");
}
segment_id_type id = std::stoull(m[3].str().substr(1));
uint32_t ver = std::stoul(m[2].str());
return std::make_pair(id, ver);
}(), fname_prefix) {
}
sstring db::commitlog::descriptor::filename() const {
return filename_prefix + std::to_string(ver) + SEPARATOR
+ std::to_string(id) + FILENAME_EXTENSION;
}
db::commitlog::descriptor::operator db::replay_position() const {
return replay_position(id);
}
const std::string db::commitlog::descriptor::SEPARATOR("-");
const std::string db::commitlog::descriptor::FILENAME_PREFIX(
"CommitLog" + SEPARATOR);
const std::string db::commitlog::descriptor::FILENAME_EXTENSION(".log");
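// Example (illustrative): descriptor{id=12345, ver=4} renders, via filename()
// above, as "CommitLog-4-12345.log"; parsing that name with the regex in the
// filename constructor captures "4" as m[2] (the version) and "-12345" as
// m[3], whose tail yields id 12345.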
class db::commitlog::segment_manager : public ::enable_shared_from_this<segment_manager> {
public:
config cfg;
std::vector<sstring> _segments_to_replay;
const uint64_t max_size;
const uint64_t max_mutation_size;
// Divide the size-on-disk threshold by #cpus used, since we assume
// we distribute stuff more or less equally across shards.
const uint64_t max_disk_size; // per-shard
bool _shutdown = false;
std::experimental::optional<shared_promise<>> _shutdown_promise = {};
// Allocation must throw timed_out_error by contract.
using timeout_exception_factory = default_timeout_exception_factory;
basic_semaphore<timeout_exception_factory> _flush_semaphore;
seastar::metrics::metric_groups _metrics;
// TODO: verify that we're ok with not-so-great granularity
using clock_type = lowres_clock;
using time_point = clock_type::time_point;
using sseg_ptr = ::shared_ptr<segment>;
using request_controller_type = basic_semaphore<timeout_exception_factory, db::timeout_clock>;
using request_controller_units = semaphore_units<timeout_exception_factory, db::timeout_clock>;
request_controller_type _request_controller;
stdx::optional<shared_future<with_clock<db::timeout_clock>>> _segment_allocating;
std::unordered_map<sstring, descriptor> _files_to_delete;
std::vector<file> _files_to_close;
void account_memory_usage(size_t size) {
_request_controller.consume(size);
}
void notify_memory_written(size_t size) {
_request_controller.signal(size);
}
future<rp_handle>
allocate_when_possible(const cf_id_type& id, shared_ptr<entry_writer> writer, db::timeout_clock::time_point timeout);
struct stats {
uint64_t cycle_count = 0;
uint64_t flush_count = 0;
uint64_t allocation_count = 0;
uint64_t bytes_written = 0;
uint64_t bytes_slack = 0;
uint64_t segments_created = 0;
uint64_t segments_destroyed = 0;
uint64_t pending_flushes = 0;
uint64_t flush_limit_exceeded = 0;
uint64_t total_size = 0;
uint64_t buffer_list_bytes = 0;
uint64_t total_size_on_disk = 0;
uint64_t requests_blocked_memory = 0;
};
stats totals;
size_t pending_allocations() const {
return _request_controller.waiters();
}
future<> begin_flush() {
++totals.pending_flushes;
if (totals.pending_flushes >= cfg.max_active_flushes) {
++totals.flush_limit_exceeded;
clogger.trace("Flush ops overflow: {}. Will block.", totals.pending_flushes);
}
return _flush_semaphore.wait();
}
void end_flush() {
_flush_semaphore.signal();
--totals.pending_flushes;
}
segment_manager(config c);
~segment_manager() {
clogger.trace("Commitlog {} disposed", cfg.commit_log_location);
}
uint64_t next_id() {
return ++_ids;
}
std::exception_ptr sanity_check_size(size_t size) {
if (size > max_mutation_size) {
return make_exception_ptr(std::invalid_argument(
"Mutation of " + std::to_string(size)
+ " bytes is too large for the maxiumum size of "
+ std::to_string(max_mutation_size)));
}
return nullptr;
}
future<> init();
future<sseg_ptr> new_segment();
future<sseg_ptr> active_segment(db::timeout_clock::time_point timeout);
future<sseg_ptr> allocate_segment(bool active);
future<> clear();
future<> sync_all_segments(bool shutdown = false);
future<> shutdown();
void create_counters(const sstring& metrics_category_name);
future<> orphan_all();
void add_file_to_delete(sstring, descriptor);
void add_file_to_close(file);
future<> do_pending_deletes();
future<> delete_segments(std::vector<sstring>);
void discard_unused_segments();
void discard_completed_segments(const cf_id_type&);
void discard_completed_segments(const cf_id_type&, const rp_set&);
void on_timer();
void sync();
void arm(uint32_t extra = 0) {
if (!_shutdown) {
_timer.arm(std::chrono::milliseconds(cfg.commitlog_sync_period_in_ms + extra));
}
}
std::vector<sstring> get_active_names() const;
uint64_t get_num_dirty_segments() const;
uint64_t get_num_active_segments() const;
using buffer_type = fragmented_temporary_buffer;
buffer_type acquire_buffer(size_t s);
future<std::vector<descriptor>> list_descriptors(sstring dir);
flush_handler_id add_flush_handler(flush_handler h) {
auto id = ++_flush_ids;
_flush_handlers[id] = std::move(h);
return id;
}
void remove_flush_handler(flush_handler_id id) {
_flush_handlers.erase(id);
}
void flush_segments(bool = false);
private:
future<> clear_reserve_segments();
size_t max_request_controller_units() const;
segment_id_type _ids = 0;
std::vector _segments;
queue<sseg_ptr> _reserve_segments;
std::unordered_map<flush_handler_id, flush_handler> _flush_handlers;
flush_handler_id _flush_ids = 0;
replay_position _flush_position;
timer<clock_type> _timer;
future<> replenish_reserve();
future<> _reserve_replenisher;
seastar::gate _gate;
uint64_t _new_counter = 0;
};
template<typename T, typename Output>
static void write(Output& out, T value) {
auto v = net::hton(value);
out.write(reinterpret_cast<const char*>(&v), sizeof(v));
}
/*
* A single commit log file on disk. Manages creation of the file and writing mutations to disk,
* as well as tracking the last mutation position of any "dirty" CFs covered by the segment file. Segment
* files are initially allocated to a fixed size and can grow to accommodate a larger value if necessary.
*
* The IO flow is somewhat convoluted and goes something like this:
*
* Mutation path:
* - Adding data to the segment usually writes into the internal buffer
* - On EOB or overflow we issue a write to disk ("cycle").
* - A cycle call will acquire the segment read lock and send the
* buffer to the corresponding position in the file
* - If we are periodic and crossed a timing threshold, or running "batch" mode
* we might be forced to issue a flush ("sync") after adding data
* - A sync call acquires the write lock, thus locking out writes
* and waiting for pending writes to finish. It then checks the
* high data mark, and issues the actual file flush.
* Note that the write lock is released prior to issuing the
* actual file flush, thus we are allowed to write data
* beyond the flush point concurrently with a pending flush.
*
* Sync timer:
* - In periodic mode, we try to primarily issue sync calls in
* a timer task issued every N seconds. The timer does the same
* operation as the above described sync, and resets the timeout
* so that mutation path will not trigger syncs and delay.
*
* Note that we do not care which order segment chunks finish writing
* to disk, other than all below a flush point must finish before flushing.
*
* We currently do not wait for flushes to finish before issuing the next
* cycle call ("after" flush point in the file). This might not be optimal.
*
* To close and finish a segment, we first close the gate object that guards
* writing data to it, then flush it fully (including waiting for futures created
* by the timer to run their course), and finally wait for it to
* become "clean", i.e. get notified that all mutations it holds have been
* persisted to sstables elsewhere. Once this is done, we can delete the
* segment. If a segment (object) is deleted without being fully clean, we
* do not remove the file on disk.
*
*/
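/*
* A minimal sketch of the write path described above, from the caller's side
* (caller-side names are hypothetical; the calls are the ones defined in this
* file):
*
*   db::commitlog& log = ...;            // e.g. from create_commitlog(cfg)
*   future<db::rp_handle> f = log.add_entry(cf_id, writer, timeout);
*   // add_entry() -> segment_manager::allocate_when_possible()
*   //   -> segment_manager::active_segment() -> segment::allocate(),
*   //      which appends to the internal buffer, cycle()s it to disk on
*   //      EOB/overflow, and sync()s per the configured sync_mode.
*   // The returned rp_handle keeps this segment "dirty" for cf_id until it
*   // is released via discard_completed_segments().
*/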
class db::commitlog::segment : public enable_shared_from_this<segment>, public cf_holder {
friend class rp_handle;
::shared_ptr _segment_manager;
descriptor _desc;
file _file;
sstring _file_name;
uint64_t _file_pos = 0;
uint64_t _flush_pos = 0;
bool _closed = false;
using buffer_type = segment_manager::buffer_type;
using sseg_ptr = segment_manager::sseg_ptr;
using clock_type = segment_manager::clock_type;
using time_point = segment_manager::time_point;
buffer_type _buffer;
fragmented_temporary_buffer::ostream _buffer_ostream;
std::unordered_map<cf_id_type, uint64_t> _cf_dirty;
time_point _sync_time;
seastar::gate _gate;
uint64_t _write_waiters = 0;
utils::flush_queue<replay_position, std::less<replay_position>, clock_type> _pending_ops;
uint64_t _num_allocs = 0;
std::unordered_set<table_schema_version> _known_schema_versions;
friend std::ostream& operator<<(std::ostream&, const segment&);
friend class segment_manager;
size_t buffer_position() const {
return _buffer.size_bytes() - _buffer_ostream.size();
}
future<> begin_flush() {
// This is maintaining the semantics of only using the write-lock
// as a gate for flushing, i.e. once we've begun a flush for position X
// we are ok with writes to positions > X
return _segment_manager->begin_flush();
}
void end_flush() {
_segment_manager->end_flush();
}
public:
struct cf_mark {
const segment& s;
};
friend std::ostream& operator<<(std::ostream&, const cf_mark&);
// The commit log entry overhead in bytes (int: length + int: head checksum + int: tail checksum)
static constexpr size_t entry_overhead_size = 3 * sizeof(uint32_t);
static constexpr size_t segment_overhead_size = 2 * sizeof(uint32_t);
static constexpr size_t descriptor_header_size = 5 * sizeof(uint32_t);
static constexpr uint32_t segment_magic = ('S'<<24) |('C'<< 16) | ('L' << 8) | 'C';
// The commit log (chained) sync marker/header size in bytes (int: length + int: checksum [segmentId, position])
static constexpr size_t sync_marker_size = 2 * sizeof(uint32_t);
static constexpr size_t alignment = 4096;
// TODO : tune initial / default size
static constexpr size_t default_size = align_up(128 * 1024, alignment);
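/*
* Resulting on-disk layout, a sketch derived from cycle() below and the
* read_header()/read_chunk()/read_entry() functions in read_log_file():
*
*   file header : magic (u32) | ver (u32) | id (u64) | crc (u32)
*   chunk header: next offset (u32) | crc (u32)   ("sync marker")
*   entry       : size (u32) | head crc (u32) | data | tail crc (u32)
*
* All integers are written in network byte order (see write() and crc32_nbo).
*/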
segment(::shared_ptr m, const descriptor& d, file && f, bool active)
: _segment_manager(std::move(m)), _desc(std::move(d)), _file(std::move(f)),
_file_name(_segment_manager->cfg.commit_log_location + "/" + _desc.filename()), _sync_time(
clock_type::now()), _pending_ops(true) // want exception propagation
{
++_segment_manager->totals.segments_created;
clogger.debug("Created new {} segment {}", active ? "active" : "reserve", *this);
}
~segment() {
if (!_closed) {
_segment_manager->add_file_to_close(std::move(_file));
}
if (is_clean()) {
clogger.debug("Segment {} is no longer active and will submitted for delete now", *this);
++_segment_manager->totals.segments_destroyed;
_segment_manager->totals.total_size_on_disk -= size_on_disk();
_segment_manager->totals.total_size -= (size_on_disk() + _buffer.size_bytes());
_segment_manager->add_file_to_delete(_file_name, _desc);
} else {
clogger.warn("Segment {} is dirty and is left on disk.", *this);
}
}
bool is_schema_version_known(schema_ptr s) {
return _known_schema_versions.count(s->version());
}
void add_schema_version(schema_ptr s) {
_known_schema_versions.emplace(s->version());
}
void forget_schema_versions() {
_known_schema_versions.clear();
}
void release_cf_count(const cf_id_type& cf) override {
mark_clean(cf, 1);
if (can_delete()) {
_segment_manager->discard_unused_segments();
}
}
bool must_sync() {
if (_segment_manager->cfg.mode == sync_mode::BATCH) {
return false;
}
auto now = clock_type::now();
auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(
now - _sync_time).count();
if ((_segment_manager->cfg.commitlog_sync_period_in_ms * 2) < uint64_t(ms)) {
clogger.debug("{} needs sync. {} ms elapsed", *this, ms);
return true;
}
return false;
}
/**
* Finalize this segment and get a new one
*/
future<sseg_ptr> finish_and_get_new(db::timeout_clock::time_point timeout) {
_closed = true;
sync();
return _segment_manager->active_segment(timeout);
}
void reset_sync_time() {
_sync_time = clock_type::now();
}
// See class comment for info
future<sseg_ptr> sync(bool shutdown = false) {
/**
* If we are shutting down, we first
* close the allocation gate, thus no new
* data can be appended. Then we just issue a
* flush, which will wait for any queued ops
* to complete as well. Then we close the ops
* queue, just to be sure.
*/
if (shutdown) {
auto me = shared_from_this();
return _gate.close().then([me] {
me->_closed = true;
return me->sync().finally([me] {
// When we get here, nothing should add ops,
// and we should have waited out all pending.
return me->_pending_ops.close().finally([me] {
return me->_file.truncate(me->_flush_pos).then([me] {
return me->_file.close();
});
});
});
});
}
// Note: this is not a marker for when sync was finished.
// It is when it was initiated
reset_sync_time();
return cycle(true);
}
// See class comment for info
future<sseg_ptr> flush(uint64_t pos = 0) {
auto me = shared_from_this();
assert(me.use_count() > 1);
if (pos == 0) {
pos = _file_pos;
}
clogger.trace("Syncing {} {} -> {}", *this, _flush_pos, pos);
// Only run the flush when all write ops at lower rp:s
// have completed.
replay_position rp(_desc.id, position_type(pos));
// Run like this to ensure flush ordering, and making flushes "waitable"
return _pending_ops.run_with_ordered_post_op(rp, [] { return make_ready_future<>(); }, [this, pos, me, rp] {
assert(_pending_ops.has_operation(rp));
return do_flush(pos);
});
}
future<sseg_ptr> do_flush(uint64_t pos) {
auto me = shared_from_this();
return begin_flush().then([this, pos]() {
if (pos <= _flush_pos) {
clogger.trace("{} already synced! ({} < {})", *this, pos, _flush_pos);
return make_ready_future<>();
}
return _file.flush().then_wrapped([this, pos](future<> f) {
try {
f.get();
// TODO: retry/ignore/fail/stop - optional behaviour in origin.
// we fast-fail the whole commit.
_flush_pos = std::max(pos, _flush_pos);
++_segment_manager->totals.flush_count;
clogger.trace("{} synced to {}", *this, _flush_pos);
} catch (...) {
clogger.error("Failed to flush commits to disk: {}", std::current_exception());
throw;
}
});
}).finally([this] {
end_flush();
}).then([me] {
return make_ready_future<sseg_ptr>(me);
});
}
/**
* Allocate a new buffer
*/
void new_buffer(size_t s) {
assert(_buffer.empty());
auto overhead = segment_overhead_size;
if (_file_pos == 0) {
overhead += descriptor_header_size;
}
auto a = align_up(s + overhead, alignment);
auto k = std::max(a, default_size);
_buffer = _segment_manager->acquire_buffer(k);
_buffer_ostream = _buffer.get_ostream();
auto out = _buffer_ostream.write_substream(overhead);
out.fill('\0', overhead);
_segment_manager->totals.total_size += k;
}
bool buffer_is_empty() const {
return buffer_position() <= segment_overhead_size
|| (_file_pos == 0 && buffer_position() <= (segment_overhead_size + descriptor_header_size));
}
/**
* Send any buffer contents to disk and get a new tmp buffer
*/
// See class comment for info
future<sseg_ptr> cycle(bool flush_after = false) {
if (_buffer.empty()) {
return flush_after ? flush() : make_ready_future<sseg_ptr>(shared_from_this());
}
auto size = clear_buffer_slack();
auto buf = std::exchange(_buffer, { });
auto off = _file_pos;
auto top = off + size;
auto num = _num_allocs;
_file_pos = top;
_buffer_ostream = { };
_num_allocs = 0;
auto me = shared_from_this();
assert(me.use_count() > 1);
auto out = buf.get_ostream();
auto header_size = 0;
if (off == 0) {
// first block. write file header.
write(out, segment_magic);
write(out, _desc.ver);
write(out, _desc.id);
crc32_nbo crc;
crc.process(_desc.ver);
crc.process<int32_t>(_desc.id & 0xffffffff);
crc.process<int32_t>(_desc.id >> 32);
write(out, crc.checksum());
header_size = descriptor_header_size;
}
// write chunk header
crc32_nbo crc;
crc.process<int32_t>(_desc.id & 0xffffffff);
crc.process<int32_t>(_desc.id >> 32);
crc.process(uint32_t(off + header_size));
write(out, uint32_t(_file_pos));
write(out, crc.checksum());
forget_schema_versions();
replay_position rp(_desc.id, position_type(off));
clogger.trace("Writing {} entries, {} k in {} -> {}", num, size, off, off + size);
// The write will be allowed to start now, but flush (below) must wait for not only this,
// but all previous write/flush pairs.
return _pending_ops.run_with_ordered_post_op(rp, [this, size, off, buf = std::move(buf)]() mutable {
auto view = fragmented_temporary_buffer::view(buf);
return do_with(off, view, [&] (uint64_t& off, fragmented_temporary_buffer::view& view) {
if (view.empty()) {
return make_ready_future<>();
}
return repeat([this, size, &off, &view] {
auto&& priority_class = service::get_local_commitlog_priority();
auto current = *view.begin();
return _file.dma_write(off, current.data(), current.size(), priority_class).then_wrapped([this, size, &off, &view](future<size_t>&& f) {
try {
auto bytes = std::get<0>(f.get());
_segment_manager->totals.bytes_written += bytes;
_segment_manager->totals.total_size_on_disk += bytes;
++_segment_manager->totals.cycle_count;
if (bytes == view.size_bytes()) {
return make_ready_future<stop_iteration>(stop_iteration::yes);
}
// gah, partial write. should always get here with dma chunk sized
// "bytes", but lets make sure...
bytes = align_down(bytes, alignment);
off += bytes;
view.remove_prefix(bytes);
clogger.debug("Partial write {}: {}/{} bytes", *this, size - view.size_bytes(), size);
return make_ready_future<stop_iteration>(stop_iteration::no);
// TODO: retry/ignore/fail/stop - optional behaviour in origin.
// we fast-fail the whole commit.
} catch (...) {
clogger.error("Failed to persist commits to disk for {}: {}", *this, std::current_exception());
throw;
}
});
});
}).finally([this, buf = std::move(buf), size] {
_segment_manager->notify_memory_written(size);
});
}, [me, flush_after, top, rp] { // lambda instead of bind, so we keep "me" alive.
assert(me->_pending_ops.has_operation(rp));
return flush_after ? me->do_flush(top) : make_ready_future<sseg_ptr>(me);
});
}
future<sseg_ptr> batch_cycle(timeout_clock::time_point timeout) {
/**
* For batch mode we force a write "immediately".
* However, we first wait for all previous writes/flushes
* to complete.
*
* This has the benefit of allowing several allocations to
* queue up in a single buffer.
*/
auto me = shared_from_this();
auto fp = _file_pos;
return _pending_ops.wait_for_pending(timeout).then([me, fp, timeout] {
if (fp != me->_file_pos) {
// some other request already wrote this buffer.
// If so, wait for the operation at our intended file offset
// to finish, then we know the flush is complete and we
// are in accord.
// (Note: wait_for_pending(pos) waits for operation _at_ pos (and before).)
replay_position rp(me->_desc.id, position_type(fp));
return me->_pending_ops.wait_for_pending(rp, timeout).then([me, fp] {
assert(me->_flush_pos > fp);
return make_ready_future<sseg_ptr>(me);
});
}
// It is ok to leave the sync behind on timeout because there will be at most one
// such sync, all later allocations will block on _pending_ops until it is done.
return with_timeout(timeout, me->sync());
}).handle_exception([me, fp](auto p) {
// If we get an IO exception (which we assume this is)
// we should close the segment.
// TODO: should we also truncate away any partial write
// we did?
me->_closed = true; // just mark segment as closed, no writes will be done.
return make_exception_future<sseg_ptr>(p);
});
}
/**
* Add a "mutation" to the segment.
*/
future<rp_handle> allocate(const cf_id_type& id, shared_ptr<entry_writer> writer, segment_manager::request_controller_units permit, db::timeout_clock::time_point timeout) {
if (must_sync()) {
return with_timeout(timeout, sync()).then([this, id, writer = std::move(writer), permit = std::move(permit), timeout] (auto s) mutable {
return s->allocate(id, std::move(writer), std::move(permit), timeout);
});
}
const auto size = writer->size(*this);
const auto s = size + entry_overhead_size; // total size
auto ep = _segment_manager->sanity_check_size(s);
if (ep) {
return make_exception_future<rp_handle>(std::move(ep));
}
if (!is_still_allocating() || position() + s > _segment_manager->max_size) { // would we make the file too big?
return finish_and_get_new(timeout).then([id, writer = std::move(writer), permit = std::move(permit), timeout] (auto new_seg) mutable {
return new_seg->allocate(id, std::move(writer), std::move(permit), timeout);
});
} else if (!_buffer.empty() && (s > _buffer_ostream.size())) { // enough data?
if (_segment_manager->cfg.mode == sync_mode::BATCH) {
// TODO: this could cause starvation if we're really unlucky.
// If we run batch mode and find ourselves not fit in a non-empty
// buffer, we must force a cycle and wait for it (to keep flush order)
// This will most likely cause parallel writes, and consecutive flushes.
return with_timeout(timeout, cycle(true)).then([this, id, writer = std::move(writer), permit = std::move(permit), timeout] (auto new_seg) mutable {
return new_seg->allocate(id, std::move(writer), std::move(permit), timeout);
});
} else {
cycle().discard_result().handle_exception([] (auto ex) {
clogger.error("Failed to flush commits to disk: {}", ex);
});
}
}
size_t buf_memory = s;
if (_buffer.empty()) {
new_buffer(s);
buf_memory += buffer_position();
}
_gate.enter(); // this might throw. I guess we accept this?
buf_memory -= permit.release();
_segment_manager->account_memory_usage(buf_memory);
replay_position rp(_desc.id, position());
_cf_dirty[id]++; // increase use count for cf.
rp_handle h(static_pointer_cast<cf_holder>(shared_from_this()), std::move(id), rp);
auto out = _buffer_ostream.write_substream(s);
crc32_nbo crc;
write(out, uint32_t(s));
crc.process(uint32_t(s));
write(out, crc.checksum());
// actual data
auto entry_out = out.write_substream(size);
auto entry_data = entry_out.to_input_stream();
writer->write(*this, entry_out);
entry_data.with_stream([&] (auto data_str) {
crc.process_fragmented(ser::buffer_view<typename std::vector<temporary_buffer<char>>::iterator>(data_str));
});
write(out, crc.checksum());
++_segment_manager->totals.allocation_count;
++_num_allocs;
_gate.leave();
if (_segment_manager->cfg.mode == sync_mode::BATCH) {
return batch_cycle(timeout).then([h = std::move(h)](auto s) mutable {
return make_ready_future<rp_handle>(std::move(h));
});
} else {
// If this buffer alone is too big, potentially bigger than the maximum allowed size,
// then no other request will be allowed in to force the cycle()ing of this buffer. We
// have to do it ourselves.
if ((buffer_position() >= (db::commitlog::segment::default_size))) {
cycle().discard_result().handle_exception([] (auto ex) {
clogger.error("Failed to flush commits to disk: {}", ex);
});
}
return make_ready_future<rp_handle>(std::move(h));
}
}
position_type position() const {
return position_type(_file_pos + buffer_position());
}
size_t size_on_disk() const {
return _file_pos;
}
// ensures no more of this segment is writeable, by allocating any unused section at the end and marking it discarded
// a.k.a. zero the tail.
size_t clear_buffer_slack() {
auto buf_pos = buffer_position();
auto size = align_up(buf_pos, alignment);
auto fill_size = size - buf_pos;
_buffer_ostream.fill('\0', fill_size);
_segment_manager->totals.bytes_slack += fill_size;
_segment_manager->account_memory_usage(fill_size);
return size;
}
void mark_clean(const cf_id_type& id, uint64_t count) {
auto i = _cf_dirty.find(id);
if (i != _cf_dirty.end()) {
assert(i->second >= count);
i->second -= count;
if (i->second == 0) {
_cf_dirty.erase(i);
}
}
}
void mark_clean(const cf_id_type& id) {
_cf_dirty.erase(id);
}
void mark_clean() {
_cf_dirty.clear();
}
bool is_still_allocating() const {
return !_closed && position() < _segment_manager->max_size;
}
bool is_clean() const {
return _cf_dirty.empty();
}
bool is_unused() const {
return !is_still_allocating() && is_clean();
}
bool is_flushed() const {
return position() <= _flush_pos;
}
bool can_delete() const {
return is_unused() && is_flushed();
}
bool contains(const replay_position& pos) const {
return pos.id == _desc.id;
}
sstring get_segment_name() const {
return _desc.filename();
}
};
future<db::rp_handle>
db::commitlog::segment_manager::allocate_when_possible(const cf_id_type& id, shared_ptr<entry_writer> writer, db::timeout_clock::time_point timeout) {
auto size = writer->size();
// If this is already too big now, we should throw early. It's also a correctness issue, since
// if we are too big at this moment we'll never reach allocate() to actually throw at that
// point.
auto ep = sanity_check_size(size);
if (ep) {
return make_exception_future<rp_handle>(std::move(ep));
}
auto fut = get_units(_request_controller, size, timeout);
if (_request_controller.waiters()) {
totals.requests_blocked_memory++;
}
return fut.then([this, id, writer = std::move(writer), timeout] (auto permit) mutable {
return this->active_segment(timeout).then([this, timeout, id, writer = std::move(writer), permit = std::move(permit)] (auto s) mutable {
return s->allocate(id, std::move(writer), std::move(permit), timeout);
});
});
}
const size_t db::commitlog::segment::default_size;
db::commitlog::segment_manager::segment_manager(config c)
: cfg([&c] {
config cfg(c);
if (cfg.commit_log_location.empty()) {
cfg.commit_log_location = "/var/lib/scylla/commitlog";
}
if (cfg.max_active_writes == 0) {
cfg.max_active_writes = // TODO: call someone to get an idea...
25 * smp::count;
}
cfg.max_active_writes = std::max(uint64_t(1), cfg.max_active_writes / smp::count);
if (cfg.max_active_flushes == 0) {
cfg.max_active_flushes = // TODO: call someone to get an idea...
5 * smp::count;
}
cfg.max_active_flushes = std::max(uint64_t(1), cfg.max_active_flushes / smp::count);
return cfg;
}())
, max_size(std::min<size_t>(std::numeric_limits<position_type>::max(), std::max<size_t>(cfg.commitlog_segment_size_in_mb, 1) * 1024 * 1024))
, max_mutation_size(max_size >> 1)
, max_disk_size(size_t(std::ceil(cfg.commitlog_total_space_in_mb / double(smp::count))) * 1024 * 1024)
, _flush_semaphore(cfg.max_active_flushes)
// That is enough concurrency to allow for our largest mutation (max_mutation_size), plus
// an existing in-flight buffer. Since we'll force the cycling() of any buffer that is bigger
// than default_size at the end of the allocation, that allows for every valid mutation to
// always be admitted for processing.
, _request_controller(max_request_controller_units())
, _reserve_segments(1)
, _reserve_replenisher(make_ready_future<>())
{
assert(max_size > 0);
clogger.trace("Commitlog {} maximum disk size: {} MB / cpu ({} cpus)",
cfg.commit_log_location, max_disk_size / (1024 * 1024),
smp::count);
if (!cfg.metrics_category_name.empty()) {
create_counters(cfg.metrics_category_name);
}
}
size_t db::commitlog::segment_manager::max_request_controller_units() const {
return max_mutation_size + db::commitlog::segment::default_size;
}
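// Illustrative arithmetic (assuming a 64 MB segment size): max_size is 64 MB,
// max_mutation_size is 32 MB (max_size >> 1), so the controller is sized at
// 32 MB + 128 kB: one largest admissible mutation plus one in-flight
// default-sized buffer.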
future<> db::commitlog::segment_manager::replenish_reserve() {
return do_until([this] { return _shutdown; }, [this] {
return _reserve_segments.not_full().then([this] {
if (_shutdown) {
return make_ready_future<>();
}
return with_gate(_gate, [this] {
return this->allocate_segment(false).then([this](sseg_ptr s) {
auto ret = _reserve_segments.push(std::move(s));
if (!ret) {
clogger.error("Segment reserve is full! Ignoring and trying to continue, but shouldn't happen");
}
return make_ready_future<>();
});
}).handle_exception([](std::exception_ptr ep) {
clogger.warn("Exception in segment reservation: {}", ep);
return sleep(100ms);
});
});
});
}
future<std::vector<db::commitlog::descriptor>>
db::commitlog::segment_manager::list_descriptors(sstring dirname) {
struct helper {
sstring _dirname;
file _file;
sstring _fname_prefix;
subscription<directory_entry> _list;
std::vector<descriptor> _result;
helper(helper&&) = default;
helper(sstring n, sstring fname_prefix, file && f)
: _dirname(std::move(n)), _file(std::move(f)), _fname_prefix(std::move(fname_prefix)), _list(
_file.list_directory(
std::bind(&helper::process, this,
std::placeholders::_1))) {
}
future<> process(directory_entry de) {
auto entry_type = [this](const directory_entry & de) {
if (!de.type && !de.name.empty()) {
return engine().file_type(_dirname + "/" + de.name);
}
return make_ready_future<std::experimental::optional<directory_entry_type>>(de.type);
};
return entry_type(de).then([this, de](std::experimental::optional<directory_entry_type> type) {
if (type == directory_entry_type::regular && de.name[0] != '.' && !is_cassandra_segment(de.name)) {
try {
_result.emplace_back(de.name, _fname_prefix);
} catch (std::domain_error& e) {
clogger.warn(e.what());
}
}
return make_ready_future<>();
});
}
future<> done() {
return _list.done();
}
static bool is_cassandra_segment(sstring name) {
// We want to ignore commitlog segments generated by Cassandra-derived tools (#1112)
auto c = sstring("Cassandra");
if (name.size() < c.size()) {
return false;
}
return name.substr(0, c.size()) == c;
}
};
return open_checked_directory(commit_error_handler, dirname).then([this, dirname](file dir) {
auto h = make_lw_shared<helper>(std::move(dirname), cfg.fname_prefix, std::move(dir));
return h->done().then([h]() {
return make_ready_future<std::vector<descriptor>>(std::move(h->_result));
}).finally([h] {});
});
}
future<> db::commitlog::segment_manager::init() {
return list_descriptors(cfg.commit_log_location).then([this](std::vector<descriptor> descs) {
assert(_reserve_segments.empty()); // _segments_to_replay must not pick them up
segment_id_type id = std::chrono::duration_cast<std::chrono::milliseconds>(runtime::get_boot_time().time_since_epoch()).count() + 1;
for (auto& d : descs) {
id = std::max(id, replay_position(d.id).base_id());
_segments_to_replay.push_back(cfg.commit_log_location + "/" + d.filename());
}
// base id counter is [ <shard> | <base> ]
_ids = replay_position(engine().cpu_id(), id).id;
// always run the timer now, since we need to handle segment pre-alloc etc as well.
_timer.set_callback(std::bind(&segment_manager::on_timer, this));
auto delay = engine().cpu_id() * std::ceil(double(cfg.commitlog_sync_period_in_ms) / smp::count);
clogger.trace("Delaying timer loop {} ms", delay);
// We need to wait until we have scanned all other segments to actually start serving new
// segments. We are ready now
this->_reserve_replenisher = replenish_reserve();
this->arm(delay);
});
}
void db::commitlog::segment_manager::create_counters(const sstring& metrics_category_name) {
namespace sm = seastar::metrics;
_metrics.add_group(metrics_category_name, {
sm::make_gauge("segments", [this] { return _segments.size(); },
sm::description("Holds the current number of segments.")),
sm::make_gauge("allocating_segments", [this] { return std::count_if(_segments.begin(), _segments.end(), [] (const sseg_ptr & s) { return s->is_still_allocating(); }); },
sm::description("Holds the number of not closed segments that still have some free space. "
"This value should not get too high.")),
sm::make_gauge("unused_segments", [this] { return std::count_if(_segments.begin(), _segments.end(), [] (const sseg_ptr & s) { return s->is_unused(); }); },
sm::description("Holds the current number of unused segments. "
"A non-zero value indicates that the disk write path became temporary slow.")),
sm::make_derive("alloc", totals.allocation_count,
sm::description("Counts a number of times a new mutation has been added to a segment. "
"Divide bytes_written by this value to get the average number of bytes per mutation written to the disk.")),
sm::make_derive("cycle", totals.cycle_count,
sm::description("Counts a number of commitlog write cycles - when the data is written from the internal memory buffer to the disk.")),
sm::make_derive("flush", totals.flush_count,
sm::description("Counts a number of times the flush() method was called for a file.")),
sm::make_derive("bytes_written", totals.bytes_written,
sm::description("Counts a number of bytes written to the disk. "
"Divide this value by \"alloc\" to get the average number of bytes per mutation written to the disk.")),
sm::make_derive("slack", totals.bytes_slack,
sm::description("Counts a number of unused bytes written to the disk due to disk segment alignment.")),
sm::make_gauge("pending_flushes", totals.pending_flushes,
sm::description("Holds a number of currently pending flushes. See the related flush_limit_exceeded metric.")),
sm::make_gauge("pending_allocations", [this] { return pending_allocations(); },
sm::description("Holds a number of currently pending allocations. "
"A non-zero value indicates that we have a bottleneck in the disk write flow.")),
sm::make_derive("requests_blocked_memory", totals.requests_blocked_memory,
sm::description("Counts a number of requests blocked due to memory pressure. "
"A non-zero value indicates that the commitlog memory quota is not enough to serve the required amount of requests.")),
sm::make_derive("flush_limit_exceeded", totals.flush_limit_exceeded,
sm::description(
seastar::format("Counts a number of times a flush limit was exceeded. "
"A non-zero value indicates that there are too many pending flush operations (see pending_flushes) and some of "
"them will be blocked till the total amount of pending flush operations drops below {}.", cfg.max_active_flushes))),
sm::make_gauge("disk_total_bytes", totals.total_size,
sm::description("Holds a size of disk space in bytes used for data so far. "
"A too high value indicates that we have some bottleneck in the writing to sstables path.")),
sm::make_gauge("memory_buffer_bytes", totals.buffer_list_bytes,
sm::description("Holds the total number of bytes in internal memory buffers.")),
});
}
void db::commitlog::segment_manager::flush_segments(bool force) {
if (_segments.empty()) {
return;
}
// defensive copy.
auto callbacks = boost::copy_range<std::vector<flush_handler>>(_flush_handlers | boost::adaptors::map_values);
auto& active = _segments.back();
// RP at "start" of segment we leave untouched.
replay_position high(active->_desc.id, 0);
// But if all segments are closed or we force-flush,
// include all.
if (force || !active->is_still_allocating()) {
high = replay_position(high.id + 1, 0);
}
// Now get a set of used CF ids:
std::unordered_set<cf_id_type> ids;
std::for_each(_segments.begin(), _segments.end() - 1, [&ids](sseg_ptr& s) {
for (auto& id : s->_cf_dirty | boost::adaptors::map_keys) {
ids.insert(id);
}
});
clogger.debug("Flushing ({}) to {}", force, high);
// For each CF id: for each callback c: call c(id, high)
for (auto& f : callbacks) {
for (auto& id : ids) {
try {
f(id, high);
} catch (...) {
clogger.error("Exception during flush request {}/{}: {}", id, high, std::current_exception());
}
}
}
}
future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager::allocate_segment(bool active) {
static const auto flags = open_flags::wo | open_flags::create;
descriptor d(next_id(), cfg.fname_prefix);
file_open_options opt;
opt.extent_allocation_size_hint = max_size;
auto filename = cfg.commit_log_location + "/" + d.filename();
auto fut = do_io_check(commit_error_handler, [&] {
auto fut = open_file_dma(filename, flags, opt);
if (cfg.extensions && !cfg.extensions->commitlog_file_extensions().empty()) {
fut = fut.then([this, filename](file f) {
return do_with(std::move(f), [this, filename](file& f) {
auto ext_range = cfg.extensions->commitlog_file_extensions();
return do_for_each(ext_range.begin(), ext_range.end(), [&f, filename](auto& ext) {
// note: we're potentially wrapping more than once. extension mechanism
// is responsible for order being sane.
return ext->wrap_file(filename, f, flags).then([&f](file of) {
if (of) {
f = std::move(of);
}
});
}).then([&f] {
return f;
});
});
});
}
return fut;
});
return fut.then([this, d, active, filename](file f) {
f = make_checked_file(commit_error_handler, f);
// xfs doesn't like files extended beyond eof, so enlarge the file
return f.truncate(max_size).then([this, d, active, f, filename] () mutable {
auto s = make_shared<segment>(this->shared_from_this(), d, std::move(f), active);
return make_ready_future<sseg_ptr>(s);
});
});
}
future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager::new_segment() {
if (_shutdown) {
throw std::runtime_error("Commitlog has been shut down. Cannot add data");
}
++_new_counter;
if (_reserve_segments.empty() && (_reserve_segments.max_size() < cfg.max_reserve_segments)) {
_reserve_segments.set_max_size(_reserve_segments.max_size() + 1);
clogger.debug("Increased segment reserve count to {}", _reserve_segments.max_size());
}
return _reserve_segments.pop_eventually().then([this] (auto s) {
_segments.push_back(std::move(s));
_segments.back()->reset_sync_time();
return make_ready_future<sseg_ptr>(_segments.back());
});
}
future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager::active_segment(db::timeout_clock::time_point timeout) {
// If there is no active segment, try to allocate one using new_segment(). If we time out,
// make sure later invocations can still pick that segment up once it's ready.
return repeat_until_value([this, timeout] () -> future<stdx::optional<sseg_ptr>> {
if (!_segments.empty() && _segments.back()->is_still_allocating()) {
return make_ready_future<stdx::optional<sseg_ptr>>(_segments.back());
}
return [this, timeout] {
if (!_segment_allocating) {
promise<> p;
_segment_allocating.emplace(p.get_future());
auto f = _segment_allocating->get_future(timeout);
with_gate(_gate, [this] {
return new_segment().discard_result().finally([this]() {
_segment_allocating = stdx::nullopt;
});
}).forward_to(std::move(p));
return f;
} else {
return _segment_allocating->get_future(timeout);
}
}().then([] () -> stdx::optional<sseg_ptr> {
return stdx::nullopt;
});
});
}
/**
* go through all segments, clear id up to pos. if segment becomes clean and unused by this,
* it is discarded.
*/
void db::commitlog::segment_manager::discard_completed_segments(const cf_id_type& id, const rp_set& used) {
auto& usage = used.usage();
clogger.debug("Discarding {}: {}", id, usage);
for (auto&s : _segments) {
auto i = usage.find(s->_desc.id);
if (i != usage.end()) {
s->mark_clean(id, i->second);
}
}
discard_unused_segments();
}
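// Illustrative flow (caller-side names hypothetical): the rp_handles returned
// by add()/add_entry() are collected into an rp_set while the corresponding
// data sits in a memtable; once that memtable is persisted elsewhere, calling
//   log.discard_completed_segments(cf_id, rps);
// decrements the per-segment dirty counts, allowing clean, fully flushed
// segments to be deleted.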
void db::commitlog::segment_manager::discard_completed_segments(const cf_id_type& id) {
clogger.debug("Discard all data for {}", id);
for (auto&s : _segments) {
s->mark_clean(id);
}
discard_unused_segments();
}
namespace db {
std::ostream& operator<<(std::ostream& out, const db::commitlog::segment& s) {
return out << s._desc.filename();
}
std::ostream& operator<<(std::ostream& out, const db::commitlog::segment::cf_mark& m) {
return out << (m.s._cf_dirty | boost::adaptors::map_keys);
}
std::ostream& operator<<(std::ostream& out, const db::replay_position& p) {
return out << "{" << p.shard_id() << ", " << p.base_id() << ", " << p.pos << "}";
}
}
void db::commitlog::segment_manager::discard_unused_segments() {
clogger.trace("Checking for unused segments ({} active)", _segments.size());
auto i = std::remove_if(_segments.begin(), _segments.end(), [=](sseg_ptr s) {
if (s->can_delete()) {
clogger.debug("Segment {} is unused", *s);
return true;
}
if (s->is_still_allocating()) {
clogger.debug("Not safe to delete segment {}; still allocating.", s);
} else if (!s->is_clean()) {
clogger.debug("Not safe to delete segment {}; dirty is {}", s, segment::cf_mark {*s});
} else {
clogger.debug("Not safe to delete segment {}; disk ops pending", s);
}
return false;
});
if (i != _segments.end()) {
_segments.erase(i, _segments.end());
}
// launch in background, but guard with gate so this deletion is
// sure to finish in shutdown, because at least through this path,
// segments on deletion queue could be non-empty, and we don't want
// those accidentally left around for replay.
if (!_shutdown) {
with_gate(_gate, [this] {
return do_pending_deletes();
});
}
}
future<> db::commitlog::segment_manager::clear_reserve_segments() {
while (!_reserve_segments.empty()) {
_reserve_segments.pop();
}
return do_pending_deletes();
}
future<> db::commitlog::segment_manager::sync_all_segments(bool shutdown) {
clogger.debug("Issuing sync for all segments ({})", shutdown ? "shutdown" : "active");
return parallel_for_each(_segments, [this, shutdown](sseg_ptr s) {
return s->sync(shutdown).then([](sseg_ptr s) {
clogger.debug("Synced segment {}", *s);
});
});
}
future<> db::commitlog::segment_manager::shutdown() {
if (!_shutdown_promise) {
_shutdown_promise = shared_promise<>();
// Wait for all pending requests to finish. Need to sync first because segments that are
// alive may be holding semaphore permits.
auto block_new_requests = get_units(_request_controller, max_request_controller_units());
return sync_all_segments(false).then([this, block_new_requests = std::move(block_new_requests)] () mutable {
return std::move(block_new_requests).then([this] (auto permits) {
_timer.cancel(); // no more timer calls
_shutdown = true; // no re-arm, no create new segments.
// Now first wait for periodic task to finish, then sync and close all
// segments, flushing out any remaining data.
return _gate.close().then(std::bind(&segment_manager::sync_all_segments, this, true)).finally([permits = std::move(permits)] { });
});
}).finally([this] {
discard_unused_segments();
// Now that the gate is closed and requests completed we are sure nobody else will pop()
return clear_reserve_segments().finally([this] {
return std::move(_reserve_replenisher).then_wrapped([this] (auto f) {
// Could be cleaner with proper seastar support
if (f.failed()) {
_shutdown_promise->set_exception(f.get_exception());
} else {
_shutdown_promise->set_value();
}
});
});
});
}
return _shutdown_promise->get_shared_future();
}
void db::commitlog::segment_manager::add_file_to_delete(sstring filename, descriptor d) {
assert(!_files_to_delete.count(filename));
_files_to_delete.emplace(std::move(filename), std::move(d));
}
void db::commitlog::segment_manager::add_file_to_close(file f) {
_files_to_close.emplace_back(std::move(f));
}
future<> db::commitlog::segment_manager::delete_segments(std::vector<sstring> files) {
auto i = files.begin();
auto e = files.end();
return parallel_for_each(i, e, [this](auto& filename) {
auto f = make_ready_future();
auto exts = cfg.extensions;
if (exts && !exts->commitlog_file_extensions().empty()) {
f = parallel_for_each(exts->commitlog_file_extensions(), [&](auto& ext) {
return ext->before_delete(filename);
});
}
return f.finally([&] {
clogger.debug("Deleting segment file {}", filename);
return commit_io_check(&seastar::remove_file, filename);
}).handle_exception([&filename](auto ep) {
clogger.error("Could not delete segment {}: {}", filename, ep);
});
}).finally([files = std::move(files)] {});
}
future<> db::commitlog::segment_manager::do_pending_deletes() {
auto ftc = std::exchange(_files_to_close, {});
auto i = ftc.begin();
auto e = ftc.end();
return parallel_for_each(i, e, [](file & f) {
return f.close();
}).then([this, ftc = std::move(ftc)] {
return delete_segments(boost::copy_range<std::vector<sstring>>(std::exchange(_files_to_delete, {}) | boost::adaptors::map_keys));
});
}
future<> db::commitlog::segment_manager::orphan_all() {
_segments.clear();
return clear_reserve_segments();
}
/*
* Sync all segments, then clear them out. To ensure all ops are done.
* (Assumes you have barriered adding ops!)
* Only use from tests.
*/
future<> db::commitlog::segment_manager::clear() {
clogger.debug("Clearing commitlog");
return shutdown().then([this] {
clogger.debug("Clearing all segments");
for (auto& s : _segments) {
s->mark_clean();
}
return orphan_all();
});
}
/**
* Called by timer in periodic mode.
*/
void db::commitlog::segment_manager::sync() {
for (auto s : _segments) {
s->sync(); // we do not care about waiting...
}
}
void db::commitlog::segment_manager::on_timer() {
// Gate, because we are starting potentially blocking ops
// without waiting for them, so segment_manager could be shut down
// while they are running.
seastar::with_gate(_gate, [this] {
if (cfg.mode != sync_mode::BATCH) {
sync();
}
// IFF a new segment was put in use since last we checked, and we're
// above threshold, request flush.
if (_new_counter > 0) {
auto max = max_disk_size;
auto cur = totals.total_size_on_disk;
if (max != 0 && cur >= max) {
_new_counter = 0;
clogger.debug("Size on disk {} MB exceeds local maximum {} MB", cur / (1024 * 1024), max / (1024 * 1024));
flush_segments();
}
}
return do_pending_deletes();
});
arm();
}
std::vector<sstring> db::commitlog::segment_manager::get_active_names() const {
std::vector<sstring> res;
for (auto i: _segments) {
if (!i->is_unused()) {
// Each shard is located in its own directory
res.push_back(cfg.commit_log_location + "/" + i->get_segment_name());
}
}
return res;
}
uint64_t db::commitlog::segment_manager::get_num_dirty_segments() const {
return std::count_if(_segments.begin(), _segments.end(), [](sseg_ptr s) {
return !s->is_still_allocating() && !s->is_clean();
});
}
uint64_t db::commitlog::segment_manager::get_num_active_segments() const {
return std::count_if(_segments.begin(), _segments.end(), [](sseg_ptr s) {
return s->is_still_allocating();
});
}
db::commitlog::segment_manager::buffer_type db::commitlog::segment_manager::acquire_buffer(size_t s) {
s = align_up(s, segment::default_size);
auto fragment_count = s / segment::default_size;
std::vector<temporary_buffer<char>> buffers;
buffers.reserve(fragment_count);
while (buffers.size() < fragment_count) {
auto a = ::memalign(segment::alignment, segment::default_size);
if (a == nullptr) {
throw std::bad_alloc();
}
buffers.emplace_back(static_cast<char*>(a), segment::default_size, make_free_deleter(a));
}
clogger.trace("Allocated {} k buffer", s / 1024);
return fragmented_temporary_buffer(std::move(buffers), s);
}
/**
* Add mutation.
*/
future<db::rp_handle> db::commitlog::add(const cf_id_type& id,
size_t size, db::timeout_clock::time_point timeout, serializer_func func) {
class serializer_func_entry_writer final : public entry_writer {
serializer_func _func;
size_t _size;
public:
serializer_func_entry_writer(size_t sz, serializer_func func)
: _func(std::move(func)), _size(sz)
{ }
virtual size_t size(segment&) override { return _size; }
virtual size_t size() override { return _size; }
virtual void write(segment&, output& out) override {
_func(out);
}
};
auto writer = ::make_shared<serializer_func_entry_writer>(size, std::move(func));
return _segment_manager->allocate_when_possible(id, writer, timeout);
}
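// Minimal usage sketch (hypothetical caller; `data` is assumed to be some
// contiguous buffer). The size passed in must match what the serializer
// writes, since the entry is sized and checksummed around it:
//
//   return log.add(cf_id, data.size(), timeout, [&data](db::commitlog::output& out) {
//       out.write(data.data(), data.size());
//   });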
future<db::rp_handle> db::commitlog::add_entry(const cf_id_type& id, const commitlog_entry_writer& cew, timeout_clock::time_point timeout)
{
class cl_entry_writer final : public entry_writer {
commitlog_entry_writer _writer;
public:
cl_entry_writer(const commitlog_entry_writer& wr) : _writer(wr) { }
virtual size_t size(segment& seg) override {
_writer.set_with_schema(!seg.is_schema_version_known(_writer.schema()));
return _writer.size();
}
virtual size_t size() override {
return _writer.mutation_size();
}
virtual void write(segment& seg, output& out) override {
if (_writer.with_schema()) {
seg.add_schema_version(_writer.schema());
}
_writer.write(out);
}
};
auto writer = ::make_shared<cl_entry_writer>(cew);
return _segment_manager->allocate_when_possible(id, writer, timeout);
}
db::commitlog::commitlog(config cfg)
: _segment_manager(::make_shared(std::move(cfg))) {
}
db::commitlog::commitlog(commitlog&& v) noexcept
: _segment_manager(std::move(v._segment_manager)) {
}
db::commitlog::~commitlog()
{}
future<db::commitlog> db::commitlog::create_commitlog(config cfg) {
commitlog c(std::move(cfg));
auto f = c._segment_manager->init();
return f.then([c = std::move(c)]() mutable {
return make_ready_future<commitlog>(std::move(c));
});
}
db::commitlog::flush_handler_anchor::flush_handler_anchor(flush_handler_anchor&& f)
: _cl(f._cl), _id(f._id)
{
f._id = 0;
}
db::commitlog::flush_handler_anchor::flush_handler_anchor(commitlog& cl, flush_handler_id id)
: _cl(cl), _id(id)
{}
db::commitlog::flush_handler_anchor::~flush_handler_anchor() {
unregister();
}
db::commitlog::flush_handler_id db::commitlog::flush_handler_anchor::release() {
flush_handler_id id = 0;
std::swap(_id, id);
return id;
}
void db::commitlog::flush_handler_anchor::unregister() {
auto id = release();
if (id != 0) {
_cl.remove_flush_handler(id);
}
}
db::commitlog::flush_handler_anchor db::commitlog::add_flush_handler(flush_handler h) {
return flush_handler_anchor(*this, _segment_manager->add_flush_handler(std::move(h)));
}
void db::commitlog::remove_flush_handler(flush_handler_id id) {
_segment_manager->remove_flush_handler(id);
}
void db::commitlog::discard_completed_segments(const cf_id_type& id, const rp_set& used) {
_segment_manager->discard_completed_segments(id, used);
}
void db::commitlog::discard_completed_segments(const cf_id_type& id) {
_segment_manager->discard_completed_segments(id);
}
future<> db::commitlog::sync_all_segments() {
return _segment_manager->sync_all_segments();
}
future<> db::commitlog::shutdown() {
return _segment_manager->shutdown();
}
future<> db::commitlog::release() {
return _segment_manager->orphan_all();
}
size_t db::commitlog::max_record_size() const {
return _segment_manager->max_mutation_size - segment::entry_overhead_size;
}
uint64_t db::commitlog::max_active_writes() const {
return _segment_manager->cfg.max_active_writes;
}
uint64_t db::commitlog::max_active_flushes() const {
return _segment_manager->cfg.max_active_flushes;
}
future<> db::commitlog::clear() {
return _segment_manager->clear();
}
const db::commitlog::config& db::commitlog::active_config() const {
return _segment_manager->cfg;
}
// No commit_io_check needed in the log reader since the database will fail
// on error at startup if required
future<std::unique_ptr<subscription<temporary_buffer<char>, db::replay_position>>>
db::commitlog::read_log_file(const sstring& filename, seastar::io_priority_class read_io_prio_class, commit_load_reader_func next, position_type off, const db::extensions* exts) {
struct work {
private:
file_input_stream_options make_file_input_stream_options(seastar::io_priority_class read_io_prio_class) {
file_input_stream_options fo;
fo.buffer_size = db::commitlog::segment::default_size;
fo.read_ahead = 10;
fo.io_priority_class = read_io_prio_class;
return fo;
}
public:
file f;
stream<temporary_buffer<char>, replay_position> s;
input_stream<char> fin;
input_stream<char> r;
uint64_t id = 0;
size_t pos = 0;
size_t next = 0;
size_t start_off = 0;
size_t skip_to = 0;
size_t file_size = 0;
size_t corrupt_size = 0;
bool eof = false;
bool header = true;
bool failed = false;
work(file f, seastar::io_priority_class read_io_prio_class, position_type o = 0)
: f(f), fin(make_file_input_stream(f, 0, make_file_input_stream_options(read_io_prio_class))), start_off(o) {
}
work(work&&) = default;
bool advance(const temporary_buffer<char>& buf) {
pos += buf.size();
if (buf.size() == 0) {
eof = true;
}
return !eof;
}
bool end_of_file() const {
return eof;
}
bool end_of_chunk() const {
return eof || next == pos;
}
future<> skip(size_t bytes) {
skip_to = pos + bytes;
return do_until([this] { return pos == skip_to || eof; }, [this, bytes] {
auto s = std::min<size_t>(4096, skip_to - pos);
// should eof be an error here?
return fin.read_exactly(s).then([this](auto buf) {
this->advance(buf);
});
});
}
future<> stop() {
eof = true;
return make_ready_future<>();
}
future<> fail() {
failed = true;
return stop();
}
future<> read_header() {
return fin.read_exactly(segment::descriptor_header_size).then([this](temporary_buffer<char> buf) {
if (!advance(buf)) {
// zero length file. accept it just to be nice.
return make_ready_future<>();
}
// Will throw if we got eof
data_input in(buf);
auto magic = in.read<uint32_t>();
auto ver = in.read<uint32_t>();
auto id = in.read<uint64_t>();
auto checksum = in.read<uint32_t>();
if (magic == 0 && ver == 0 && id == 0 && checksum == 0) {
// let's assume this was an empty (pre-allocated)
// file. just skip it.
return stop();
}
if (magic != segment::segment_magic) {
throw std::invalid_argument("Not a scylla format commitlog file");
}
crc32_nbo crc;
crc.process(ver);
crc.process<int32_t>(id & 0xffffffff);
crc.process<int32_t>(id >> 32);
auto cs = crc.checksum();
if (cs != checksum) {
throw std::runtime_error("Checksum error in file header");
}
this->id = id;
this->next = 0;
return make_ready_future<>();
});
}
future<> read_chunk() {
return fin.read_exactly(segment::segment_overhead_size).then([this](temporary_buffer<char> buf) {
auto start = pos;
if (!advance(buf)) {
return make_ready_future<>();
}
data_input in(buf);
auto next = in.read<uint32_t>();
auto checksum = in.read<uint32_t>();
if (next == 0 && checksum == 0) {
// in a pre-allocating world, this means eof
return stop();
}
crc32_nbo crc;
crc.process<int32_t>(id & 0xffffffff);
crc.process<int32_t>(id >> 32);
crc.process<uint32_t>(start);
auto cs = crc.checksum();
if (cs != checksum) {
// if a chunk header checksum is broken, we shall just assume that all
// remaining is as well. We cannot trust the "next" pointer, so...
clogger.debug("Checksum error in segment chunk at {}.", pos);
corrupt_size += (file_size - pos);
return stop();
}
this->next = next;
if (start_off >= next) {
return skip(next - pos);
}
return do_until(std::bind(&work::end_of_chunk, this), std::bind(&work::read_entry, this));
});
}
future<> read_entry() {
static constexpr size_t entry_header_size = segment::entry_overhead_size - sizeof(uint32_t);
/**
* #598 - Must check that data left in chunk is enough to even read an entry.
* If not, this is small slack space in the chunk end, and we should just go
* to the next.
*/
assert(pos <= next);
if ((pos + entry_header_size) >= next) {
return skip(next - pos);
}
return fin.read_exactly(entry_header_size).then([this](temporary_buffer<char> buf) {
replay_position rp(id, position_type(pos));
if (!advance(buf)) {
return make_ready_future<>();
}
data_input in(buf);
auto size = in.read<uint32_t>();
auto checksum = in.read<uint32_t>();
crc32_nbo crc;
crc.process(size);
if (size < 3 * sizeof(uint32_t) || checksum != crc.checksum()) {
auto slack = next - pos;
if (size != 0) {
clogger.debug("Segment entry at {} has broken header. Skipping to next chunk ({} bytes)", rp, slack);
corrupt_size += slack;
}
// size == 0 -> special scylla case: zero padding due to dma blocks
return skip(slack);
}
return fin.read_exactly(size - entry_header_size).then([this, size, crc = std::move(crc), rp](temporary_buffer<char> buf) mutable {
advance(buf);
data_input in(buf);
auto data_size = size - segment::entry_overhead_size;
in.skip(data_size);
auto checksum = in.read<uint32_t>();
crc.process_bytes(buf.get(), data_size);
if (crc.checksum() != checksum) {
// If we're getting a checksum error here, most likely the rest of
// the file will be corrupt as well. But it does not hurt to retry.
// Just go to the next entry (since "size" in header seemed ok).
clogger.debug("Segment entry at {} checksum error. Skipping {} bytes", rp, size);
corrupt_size += size;
return make_ready_future<>();
}
return s.produce(buf.share(0, data_size), rp).handle_exception([this](auto ep) {
return this->fail();
});
});
});
}
future<> read_file() {
return f.size().then([this](uint64_t size) {
file_size = size;
}).then([this] {
return read_header().then(
[this] {
return do_until(std::bind(&work::end_of_file, this), std::bind(&work::read_chunk, this));
}).then([this] {
if (corrupt_size > 0) {
throw segment_data_corruption_error("Data corruption", corrupt_size);
}
});
}).finally([this] {
return fin.close();
});
}
};
auto fut = do_io_check(commit_error_handler, [&] {
auto fut = open_file_dma(filename, open_flags::ro);
if (exts && !exts->commitlog_file_extensions().empty()) {
fut = fut.then([filename, exts](file f) {
return do_with(std::move(f), [filename, exts](file& f) {
auto ext_range = exts->commitlog_file_extensions() | boost::adaptors::reversed;
return do_for_each(ext_range.begin(), ext_range.end(), [&f, filename](auto& ext) {
// note: we're potentially wrapping more than once. extension mechanism
// is responsible for order being sane.
return ext->wrap_file(filename, f, open_flags::ro).then([&f](file of) {
if (of) {
f = std::move(of);
}
});
}).then([&f] {
return make_ready_future<file>(f);
});
});
});
}
return fut;
});
return fut.then([off, next, read_io_prio_class] (file f) {
f = make_checked_file(commit_error_handler, std::move(f));
auto w = make_lw_shared<work>(std::move(f), read_io_prio_class, off);
auto ret = w->s.listen(next);
w->s.started().then(std::bind(&work::read_file, w.get())).then([w] {
if (!w->failed) {
w->s.close();
}
}).handle_exception([w](auto ep) {
w->s.set_exception(ep);
});
return std::make_unique<subscription<temporary_buffer<char>, db::replay_position>>(std::move(ret));
});
}
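// Replay usage sketch (hypothetical caller): the returned subscription feeds
// each entry buffer and its replay_position to the supplied
// commit_load_reader_func; a caller typically drains the file by waiting on
// the subscription, e.g.:
//
//   return read_log_file(path, prio, [](temporary_buffer<char> buf, db::replay_position rp) {
//       // apply the serialized mutation in `buf`, logged at `rp`
//       return make_ready_future<>();
//   }).then([](auto sub) {
//       return sub->done().finally([sub = std::move(sub)] {});
//   });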
std::vector<sstring> db::commitlog::get_active_segment_names() const {
return _segment_manager->get_active_names();
}
uint64_t db::commitlog::get_total_size() const {
return _segment_manager->totals.total_size;
}
uint64_t db::commitlog::get_completed_tasks() const {
return _segment_manager->totals.allocation_count;
}
uint64_t db::commitlog::get_flush_count() const {
return _segment_manager->totals.flush_count;
}
uint64_t db::commitlog::get_pending_tasks() const {
return _segment_manager->totals.pending_flushes;
}
uint64_t db::commitlog::get_pending_flushes() const {
return _segment_manager->totals.pending_flushes;
}
uint64_t db::commitlog::get_pending_allocations() const {
return _segment_manager->pending_allocations();
}
uint64_t db::commitlog::get_flush_limit_exceeded_count() const {
return _segment_manager->totals.flush_limit_exceeded;
}
uint64_t db::commitlog::get_num_segments_created() const {
return _segment_manager->totals.segments_created;
}
uint64_t db::commitlog::get_num_segments_destroyed() const {
return _segment_manager->totals.segments_destroyed;
}
uint64_t db::commitlog::get_num_dirty_segments() const {
return _segment_manager->get_num_dirty_segments();
}
uint64_t db::commitlog::get_num_active_segments() const {
return _segment_manager->get_num_active_segments();
}
future<std::vector<db::commitlog::descriptor>> db::commitlog::list_existing_descriptors() const {
return list_existing_descriptors(active_config().commit_log_location);
}
future<std::vector<db::commitlog::descriptor>> db::commitlog::list_existing_descriptors(const sstring& dir) const {
return _segment_manager->list_descriptors(dir);
}
future<std::vector<sstring>> db::commitlog::list_existing_segments() const {
return list_existing_segments(active_config().commit_log_location);
}
future<std::vector<sstring>> db::commitlog::list_existing_segments(const sstring& dir) const {
return list_existing_descriptors(dir).then([dir](auto descs) {
std::vector<sstring> paths;
std::transform(descs.begin(), descs.end(), std::back_inserter(paths), [&](auto& d) {
return dir + "/" + d.filename();
});
return make_ready_future<std::vector<sstring>>(std::move(paths));
});
}
std::vector<sstring> db::commitlog::get_segments_to_replay() const {
return std::move(_segment_manager->_segments_to_replay);
}
future<> db::commitlog::delete_segments(std::vector<sstring> files) const {
return _segment_manager->delete_segments(std::move(files));
}
db::rp_handle::rp_handle() noexcept
{}
db::rp_handle::rp_handle(shared_ptr<cf_holder> h, cf_id_type cf, replay_position rp) noexcept
: _h(std::move(h)), _cf(cf), _rp(rp)
{}
db::rp_handle::rp_handle(rp_handle&& v) noexcept
: _h(std::move(v._h)), _cf(v._cf), _rp(std::exchange(v._rp, {}))
{}
db::rp_handle& db::rp_handle::operator=(rp_handle&& v) noexcept {
if (this != &v) {
this->~rp_handle();
new (this) rp_handle(std::move(v));
}
return *this;
}
db::rp_handle::~rp_handle() {
if (_rp != replay_position() && _h) {
_h->release_cf_count(_cf);
}
}
db::replay_position db::rp_handle::release() {
return std::exchange(_rp, {});
}