Fix field initialization order problem.
In file included from ./sstables/mc/bsearch_clustered_cursor.hh:28,
from sstables/index_reader.hh:32,
from sstables/sstables.cc:49:
./utils/cached_file.hh: In constructor 'cached_file::stream::stream(cached_file&, const seastar::io_priority_class&, tracing::trace_state_ptr, cached_file::page_idx_type, cached_file::offset_type)':
./utils/cached_file.hh:119:34: error: 'cached_file::stream::_trace_state' will be initialized after [-Werror=reorder]
119 | tracing::trace_state_ptr _trace_state;
| ^~~~~~~~~~~~
./utils/cached_file.hh:117:23: error: 'cached_file::page_idx_type cached_file::stream::_page_idx' [-Werror=reorder]
117 | page_idx_type _page_idx;
| ^~~~~~~~~
./utils/cached_file.hh:127:9: error: when initialized here [-Werror=reorder]
127 | stream(cached_file& cf, const io_priority_class& pc, tracing::trace_state_ptr trace_state,
| ^~~~~~
Message-Id: <1592478082-22505-1-git-send-email-tgrabiec@scylladb.com>
279 lines
11 KiB
C++
279 lines
11 KiB
C++
/*
|
|
* Copyright (C) 2019 ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* This file is part of Scylla.
|
|
*
|
|
* Scylla is free software: you can redistribute it and/or modify
|
|
* it under the terms of the GNU Affero General Public License as published by
|
|
* the Free Software Foundation, either version 3 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* Scylla is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#include "reader_permit.hh"
|
|
#include "utils/div_ceil.hh"
|
|
#include "tracing/trace_state.hh"
|
|
|
|
#include <seastar/core/file.hh>
|
|
|
|
#include <map>
|
|
|
|
using namespace seastar;
|
|
|
|
/// \brief A read-through cache of a file.
|
|
///
|
|
/// Caches contents with page granularity (4 KiB).
|
|
/// Cached pages are evicted manually using the invalidate_*() method family, or when the object is destroyed.
|
|
///
|
|
/// Concurrent reading is allowed.
|
|
///
|
|
/// The object is movable but this is only allowed before readers are created.
|
|
///
|
|
/// The cached_file can represent a subset of the file. The reason for this is to satisfy
|
|
/// two requirements. One is that we have a page-aligned caching, where pages are aligned
|
|
/// relative to the start of the underlying file. This matches requirements of the seastar I/O engine
|
|
/// on I/O requests.
|
|
/// Another requirement is to have an effective way to populate the cache using an unaligned buffer
|
|
/// which starts in the middle of the file when we know that we won't need to access bytes located
|
|
/// before the buffer's position. See populate_front(). If we couldn't assume that, we wouldn't be
|
|
/// able to insert an unaligned buffer into the cache.
|
|
///
|
|
class cached_file {
|
|
public:
|
|
// Must be aligned to _file.disk_read_dma_alignment(). 4K is always safe.
|
|
static constexpr size_t page_size = 4096;
|
|
|
|
// The content of the underlying file (_file) is divided into pages
|
|
// of equal size (page_size). This type is used to identify pages.
|
|
// Pages are assigned consecutive identifiers starting from 0.
|
|
using page_idx_type = uint64_t;
|
|
|
|
using offset_type = uint64_t;
|
|
|
|
struct metrics {
|
|
uint64_t page_hits = 0;
|
|
uint64_t page_misses = 0;
|
|
uint64_t page_evictions = 0;
|
|
uint64_t page_populations = 0;
|
|
uint64_t cached_bytes = 0;
|
|
};
|
|
private:
|
|
struct cached_page {
|
|
temporary_buffer<char> buf;
|
|
explicit cached_page(temporary_buffer<char> buf) : buf(std::move(buf)) {}
|
|
};
|
|
|
|
file _file;
|
|
sstring _file_name; // for logging / tracing
|
|
reader_permit _permit;
|
|
metrics& _metrics;
|
|
|
|
using cache_type = std::map<page_idx_type, cached_page>;
|
|
cache_type _cache;
|
|
|
|
const offset_type _start;
|
|
const offset_type _size;
|
|
|
|
offset_type _last_page_size; // Ignores _start in case the start lies on the same page.
|
|
page_idx_type _last_page;
|
|
private:
|
|
future<temporary_buffer<char>> get_page(page_idx_type idx, const io_priority_class& pc,
|
|
tracing::trace_state_ptr trace_state) {
|
|
auto i = _cache.lower_bound(idx);
|
|
if (i != _cache.end() && i->first == idx) {
|
|
++_metrics.page_hits;
|
|
tracing::trace(trace_state, "page cache hit: file={}, page={}", _file_name, idx);
|
|
cached_page& cp = i->second;
|
|
return make_ready_future<temporary_buffer<char>>(cp.buf.share());
|
|
}
|
|
tracing::trace(trace_state, "page cache miss: file={}, page={}", _file_name, idx);
|
|
++_metrics.page_misses;
|
|
auto size = idx == _last_page ? _last_page_size : page_size;
|
|
return _file.dma_read_exactly<char>(idx * page_size, size, pc)
|
|
.then([this, idx] (temporary_buffer<char>&& buf) mutable {
|
|
++_metrics.page_populations;
|
|
_metrics.cached_bytes += buf.size();
|
|
_cache.emplace(idx, cached_page(buf.share()));
|
|
return std::move(buf);
|
|
});
|
|
}
|
|
public:
|
|
// Generator of subsequent pages of data reflecting the contents of the file.
|
|
// Single-user.
|
|
class stream {
|
|
cached_file* _cached_file;
|
|
const io_priority_class* _pc;
|
|
page_idx_type _page_idx;
|
|
offset_type _offset_in_page;
|
|
tracing::trace_state_ptr _trace_state;
|
|
public:
|
|
// Creates an empty stream.
|
|
stream()
|
|
: _cached_file(nullptr)
|
|
, _pc(nullptr)
|
|
{ }
|
|
|
|
stream(cached_file& cf, const io_priority_class& pc, tracing::trace_state_ptr trace_state,
|
|
page_idx_type start_page, offset_type start_offset_in_page)
|
|
: _cached_file(&cf)
|
|
, _pc(&pc)
|
|
, _page_idx(start_page)
|
|
, _offset_in_page(start_offset_in_page)
|
|
, _trace_state(std::move(trace_state))
|
|
{ }
|
|
|
|
// Yields the next chunk of data.
|
|
// Returns empty buffer when end-of-stream is reached.
|
|
// Calls must be serialized.
|
|
// This instance must be kept alive until the returned future resolves.
|
|
future<temporary_buffer<char>> next() {
|
|
if (!_cached_file || _page_idx > _cached_file->_last_page) {
|
|
return make_ready_future<temporary_buffer<char>>(temporary_buffer<char>());
|
|
}
|
|
return _cached_file->get_page(_page_idx, *_pc, _trace_state).then([this] (temporary_buffer<char> page) {
|
|
if (_page_idx == _cached_file->_last_page) {
|
|
page.trim(_cached_file->_last_page_size);
|
|
}
|
|
page.trim_front(_offset_in_page);
|
|
_offset_in_page = 0;
|
|
++_page_idx;
|
|
return page;
|
|
});
|
|
}
|
|
};
|
|
|
|
size_t evict_range(cache_type::iterator start, cache_type::iterator end) noexcept {
|
|
size_t count = 0;
|
|
while (start != end) {
|
|
++count;
|
|
_metrics.cached_bytes -= start->second.buf.size();
|
|
start = _cache.erase(start);
|
|
}
|
|
_metrics.page_evictions += count;
|
|
return count;
|
|
}
|
|
public:
|
|
/// \brief Constructs a cached_file.
|
|
///
|
|
/// The cached area will reflect subset of f from the byte range [start, start + size).
|
|
///
|
|
/// \param m Metrics object which should be updated from operations on this object.
|
|
/// The metrics object can be shared by many cached_file instances, in which case it
|
|
/// will reflect the sum of operations on all cached_file instances.
|
|
cached_file(file f, reader_permit permit, cached_file::metrics& m, offset_type start, offset_type size, sstring file_name = {})
|
|
: _file(std::move(f))
|
|
, _file_name(std::move(file_name))
|
|
, _permit(std::move(permit))
|
|
, _metrics(m)
|
|
, _start(start)
|
|
, _size(size)
|
|
{
|
|
offset_type last_byte_offset = _start + (_size ? (_size - 1) : 0);
|
|
_last_page_size = (last_byte_offset % page_size) + (_size ? 1 : 0);
|
|
_last_page = last_byte_offset / page_size;
|
|
}
|
|
|
|
cached_file(cached_file&&) = default;
|
|
cached_file(const cached_file&) = delete;
|
|
|
|
~cached_file() {
|
|
evict_range(_cache.begin(), _cache.end());
|
|
}
|
|
|
|
/// \brief Populates cache from buf assuming that buf contains the data from the front of the area.
|
|
void populate_front(temporary_buffer<char> buf) {
|
|
// Align to page start. We can do this because the junk before _start won't be accessed.
|
|
auto pad = _start % page_size;
|
|
auto idx = _start / page_size;
|
|
buf = temporary_buffer<char>(buf.get_write() - pad, buf.size() + pad, buf.release());
|
|
|
|
while (buf.size() > page_size) {
|
|
auto page_buf = buf.share();
|
|
page_buf.trim(page_size);
|
|
++_metrics.page_populations;
|
|
_metrics.cached_bytes += page_buf.size();
|
|
_cache.emplace(idx, cached_page(std::move(page_buf)));
|
|
buf.trim_front(page_size);
|
|
++idx;
|
|
}
|
|
|
|
if (buf.size() == page_size || (idx == _last_page && buf.size() >= _last_page_size)) {
|
|
++_metrics.page_populations;
|
|
_metrics.cached_bytes += buf.size();
|
|
_cache.emplace(idx, cached_page(std::move(buf)));
|
|
}
|
|
}
|
|
|
|
/// \brief Invalidates [start, end) or less.
|
|
///
|
|
/// Invariants:
|
|
///
|
|
/// - all bytes outside [start, end) which were cached before the call will still be cached.
|
|
///
|
|
void invalidate_at_most(offset_type start, offset_type end, tracing::trace_state_ptr trace_state = {}) {
|
|
auto lo_page = (_start + start) / page_size
|
|
// If start is 0 then we can drop the containing page
|
|
// even if _start is not aligned to the page start.
|
|
// Otherwise we cannot drop the page.
|
|
+ bool((_start + start) % page_size) * bool(start != 0);
|
|
|
|
auto hi_page = (_start + end) / page_size;
|
|
|
|
if (lo_page < hi_page) {
|
|
auto count = evict_range(_cache.lower_bound(lo_page), _cache.lower_bound(hi_page));
|
|
if (count) {
|
|
tracing::trace(trace_state, "page cache: evicted {} page(s) in [{}, {}), file={}", count,
|
|
lo_page, hi_page, _file_name);
|
|
}
|
|
}
|
|
}
|
|
|
|
/// \brief Equivalent to \ref invalidate_at_most(0, end).
|
|
void invalidate_at_most_front(offset_type end, tracing::trace_state_ptr trace_state = {}) {
|
|
auto hi_page = (_start + end) / page_size;
|
|
auto count = evict_range(_cache.begin(), _cache.lower_bound(hi_page));
|
|
if (count) {
|
|
tracing::trace(trace_state, "page cache: evicted {} page(s) in [0, {}), file={}", count,
|
|
hi_page, _file_name);
|
|
}
|
|
}
|
|
|
|
/// \brief Read from the file
|
|
///
|
|
/// Returns a stream with data which starts at position pos in the area managed by this instance.
|
|
/// This cached_file instance must outlive the returned stream.
|
|
/// The stream does not do any read-ahead.
|
|
///
|
|
/// \param pos The offset of the first byte to read, relative to the cached file area.
|
|
stream read(offset_type pos, const io_priority_class& pc, tracing::trace_state_ptr trace_state = {}) {
|
|
if (pos >= _size) {
|
|
return stream();
|
|
}
|
|
auto global_pos = _start + pos;
|
|
auto offset = global_pos % page_size;
|
|
auto page_idx = global_pos / page_size;
|
|
return stream(*this, pc, std::move(trace_state), page_idx, offset);
|
|
}
|
|
|
|
/// \brief Returns the number of bytes in the area managed by this instance.
|
|
offset_type size() const {
|
|
return _size;
|
|
}
|
|
|
|
/// \brief Returns the number of bytes cached.
|
|
size_t cached_bytes() const {
|
|
return _cache.size() * page_size;
|
|
}
|
|
};
|