/*
 * Copyright (C) 2018 ScyllaDB
 *
 */

/*
 * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
 */

#include <cassert>
#include <optional>
#include <span>
#include <stdexcept>

#include <fmt/format.h>

#include <seastar/core/align.hh>
#include <seastar/core/coroutine.hh>
#include <seastar/core/do_with.hh>
#include <seastar/core/file.hh>
#include <seastar/core/iostream.hh>
#include <seastar/core/loop.hh>
#include <seastar/core/shared_ptr.hh>
#include <seastar/core/temporary_buffer.hh>

#include "symmetric_key.hh"
#include "encryption.hh"
#include "utils/serialization.hh"
#include "encrypted_file_impl.hh"

namespace encryption {

using namespace seastar;

static inline bool is_aligned(size_t n, size_t a) {
    return (n & (a - 1)) == 0;
}

/**
 * Very simple block-encrypting file wrapper.
 *
 * Uses a user-provided symmetric key + ESSIV block IV calculation
 * to encrypt data.
 *
 * The ESSIV block key is created by generating the SHA256 hash
 * of the provided data encryption key bytes, truncated to block_key_len/8,
 * and generating an AES/ECB key using this data.
 *
 * The file is divided into N blocks of `block_size` size.
 * Each block is encrypted (unpadded) with the provided key and
 * block mode, using an IV derived by (essiv):
 *
 *   bytes tmp[] = { 0, ..., little-endian-uint64(block number) };
 *   iv = block_key->encrypt(tmp);
 *
 * All encryption is done unpadded. To handle file sizes we use
 * a slightly shaky scheme:
 *
 * Since all writes are assumed to be done by us, and must be aligned,
 * we can assume in turn that any resizing should be made by our truncate
 * method. If we attempt to truncate to a size that is not a multiple of our
 * _key_ block size (typically 16), we add the key block size to the actual
 * truncation size.
 * On read we then check the file size. If we're reading from a file
 * with unaligned size, we know there are key-block-size junk bytes at the end.
 * We can align down the last decryption call to match the block size, then
 * discard the excessive bytes from the result.
 *
 * If we're in a read/write situation, we need to keep the size updated, and
 * we could possibly race with disk ops/continuations.
 * But we really only target read-only/write-only cases.
 */
struct block_encryption_base {
    ::shared_ptr<symmetric_key> _key;
    ::shared_ptr<symmetric_key> _block_key;
    bytes _hash_salt;

    // This is somewhat large, but we assume this is for bulky stuff like sstables/commitlog,
    // so large alignment should be preferable to recalculating the block IV too often.
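    //
    // With block_size = 4096, the ESSIV IV for the block at byte offset 8192 is derived
    // roughly as follows (a sketch only, not the exact code in iv_for() below):
    //
    //   uint64_t block = 8192 / block_size;         // = 2
    //   bytes tmp(iv_length, 0);                    // zero-filled buffer of IV length
    //   <store 'block' as a little-endian uint64 in the last 8 bytes of tmp>
    //   iv = block_key->encrypt_unpadded(tmp);      // AES/ECB of the encoded block number
    //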
    static constexpr size_t block_size = 4096;
    static constexpr size_t block_key_len = 256;

    using mode = symmetric_key::mode;

    bytes iv_for(uint64_t pos) const;

    static ::shared_ptr<symmetric_key> generate_block_key(::shared_ptr<symmetric_key>);

    block_encryption_base(::shared_ptr<symmetric_key>);
};

class encrypted_file_impl : public seastar::file_impl, block_encryption_base {
    file _file;
    bytes _hash_salt;
    std::optional<uint64_t> _file_length;

    class my_file_handle_impl;
    friend class my_file_handle_impl;

    temporary_buffer<uint8_t> transform(uint64_t, const void* buffer, size_t len, mode);
    size_t transform(uint64_t, const void* buffer, size_t len, void*, mode);

    future<> verify_file_length();
    void maybe_set_length(uint64_t);
    void clear_length();
public:
    encrypted_file_impl(file, ::shared_ptr<symmetric_key>);

    future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, io_intent*) override;
    future<size_t> write_dma(uint64_t pos, std::vector<iovec> iov, io_intent*) override;
    future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, io_intent*) override;
    future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, io_intent*) override;
    future<> flush() override {
        return _file.flush();
    }
    future<struct stat> stat(void) override;
    future<> truncate(uint64_t length) override;
    future<> discard(uint64_t offset, uint64_t length) override {
        return _file.discard(offset, length);
    }
    future<> allocate(uint64_t position, uint64_t length) override {
        return _file.allocate(position, length);
    }
    future<uint64_t> size(void) override;
    future<> close() override {
        return _file.close();
    }
    std::unique_ptr<seastar::file_handle_impl> dup() override;
    subscription<directory_entry> list_directory(std::function<future<> (directory_entry de)> next) override {
        return _file.list_directory(std::move(next));
    }
    future<temporary_buffer<uint8_t>> dma_read_bulk(uint64_t offset, size_t range_size, io_intent*) override;
};

/**
 * Note: ESSIV block IV generation implementation.
 * See: http://securityevaluators.com/knowledge/papers/fde_whitepaper_draft_20170627.pdf
 *
 * We generate a key based on the sha256 of the data key, then encrypt each block number
 * using this to get the per-block IV.
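 *
 * (A note on mode: the derivation uses ECB, which has no IV of its own. That is
 * acceptable here because the only thing ever encrypted with the block key is the
 * short, encoded block number used to produce each per-block IV, not bulk data.)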
 * The key is AES-256, using ECB (non-IV) encryption.
 */
::shared_ptr<symmetric_key> block_encryption_base::generate_block_key(::shared_ptr<symmetric_key> key) {
    auto hash = calculate_sha256(key->key());
    hash.resize(block_key_len / 8);
    return ::make_shared<symmetric_key>(key_info{"AES/ECB", block_key_len}, hash);
}

block_encryption_base::block_encryption_base(::shared_ptr<symmetric_key> key)
    : _key(std::move(key))
    , _block_key(generate_block_key(_key))
{}

encrypted_file_impl::encrypted_file_impl(file f, ::shared_ptr<symmetric_key> key)
    : block_encryption_base(std::move(key))
    , _file(std::move(f))
{
    _memory_dma_alignment = std::max<size_t>(_file.memory_dma_alignment(), block_size);
    _disk_read_dma_alignment = std::max<size_t>(_file.disk_read_dma_alignment(), block_size);
    _disk_write_dma_alignment = std::max<size_t>(_file.disk_write_dma_alignment(), block_size);
}

static future<uint64_t> calculate_file_length(const file& f, size_t key_block_size) {
    return f.size().then([key_block_size](uint64_t s) {
        if (!is_aligned(s, key_block_size)) {
            if (s < key_block_size) {
                throw std::domain_error(fmt::format("file size {}, expected 0 or at least {}", s, key_block_size));
            }
            s -= key_block_size;
        }
        return s;
    });
}

future<> encrypted_file_impl::verify_file_length() {
    if (_file_length) {
        return make_ready_future<>();
    }
    return calculate_file_length(_file, _key->block_size()).then([this](uint64_t s) {
        _file_length = s;
    });
}

void encrypted_file_impl::maybe_set_length(uint64_t s) {
    if (s > _file_length.value_or(0)) {
        _file_length = s;
    }
}

void encrypted_file_impl::clear_length() {
    _file_length = std::nullopt;
}

bytes block_encryption_base::iv_for(uint64_t pos) const {
    assert(!(pos & (block_size - 1)));
    // #658. ECB block mode has no IV. Bad for security,
    // but must handle.
    size_t iv_len = _key->iv_len();
    if (iv_len == 0) {
        return bytes{};
    }
    assert(iv_len >= _key->block_size());
    assert(iv_len >= sizeof(uint64_t));
    bytes b(bytes::initialized_later(), std::max(iv_len, _block_key->block_size()));
    std::fill(b.begin(), b.end() - sizeof(uint64_t), 0);
    // write the block number as a little-endian integer at the tail of the IV buffer
    auto block = pos / block_size;
    write_le<uint64_t>(reinterpret_cast<char*>(b.end()) - sizeof(uint64_t), block);
    // encrypt the encoded block number to build an IV
    _block_key->encrypt_unpadded(b.data(), b.size(), b.data());
    b.resize(iv_len);
    return b;
}

size_t encrypted_file_impl::transform(uint64_t pos, const void* buffer, size_t len, void* dst, mode m) {
    assert(!(pos & (block_size - 1)));
    assert(_file_length || m == mode::encrypt);

    auto o = reinterpret_cast<char*>(dst);
    auto i = reinterpret_cast<const char*>(buffer);
    auto l = _file_length.value_or(std::numeric_limits<uint64_t>::max());
    auto b = _key->block_size();

    size_t off = 0;
    for (; off < len; off += block_size) {
        auto iv = iv_for(pos + off);
        auto rem = std::min(block_size, len - off);
        if (rem < block_size || ((pos + off + rem) > l && m == symmetric_key::mode::decrypt)) {
            // truncated block. should be the last one.
            if (m != symmetric_key::mode::decrypt) {
                throw std::invalid_argument("Output data not aligned");
            }
            _key->transform_unpadded(m, i + off, align_down(rem, b), o + off, iv.data());
            // #22236 - ensure we don't wrap numbers here.
            // If reading past the actual end of file (_file_length), we can be decoding
            // just a few stray bytes here (less than a key block), sitting at the
            // boundary of the last (fake) block of the file.
            // Example:
            // File data size: 4095 bytes
            // Physical file size: 4095 + 16 (assume 16-byte key block)
            // Read 0:4096 -> 4095 bytes
            // If the caller now ignores this and just reads 4096 (or more)
            // bytes at the next block (4096), we read 15 bytes and decode.
            // But that would be past _file_length -> ensure we return zero here.
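            // The std::max() below therefore clamps the result: it yields (l - pos)
            // plaintext bytes in the normal case, and 0 when pos is already at or past
            // _file_length, rather than wrapping an unsigned subtraction.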
            return std::max(l, pos) - pos;
        }
        _key->transform_unpadded(m, i + off, block_size, o + off, iv.data());
    }
    return off;
}

temporary_buffer<uint8_t> encrypted_file_impl::transform(uint64_t pos, const void* buffer, size_t len, mode m) {
    assert(!(len & (block_size - 1)));
    auto tmp = temporary_buffer<uint8_t>::aligned(_file.memory_dma_alignment(), len);
    auto s = transform(pos, buffer, len, tmp.get_write(), m);
    tmp.trim(s);
    return tmp;
}

future<size_t> encrypted_file_impl::write_dma(uint64_t pos, const void* buffer, size_t len, io_intent* intent) {
    assert(!(len & (block_size - 1)));
    auto tmp = transform(pos, buffer, len, mode::encrypt);
    assert(tmp.size() == len); // writing
    auto p = tmp.get();
    return _file.dma_write(pos, p, len, intent).then([this, tmp = std::move(tmp), pos](size_t s) {
        maybe_set_length(pos + s);
        return s;
    });
}

future<size_t> encrypted_file_impl::write_dma(uint64_t pos, std::vector<iovec> iov, io_intent* intent) {
    std::vector<temporary_buffer<uint8_t>> tmp;
    tmp.reserve(iov.size());
    size_t n = 0;
    for (auto& i : iov) {
        assert(!(i.iov_len & (block_size - 1)));
        tmp.emplace_back(transform(pos + n, i.iov_base, i.iov_len, mode::encrypt));
        assert(tmp.back().size() == i.iov_len); // writing
        n += i.iov_len;
        i = iovec{ tmp.back().get_write(), tmp.back().size() };
    }
    return _file.dma_write(pos, std::move(iov), intent).then([this, tmp = std::move(tmp), pos](size_t s) {
        maybe_set_length(pos + s);
        return s;
    });
}

future<size_t> encrypted_file_impl::read_dma(uint64_t pos, void* buffer, size_t len, io_intent* intent) {
    assert(!(len & (block_size - 1)));
    return verify_file_length().then([this, pos, buffer, len, intent] {
        if (pos >= *_file_length) {
            return make_ready_future<size_t>(0);
        }
        return _file.dma_read(pos, buffer, len, intent).then([this, pos, buffer](size_t len) {
            return transform(pos, buffer, len, buffer, mode::decrypt);
        });
    });
}

future<size_t> encrypted_file_impl::read_dma(uint64_t pos, std::vector<iovec> iov, io_intent* intent) {
    return verify_file_length().then([this, pos, iov = std::move(iov), intent]() mutable {
        if (pos >= *_file_length) {
            return make_ready_future<size_t>(0);
        }
        auto f = _file.dma_read(pos, iov, intent);
        return f.then([this, pos, iov = std::move(iov)](size_t len) mutable {
            size_t off = 0;
            for (auto& i : iov) {
                off += transform(pos + off, i.iov_base, i.iov_len, i.iov_base, mode::decrypt);
            }
            return off;
        });
    });
}

future<temporary_buffer<uint8_t>> encrypted_file_impl::dma_read_bulk(uint64_t offset, size_t range_size, io_intent* intent) {
    return verify_file_length().then([this, offset, range_size, intent]() mutable {
        if (offset >= *_file_length) {
            return make_ready_future<temporary_buffer<uint8_t>>();
        }
        auto front = offset & (block_size - 1);
        offset -= front;
        range_size += front;

        // enterprise #925
        // If the caller is clever and asks for the last chunk of the file
        // explicitly (as in offset = N, range_size = size() - N),
        // or any other unaligned size, we need to add enough padding
        // to get the actual full block to decode.
        auto block_size = align_up(range_size, _key->block_size());

        return _file.dma_read_bulk<uint8_t>(offset, block_size, intent).then([this, offset, front, range_size](temporary_buffer<uint8_t> result) {
            auto s = transform(offset, result.get(), result.size(), result.get_write(), mode::decrypt);
            // never give back more than asked for.
            result.trim(std::min(s, range_size));
            // #22236 - ensure we don't overtrim if we get a short read.
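            // We aligned 'offset' down to a block boundary above, so also drop the leading
            // 'front' bytes the caller did not ask for.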
            result.trim_front(std::min(front, result.size()));
            return result;
        });
    });
}

future<> encrypted_file_impl::truncate(uint64_t length) {
    return size().then([this, length](uint64_t s) {
        if (s >= length) {
            auto kb = _key->block_size();
            auto n = length;
            if (!is_aligned(length, kb)) {
                n += kb;
            }
            return _file.truncate(n).then([this, length] {
                _file_length = length;
            });
        }
        // crap. we need to pad zeros. But zeros here means
        // encrypted zeros. So we must do this surprisingly
        // expensively, by actually writing said zeros block
        // by block. Anyone hoping for sparse files is now
        // severely disappointed!
        auto buf_size = align_up(std::min(length, 32 * block_size), block_size);
        auto aligned_size = align_down(s, block_size);

        temporary_buffer<uint8_t> buf(buf_size);
        std::fill(buf.get_write(), buf.get_write() + buf_size, 0);

        struct trunc {
            temporary_buffer<uint8_t> buf;
            uint64_t aligned_size;
            uint64_t size;
            uint64_t length;
        };

        return do_with(trunc{std::move(buf), aligned_size, s, length}, [this](trunc& t) {
            return repeat([this, &t] {
                if (t.aligned_size >= t.length) {
                    return make_ready_future<stop_iteration>(stop_iteration::yes);
                }
                auto n = std::min(t.buf.size(), align_up(size_t(t.length - t.aligned_size), block_size));
                if (t.aligned_size < t.size) {
                    return read_dma(t.aligned_size, t.buf.get_write(), n, nullptr).then([&, n](size_t r) mutable {
                        auto rem = size_t(t.size - t.aligned_size);
                        auto ar = align_up(r, block_size);
                        assert(ar <= t.buf.size());
                        if (rem < ar) {
                            std::fill(t.buf.get_write() + rem, t.buf.get_write() + ar, 0);
                        }
                        return write_dma(t.aligned_size, t.buf.get(), ar, nullptr).then([&, n](size_t w) {
                            t.aligned_size += w;
                            // #1869. On btrfs, the buffer can be clobbered up to "n" (max read amount)
                            // even when "r" (actual bytes read) is less.
                            std::fill(t.buf.get_write(), t.buf.get_write() + n, 0);
                            return make_ready_future<stop_iteration>(stop_iteration::no);
                        });
                    });
                }
                return write_dma(t.aligned_size, t.buf.get(), n, nullptr).then([&](size_t w) {
                    t.aligned_size += w;
                    return make_ready_future<stop_iteration>(stop_iteration::no);
                });
            });
        }).then([this, length] {
            return truncate(length);
        });
    });
}

future<struct stat> encrypted_file_impl::stat() {
    return _file.stat().then([this](struct stat s) {
        return verify_file_length().then([this, s]() mutable {
            s.st_size = *_file_length;
            return s;
        });
    });
}

future<uint64_t> encrypted_file_impl::size() {
    return verify_file_length().then([this] {
        return *_file_length;
    });
}

std::unique_ptr<seastar::file_handle_impl> encrypted_file_impl::dup() {
    class my_file_handle_impl : public seastar::file_handle_impl {
        seastar::file_handle _handle;
        key_info _info;
        bytes _key;
    public:
        my_file_handle_impl(seastar::file_handle h, const key_info& info, const bytes& key)
            : _handle(std::move(h))
            , _info(info)
            , _key(key)
        {}
        std::unique_ptr<seastar::file_handle_impl> clone() const override {
            return std::make_unique<my_file_handle_impl>(_handle, _info, _key);
        }
        seastar::shared_ptr<file_impl> to_file() && override {
            return seastar::make_shared<encrypted_file_impl>(_handle.to_file(), ::make_shared<symmetric_key>(_info, _key));
        }
    };
    return std::make_unique<my_file_handle_impl>(_file.dup(), _key->info(), _key->key());
}

shared_ptr<file_impl> make_encrypted_file(file f, ::shared_ptr<symmetric_key> k) {
    return ::make_shared<encrypted_file_impl>(std::move(f), std::move(k));
}

class indirect_encrypted_file_impl : public file_impl {
    ::shared_ptr<file_impl> _impl;
    file _f;
    size_t _key_block_size;
    get_key_func _get;

    future<> get() {
        if (_impl) {
            return make_ready_future<>();
        }
        return _get().then([this](::shared_ptr<symmetric_key> k) {
            // #978: the key retrieval ("get") could be running more than once concurrently.
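            // (Two overlapping I/O calls can both find _impl unset and each invoke _get().)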
            // Only write _impl once, though.
            if (!_impl) {
                _impl = make_encrypted_file(_f, std::move(k));
            }
        });
    }
public:
    indirect_encrypted_file_impl(file f, size_t key_block_size, get_key_func get)
        : _f(f)
        , _key_block_size(key_block_size)
        , _get(std::move(get))
    {}

    future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, io_intent* intent) override {
        return get().then([this, pos, buffer, len, intent]() {
            return _impl->write_dma(pos, buffer, len, intent);
        });
    }
    future<size_t> write_dma(uint64_t pos, std::vector<iovec> iov, io_intent* intent) override {
        return get().then([this, pos, iov = std::move(iov), intent]() mutable {
            return _impl->write_dma(pos, std::move(iov), intent);
        });
    }
    future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, io_intent* intent) override {
        return get().then([this, pos, buffer, len, intent]() {
            return _impl->read_dma(pos, buffer, len, intent);
        });
    }
    future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, io_intent* intent) override {
        return get().then([this, pos, iov = std::move(iov), intent]() mutable {
            return _impl->read_dma(pos, std::move(iov), intent);
        });
    }
    future<temporary_buffer<uint8_t>> dma_read_bulk(uint64_t offset, size_t range_size, io_intent* intent) override {
        return get().then([this, offset, range_size, intent]() {
            return _impl->dma_read_bulk(offset, range_size, intent);
        });
    }
    future<> flush(void) override {
        if (_impl) {
            return _impl->flush();
        }
        return _f.flush();
    }
    future<struct stat> stat(void) override {
        if (_impl) {
            return _impl->stat();
        }
        return _f.stat().then([this](struct stat s) {
            return calculate_file_length(_f, _key_block_size).then([s](uint64_t fs) mutable {
                s.st_size = fs;
                return s;
            });
        });
    }
    future<> truncate(uint64_t length) override {
        if (_impl) {
            return _impl->truncate(length);
        }
        return _f.truncate(length);
    }
    future<> discard(uint64_t offset, uint64_t length) override {
        if (_impl) {
            return _impl->discard(offset, length);
        }
        return _f.discard(offset, length);
    }
    future<> allocate(uint64_t position, uint64_t length) override {
        if (_impl) {
            return _impl->allocate(position, length);
        }
        return _f.allocate(position, length);
    }
    future<uint64_t> size(void) override {
        if (_impl) {
            return _impl->size();
        }
        return calculate_file_length(_f, _key_block_size);
    }
    future<> close() override {
        if (_impl) {
            return _impl->close();
        }
        return _f.close();
    }
    std::unique_ptr<seastar::file_handle_impl> dup() override {
        if (_impl) {
            return _impl->dup();
        }
        class my_file_handle_impl : public seastar::file_handle_impl {
            seastar::file_handle _handle;
            size_t _key_block_size;
            get_key_func _get;
        public:
            my_file_handle_impl(seastar::file_handle h, size_t key_block_size, get_key_func get)
                : _handle(std::move(h))
                , _key_block_size(key_block_size)
                , _get(std::move(get))
            {}
            std::unique_ptr<seastar::file_handle_impl> clone() const override {
                return std::make_unique<my_file_handle_impl>(_handle, _key_block_size, _get);
            }
            seastar::shared_ptr<file_impl> to_file() && override {
                return make_delayed_encrypted_file(_handle.to_file(), _key_block_size, _get);
            }
        };
        return std::make_unique<my_file_handle_impl>(_f.dup(), _key_block_size, _get);
    }
    subscription<directory_entry> list_directory(std::function<future<> (directory_entry de)> next) override {
        if (_impl) {
            return _impl->list_directory(std::move(next));
        }
        return _f.list_directory(std::move(next));
    }
};

shared_ptr<file_impl> make_delayed_encrypted_file(file f, size_t key_block_size, get_key_func get) {
    return ::make_shared<indirect_encrypted_file_impl>(std::move(f), key_block_size, std::move(get));
}

class encrypted_data_sink : public data_sink_impl, public block_encryption_base {
    data_sink _sink;
    char _buffer[block_size];
    uint64_t _pos = 0;
    size_t _bufpos = 0;

    using mode = block_encryption_base::mode;

    void transform(uint64_t pos, char* data, size_t len) {
        assert(!(pos & (block_size - 1)));
        auto b = _key->block_size();
        size_t off = 0;
        for (; off < len; off += block_size) {
            auto iv = iv_for(_pos + off);
            auto rem = std::min(block_size, len - off);
            _key->transform_unpadded(mode::encrypt, data + off, align_down(rem, b), data + off, iv.data());
        }
    }
    future<> do_send(temporary_buffer<char> buf) {
        _pos += buf.size();
        return _sink.put(std::move(buf));
    }
    future<> send_buffer() {
        return do_send(temporary_buffer<char>(_buffer, std::exchange(_bufpos, 0)));
    }
public:
    encrypted_data_sink(data_sink sink, shared_ptr<symmetric_key> k)
        : block_encryption_base(std::move(k))
        , _sink(std::move(sink))
    {}

    future<> put(std::span<temporary_buffer<char>> bufs) override {
        return data_sink_impl::fallback_put(bufs, [this](temporary_buffer<char>&& buf) {
            return do_put(std::move(buf));
        });
    }
private:
    future<> do_put(temporary_buffer<char> buf) {
        if (_bufpos != 0) {
            auto add = std::min(buf.size(), sizeof(_buffer) - _bufpos);
            std::copy(buf.get(), buf.get() + add, _buffer + _bufpos);
            _bufpos += add;
            buf.trim_front(add);
            if (_bufpos == sizeof(_buffer)) {
                transform(_pos, _buffer, _bufpos);
                co_await send_buffer();
            }
        }
        assert(!(_pos & (block_size - 1)));
        while (buf.size() >= block_size) {
            auto size = align_down(buf.size(), block_size);
            transform(_pos, buf.get_write(), size);
            co_await do_send(buf.share(0, size));
            buf.trim_front(size);
        }
        if (!buf.empty()) {
            assert(_bufpos == 0);
            assert(buf.size() < sizeof(_buffer));
            std::copy(buf.begin(), buf.end(), _buffer);
            _bufpos = buf.size();
        }
    }
public:
    future<> flush() override {
        /*
        if (std::exchange(_flush_pos, _pos) < _pos) {
            return _sink.flush();
        }
        */
        return make_ready_future<>();
    }
    future<> close() override {
        auto kb = _key->block_size();
        size_t append = 0;
        if (!is_aligned(_bufpos, kb)) {
            auto add = std::min(kb, sizeof(_buffer) - _bufpos);
            std::fill(_buffer + _bufpos, _buffer + _bufpos + add, 0);
            _bufpos += add;
            append = kb - add;
        }
        if (_bufpos != 0) {
            transform(_pos, _buffer, _bufpos);
            co_await send_buffer();
        }
        if (append != 0) {
            co_await do_send(temporary_buffer<char>(append));
        }
        co_await _sink.flush();
        co_await _sink.close();
    }
    size_t buffer_size() const noexcept override {
        return align_up(_sink.buffer_size(), block_size);
    }
    bool can_batch_flushes() const noexcept override {
        return _sink.can_batch_flushes();
    }
    void on_batch_flush_error() noexcept override {
        _sink.on_batch_flush_error();
    }
};

std::unique_ptr<data_sink_impl> make_encrypted_sink(data_sink sink, shared_ptr<symmetric_key> k) {
    return std::make_unique<encrypted_data_sink>(std::move(sink), std::move(k));
}

class encrypted_data_source : public data_source_impl, public block_encryption_base {
    input_stream<char> _input;
    temporary_buffer<char> _next;
    size_t _current_position = 0;
    size_t _skip = 0;
public:
    encrypted_data_source(data_source source, shared_ptr<symmetric_key> k)
        : block_encryption_base(std::move(k))
        , _input(std::move(source))
    {}

    future<temporary_buffer<char>> get() override {
        // First, get as much as we can get now (or the remainder of the previous call)
        auto buf1 = _next.empty() ? co_await _input.read() : std::exchange(_next, {});
        // eof?
        if (buf1.empty()) {
            co_return buf1;
        }
        // now we need one page more, to be able to save one for the next lap
        auto fill_size = align_up(buf1.size(), block_size) + block_size - buf1.size();
        // If the underlying stream is already at EOF (e.g. buf1 came from the
        // cached _next while the previous read_exactly drained the source),
        // skip the read_exactly call; it would return empty anyway.
        auto buf2 = _input.eof() ? temporary_buffer<char>() : co_await _input.read_exactly(fill_size);

        temporary_buffer<char> output(buf1.size() + buf2.size());
        // we copy data even for the part we will cache. This is to
        // fix the block alignment of the resulting shared buffer.
        std::copy(buf1.begin(), buf1.end(), output.get_write());
        std::copy(buf2.begin(), buf2.end(), output.get_write() + buf1.size());

        // We need to keep one page buffered (beyond the input stream buffer - would be neat
        // to share it), to be able to detect the actual eof stream size. We always need
        // at least _two_ pages of data to process, to handle the case where the actual
        // data ends just short of a page boundary, so the trailing key-block junk
        // straddles that boundary.
        // I.e. stream size = 8180, encrypted data size will be 8192, and the data stream
        // will be 8196. So we need to make sure buf1 == [4096-8192], and buf2 == [8192-8196].
        if (is_aligned(output.size(), block_size) && output.size() >= 2 * block_size) {
            _next = output.share(output.size() - block_size, block_size);
            output.trim(output.size() - block_size);
        }

        const size_t key_block_size = _key->block_size();

        // decrypt all blocks we have to return. might include the last, partial block
        for (size_t offset = 0; offset < output.size(); offset += block_size, _current_position += block_size) {
            auto iv = iv_for(_current_position);
            auto rem = std::min(block_size, output.size() - offset);
            _key->transform_unpadded(mode::decrypt, output.get() + offset, align_down(rem, key_block_size), output.get_write() + offset, iv.data());
        }
        // now, if the output buffer is not aligned, we are at eof, and
        // also need to trim the result.
        if (!is_aligned(output.size(), key_block_size)) {
            output.trim(output.size() - std::min(output.size(), key_block_size));
        }
        assert(is_aligned(_current_position, block_size));
        // finally, trim the front to handle any skip remainders
        output.trim_front(std::min(std::exchange(_skip, 0), output.size()));
        co_return output;
    }
    future<temporary_buffer<char>> skip(uint64_t n) override {
        if (n >= block_size) {
            // since we only give back data aligned to block_size chunks,
            // a client would only ever skip from a block boundary.
            auto to_skip = align_down(n, block_size);
            assert(is_aligned(_next.size(), block_size));
            co_await _input.skip(to_skip - _next.size());
            n -= to_skip;
            _current_position += to_skip;
            _next = {};
        }
        _skip = n;
        co_return temporary_buffer<char>{};
    }
    future<> close() override {
        return _input.close();
    }
};

std::unique_ptr<data_source_impl> make_encrypted_source(data_source source, shared_ptr<symmetric_key> k) {
    return std::make_unique<encrypted_data_source>(std::move(source), std::move(k));
}

} // namespace encryption