From ed043e57622231a82217ab6a132d7fc88c1fa968 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 30 Nov 2023 14:20:45 +0300 Subject: [PATCH 01/10] sstable_directory: Split delete_with_pending_deletion_log() The helper consists of three parts -- prepare the deletion log, unlink sstables and drop the deletion log. For testing the first part is needed as a separate step, so here's this split. It renders two nested async contexts, but it will change soon. Signed-off-by: Pavel Emelyanov --- sstables/sstable_directory.cc | 15 +++++++++++---- sstables/sstable_directory.hh | 4 ++++ 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/sstables/sstable_directory.cc b/sstables/sstable_directory.cc index 5b0d1d9340..ea831b44a7 100644 --- a/sstables/sstable_directory.cc +++ b/sstables/sstable_directory.cc @@ -545,8 +545,8 @@ bool sstable_directory::compare_sstable_storage_prefix(const sstring& prefix_a, return size_a == size_b && sstring::traits_type::compare(prefix_a.begin(), prefix_b.begin(), size_a) == 0; } -future<> sstable_directory::delete_with_pending_deletion_log(std::vector ssts) { - return seastar::async([ssts = std::move(ssts)] { +future> sstable_directory::create_pending_deletion_log(const std::vector& ssts) { + return seastar::async([&ssts] { shared_sstable first = nullptr; min_max_tracker gen_tracker; @@ -601,11 +601,18 @@ future<> sstable_directory::delete_with_pending_deletion_log(std::vector(std::move(pending_delete_log), first->_storage->prefix()); + }); +} + +future<> sstable_directory::delete_with_pending_deletion_log(std::vector ssts) { + return seastar::async([ssts = std::move(ssts)] { + auto [ pending_delete_log, sst_directory] = create_pending_deletion_log(ssts).get0(); + parallel_for_each(ssts, [] (shared_sstable sst) { return sst->unlink(sstables::storage::sync_dir::no); }).get(); - - sync_directory(first->_storage->prefix()).get(); + sync_directory(sst_directory).get(); // Once all sstables are deleted, the log file can be removed. // Note: the log file will be removed also if unlink failed to remove diff --git a/sstables/sstable_directory.hh b/sstables/sstable_directory.hh index 6613c18683..18b6d1df42 100644 --- a/sstables/sstable_directory.hh +++ b/sstables/sstable_directory.hh @@ -277,6 +277,10 @@ public: // // This function only solves the second problem for now. static future<> delete_with_pending_deletion_log(std::vector ssts); + // Creates the deletion log for atomic deletion of sstables (helper for the + // above function that's also used by tests) + // Returns a pair of "logilfe name" and "directory with sstables" + static future> create_pending_deletion_log(const std::vector& ssts); static bool compare_sstable_storage_prefix(const sstring& a, const sstring& b) noexcept; }; From 92f0aa04d0a0730b783c411ea357bfce2de82027 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 30 Nov 2023 17:39:00 +0300 Subject: [PATCH 02/10] test: Sstable on_delete() is not necessarily in a thread One of the test cases injects an observer into sstable->unlink() method via its _on_delete() callback. The test's callback assumes that it runs in an async context, but it's a happy coincidence, because deletion via the deletion log runs so. Next patch is changing it and the test case will no longer work. But since it's a test case it can just directly call a libc function for its needs Signed-off-by: Pavel Emelyanov --- test/boost/sstable_compaction_test.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/boost/sstable_compaction_test.cc b/test/boost/sstable_compaction_test.cc index 49d503d228..b68ce4e69a 100644 --- a/test/boost/sstable_compaction_test.cc +++ b/test/boost/sstable_compaction_test.cc @@ -5200,7 +5200,8 @@ SEASTAR_TEST_CASE(cleanup_during_offstrategy_incremental_compaction_test) { sstables_closed++; })); observers.push_back(sst->add_on_delete_handler([&] (sstable& sst) mutable { - auto missing = !file_exists(sst.get_filename()).get(); + // ATTN -- the _on_delete callback is not necessarily running in thread + auto missing = (::access(sst.get_filename().c_str(), F_OK) != 0); testlog.info("Deleting sstable of generation {}: missing={}", sst.generation(), missing); sstables_missing_on_delete += missing; })); From 28b1289d4be831e869c08304f9d1504e0d05004d Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 30 Nov 2023 14:21:22 +0300 Subject: [PATCH 03/10] sstable_directory: Coroutinize delete_with_pending_deletion_log() Signed-off-by: Pavel Emelyanov --- sstables/sstable_directory.cc | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/sstables/sstable_directory.cc b/sstables/sstable_directory.cc index ea831b44a7..2532888ec6 100644 --- a/sstables/sstable_directory.cc +++ b/sstables/sstable_directory.cc @@ -606,24 +606,22 @@ future> sstable_directory::create_pending_deletion_l } future<> sstable_directory::delete_with_pending_deletion_log(std::vector ssts) { - return seastar::async([ssts = std::move(ssts)] { - auto [ pending_delete_log, sst_directory] = create_pending_deletion_log(ssts).get0(); + auto [ pending_delete_log, sst_directory] = co_await create_pending_deletion_log(ssts); - parallel_for_each(ssts, [] (shared_sstable sst) { + co_await coroutine::parallel_for_each(ssts, [] (shared_sstable sst) { return sst->unlink(sstables::storage::sync_dir::no); - }).get(); - sync_directory(sst_directory).get(); + }); + co_await sync_directory(sst_directory); // Once all sstables are deleted, the log file can be removed. // Note: the log file will be removed also if unlink failed to remove // any sstable and ignored the error. try { - remove_file(pending_delete_log).get(); + co_await remove_file(pending_delete_log); sstlog.debug("{} removed.", pending_delete_log); } catch (...) { sstlog.warn("Error removing {}: {}. Ignoring.", pending_delete_log, std::current_exception()); } - }); } // FIXME: Go through maybe_delete_large_partitions_entry on recovery since From bb167dcca55f0cf4f18133d788ed585a4bc30e13 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 30 Nov 2023 14:23:51 +0300 Subject: [PATCH 04/10] sstable_directory: Fix indentation after previous patch Signed-off-by: Pavel Emelyanov --- sstables/sstable_directory.cc | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/sstables/sstable_directory.cc b/sstables/sstable_directory.cc index 2532888ec6..6d9a6bffc0 100644 --- a/sstables/sstable_directory.cc +++ b/sstables/sstable_directory.cc @@ -606,22 +606,22 @@ future> sstable_directory::create_pending_deletion_l } future<> sstable_directory::delete_with_pending_deletion_log(std::vector ssts) { - auto [ pending_delete_log, sst_directory] = co_await create_pending_deletion_log(ssts); + auto [ pending_delete_log, sst_directory] = co_await create_pending_deletion_log(ssts); - co_await coroutine::parallel_for_each(ssts, [] (shared_sstable sst) { - return sst->unlink(sstables::storage::sync_dir::no); - }); - co_await sync_directory(sst_directory); + co_await coroutine::parallel_for_each(ssts, [] (shared_sstable sst) { + return sst->unlink(sstables::storage::sync_dir::no); + }); + co_await sync_directory(sst_directory); - // Once all sstables are deleted, the log file can be removed. - // Note: the log file will be removed also if unlink failed to remove - // any sstable and ignored the error. - try { - co_await remove_file(pending_delete_log); - sstlog.debug("{} removed.", pending_delete_log); - } catch (...) { - sstlog.warn("Error removing {}: {}. Ignoring.", pending_delete_log, std::current_exception()); - } + // Once all sstables are deleted, the log file can be removed. + // Note: the log file will be removed also if unlink failed to remove + // any sstable and ignored the error. + try { + co_await remove_file(pending_delete_log); + sstlog.debug("{} removed.", pending_delete_log); + } catch (...) { + sstlog.warn("Error removing {}: {}. Ignoring.", pending_delete_log, std::current_exception()); + } } // FIXME: Go through maybe_delete_large_partitions_entry on recovery since From fcf080b63bf16ca51a610d720954de43562b9249 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 30 Nov 2023 14:24:00 +0300 Subject: [PATCH 05/10] sstable_directory: Close dir on exception When committing the deletion log creation its containing directory is sync-ed via opened file. This place is not exception safe and directory can be left unclosed Signed-off-by: Pavel Emelyanov --- sstables/sstable_directory.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sstables/sstable_directory.cc b/sstables/sstable_directory.cc index 6d9a6bffc0..f9dce37e30 100644 --- a/sstables/sstable_directory.cc +++ b/sstables/sstable_directory.cc @@ -590,12 +590,13 @@ future> sstable_directory::create_pending_deletion_l close_out.close_now(); auto dir_f = open_directory(pending_delete_dir).get0(); + auto close_dir = deferred_close(dir_f); // Once flushed and closed, the temporary log file can be renamed. rename_file(tmp_pending_delete_log, pending_delete_log).get(); // Guarantee that the changes above reached the disk. dir_f.flush().get(); - dir_f.close().get(); + close_dir.close_now(); sstlog.debug("{} written successfully.", pending_delete_log); } catch (...) { sstlog.warn("Error while writing {}: {}. Ignoring.", pending_delete_log, std::current_exception()); From b10ca96e07cafccfdfd970b3e1e0273c01738d3e Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 30 Nov 2023 14:01:30 +0300 Subject: [PATCH 06/10] test: Add test case to validate deletion log work The test sequence is - create several sstables - create deletion log for a sub-set of them - partially unlink smaller sub-sub-set - make sstable directory do the processing with g.c. - check that the sstables loaded do NOT include the deleted ones The .throw_on_missing_toc bit set additionally validates that the directory doesn't contain garbage not attached to any other TOCs Signed-off-by: Pavel Emelyanov --- test/boost/sstable_directory_test.cc | 50 ++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/test/boost/sstable_directory_test.cc b/test/boost/sstable_directory_test.cc index 1ebdaebb3c..d043301577 100644 --- a/test/boost/sstable_directory_test.cc +++ b/test/boost/sstable_directory_test.cc @@ -778,3 +778,53 @@ SEASTAR_THREAD_TEST_CASE(test_system_datadir_layout) { BOOST_REQUIRE(!file_exists(tbl_dirname.native()).get()); }, cfg).get(); } + +SEASTAR_TEST_CASE(test_pending_log_garbage_collection) { + return sstables::test_env::do_with_sharded_async([] (auto& env) { + std::vector ssts_to_keep; + for (int i = 0; i < 2; i++) { + ssts_to_keep.emplace_back(make_sstable_for_this_shard(std::bind(new_env_sstable, std::ref(env.local())))); + } + std::vector ssts_to_remove; + for (int i = 0; i < 3; i++) { + ssts_to_remove.emplace_back(make_sstable_for_this_shard(std::bind(new_env_sstable, std::ref(env.local())))); + } + + // Now start atomic deletion -- create the pending deletion log for all + // three sstables, move TOC file for one of them into temporary-TOC, and + // partially delete another + sstable_directory::create_pending_deletion_log(ssts_to_remove).get0(); + rename_file(test(ssts_to_remove[1]).filename(sstables::component_type::TOC).native(), test(ssts_to_remove[1]).filename(sstables::component_type::TemporaryTOC).native()).get(); + rename_file(test(ssts_to_remove[2]).filename(sstables::component_type::TOC).native(), test(ssts_to_remove[2]).filename(sstables::component_type::TemporaryTOC).native()).get(); + remove_file(test(ssts_to_remove[2]).filename(sstables::component_type::Data).native()).get(); + + with_sstable_directory(env, [&] (sharded& sstdir) { + auto expect_ok = distributed_loader_for_tests::process_sstable_dir(sstdir, { .throw_on_missing_toc = true, .garbage_collect = true }); + BOOST_REQUIRE_NO_THROW(expect_ok.get()); + + auto collected = sstdir.map_reduce0( + [] (auto& sstdir) { + return do_with(std::set(), [&sstdir] (auto& gens) { + return sstdir.do_for_each_sstable([&] (const shared_sstable& sst) { + gens.emplace(sst->generation()); + return make_ready_future<>(); + }).then([&gens] () mutable -> future> { + return make_ready_future>(std::move(gens));; + }); + }); + }, std::set(), + [] (auto&& res, auto&& gens) { + res.merge(gens); + return std::move(res); + } + ).get(); + + std::set expected; + for (auto& sst : ssts_to_keep) { + expected.insert(sst->generation()); + } + + BOOST_REQUIRE_EQUAL(expected, collected); + }); + }); +} From 5ff59465206d201cc6f2157745c84042d21e9c25 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 30 Nov 2023 16:36:32 +0300 Subject: [PATCH 07/10] sstables: Split remove_by_toc_name() The helper consists of three phases: - move TOC -> TOC.tmp - remove components listed in TOC - remove TOC.tmp The first step is needed separately by the next patch Signed-off-by: Pavel Emelyanov --- sstables/sstables.cc | 15 +++++++++++++-- sstables/sstables.hh | 5 +++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/sstables/sstables.cc b/sstables/sstables.cc index deac9e4c80..be0cd53465 100644 --- a/sstables/sstables.cc +++ b/sstables/sstables.cc @@ -2601,7 +2601,7 @@ std::optional> sstable::get_sample_indexes_for_ran return std::nullopt; } -future<> remove_by_toc_name(sstring sstable_toc_name, storage::sync_dir sync) { +future make_toc_temporary(sstring sstable_toc_name, storage::sync_dir sync) { sstring prefix = sstable_toc_name.substr(0, sstable_toc_name.size() - sstable_version_constants::TOC_SUFFIX.size()); sstring new_toc_name = prefix + sstable_version_constants::TEMPORARY_TOC_SUFFIX; @@ -2615,9 +2615,20 @@ future<> remove_by_toc_name(sstring sstable_toc_name, storage::sync_dir sync) { } else { if (!co_await sstable_io_check(sstable_write_error_handler, file_exists, new_toc_name)) { sstlog.warn("Unable to delete {} because it doesn't exist.", sstable_toc_name); - co_return; + co_return ""; } } + co_return new_toc_name; +} + +future<> remove_by_toc_name(sstring sstable_toc_name, storage::sync_dir sync) { + sstring prefix = sstable_toc_name.substr(0, sstable_toc_name.size() - sstable_version_constants::TOC_SUFFIX.size()); + + auto new_toc_name = co_await make_toc_temporary(sstable_toc_name, sync); + if (new_toc_name.empty()) { + co_return; + } + auto toc_file = co_await open_checked_file_dma(sstable_write_error_handler, new_toc_name, open_flags::ro); std::vector components = co_await with_closeable(std::move(toc_file), [] (file& toc_file) { return sstable::read_and_parse_toc(toc_file); diff --git a/sstables/sstables.hh b/sstables/sstables.hh index 2991c83f6d..7d67891b4b 100644 --- a/sstables/sstables.hh +++ b/sstables/sstables.hh @@ -1021,5 +1021,10 @@ future<> remove_table_directory_if_has_no_snapshots(fs::path table_dir); // Caller may pass sync_dir::no for batching multiple deletes in the same directory, // and make sure the directory is sync'ed on or after the last call. future<> remove_by_toc_name(sstring sstable_toc_name, storage::sync_dir sync = storage::sync_dir::yes); +// makes sure the TOC file is temporary by moving existing TOC file or otherwise +// checking the temporary-TOC already exists +// resolves into temporary-TOC file name or empty string if neither TOC nor temp. +// TOC is there +future make_toc_temporary(sstring sstable_toc_name, storage::sync_dir sync = storage::sync_dir::yes); } // namespace sstables From de931702ec5fcca013e62a864587833945c39766 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 30 Nov 2023 16:36:46 +0300 Subject: [PATCH 08/10] sstable_directory: Enlight deletion log replay Garbage collection of sstables is scattered between two strages -- g.c. per-se and the regular processing. The former stage collects deletion logs and for each log found goes ahead and deletes the full sstable with the standard sequence: - move TOC -> TOC.tmp - remove components - remove TOC.tmp The latter stage picks up partially unlinked sstables that didn't go via atomic deletion with the log. This comes as - collect all components - keep TOC's and TOC.tmp's in separate lists - attach other components to TOC/TOC.tmp by generation value - for all TOC.tmp's get all attached components and remove them - continue loading TOC's with attached components Said that, replaying deletion log can be as light as just the first step out of the above sequence -- just move TOC to TOC.tmp. After that the regular processing would pick the remaining components and clean them Signed-off-by: Pavel Emelyanov --- sstables/sstable_directory.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sstables/sstable_directory.cc b/sstables/sstable_directory.cc index f9dce37e30..5b04b45950 100644 --- a/sstables/sstable_directory.cc +++ b/sstables/sstable_directory.cc @@ -639,7 +639,8 @@ future<> sstable_directory::filesystem_components_lister::replay_pending_delete_ boost::split(basenames, all, boost::is_any_of("\n"), boost::token_compress_on); auto tocs = boost::copy_range>(basenames | boost::adaptors::filtered([] (auto&& basename) { return !basename.empty(); })); co_await parallel_for_each(tocs, [&sstdir] (const sstring& name) { - return remove_by_toc_name(sstdir + "/" + name); + // Only move TOC to TOC.tmp, the rest will be finished by regular process + return make_toc_temporary(sstdir + "/" + name).discard_result(); }); sstlog.debug("Replayed {}, removing", pending_delete_log); co_await remove_file(pending_delete_log.native()); From 4405a625f65269703b3f0b82506ebfbbb2851426 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 30 Nov 2023 16:52:21 +0300 Subject: [PATCH 09/10] sstables/fs_storage: Wipe by recognized+unrecognized components Currently wiping fs-backed sstable happens via reading and parsing its TOC file back. Then the three-step process goes: - move TOC -> TOC.tmp - remove components (obtained from TOC.tmp) - remove TOC.tmp However, wiping sstable happens in one of two cases -- the sstable was loaded from the TOC file _or_ sstable had evaluated the needed components and generated TOC file. With that, the 2nd step can be made without reading the TOC file, just by looking at all components sitting on the sstable Signed-off-by: Pavel Emelyanov --- sstables/storage.cc | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/sstables/storage.cc b/sstables/storage.cc index 3c4a7cd047..cfa1911150 100644 --- a/sstables/storage.cc +++ b/sstables/storage.cc @@ -399,6 +399,10 @@ future<> filesystem_storage::change_state(const sstable& sst, sstable_state stat co_await move(sst, path.native(), std::move(new_generation), delay_commit); } +static inline fs::path parent_path(const sstring& fname) { + return fs::canonical(fs::path(fname)).parent_path(); +} + future<> filesystem_storage::wipe(const sstable& sst, sync_dir sync) noexcept { // We must be able to generate toc_filename() // in order to delete the sstable. @@ -409,7 +413,31 @@ future<> filesystem_storage::wipe(const sstable& sst, sync_dir sync) noexcept { }(); try { - co_await remove_by_toc_name(name, sync); + auto new_toc_name = co_await make_toc_temporary(name, sync); + if (!new_toc_name.empty()) { + auto dir_name = parent_path(new_toc_name); + + co_await coroutine::parallel_for_each(sst.all_components(), [&sst, &dir_name] (auto component) -> future<> { + if (component.first == component_type::TOC) { + // already renamed + co_return; + } + + auto fname = sstable::filename(dir_name.native(), sst._schema->ks_name(), sst._schema->cf_name(), sst._version, sst._generation, sst._format, component.second); + try { + co_await sst.sstable_write_io_check(remove_file, fname); + } catch (...) { + if (!is_system_error_errno(ENOENT)) { + throw; + } + sstlog.debug("Forgiving ENOENT when deleting file {}", fname); + } + }); + if (sync) { + co_await sst.sstable_write_io_check(sync_directory, dir_name.native()); + } + co_await sst.sstable_write_io_check(remove_file, new_toc_name); + } } catch (...) { // Log and ignore the failure since there is nothing much we can do about it at this point. // a. Compaction will retry deleting the sstable in the next pass, and From 17fd558df8141d50838236d0cb68c483c0ccea32 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 30 Nov 2023 16:52:53 +0300 Subject: [PATCH 10/10] sstables: Drop remove_by_name() It was used by deletion log replay and by storage wipe, now it's unused Signed-off-by: Pavel Emelyanov --- sstables/sstables.cc | 38 -------------------------------------- sstables/sstables.hh | 4 ---- 2 files changed, 42 deletions(-) diff --git a/sstables/sstables.cc b/sstables/sstables.cc index be0cd53465..8872b24c97 100644 --- a/sstables/sstables.cc +++ b/sstables/sstables.cc @@ -2621,44 +2621,6 @@ future make_toc_temporary(sstring sstable_toc_name, storage::sync_dir s co_return new_toc_name; } -future<> remove_by_toc_name(sstring sstable_toc_name, storage::sync_dir sync) { - sstring prefix = sstable_toc_name.substr(0, sstable_toc_name.size() - sstable_version_constants::TOC_SUFFIX.size()); - - auto new_toc_name = co_await make_toc_temporary(sstable_toc_name, sync); - if (new_toc_name.empty()) { - co_return; - } - - auto toc_file = co_await open_checked_file_dma(sstable_write_error_handler, new_toc_name, open_flags::ro); - std::vector components = co_await with_closeable(std::move(toc_file), [] (file& toc_file) { - return sstable::read_and_parse_toc(toc_file); - }); - - co_await coroutine::parallel_for_each(components, [&prefix] (sstring component) -> future<> { - if (component.empty()) { - // eof - co_return; - } - if (component == sstable_version_constants::TOC_SUFFIX) { - // already renamed - co_return; - } - auto fname = prefix + component; - try { - co_await sstable_io_check(sstable_write_error_handler, remove_file, fname); - } catch (...) { - if (!is_system_error_errno(ENOENT)) { - throw; - } - sstlog.debug("Forgiving ENOENT when deleting file {}", fname); - } - }); - if (sync) { - co_await sstable_io_check(sstable_write_error_handler, sync_directory, parent_path(new_toc_name)); - } - co_await sstable_io_check(sstable_write_error_handler, remove_file, new_toc_name); -} - /** * Returns a pair of positions [p1, p2) in the summary file corresponding to * pages which may include keys covered by the specified range, or a disengaged diff --git a/sstables/sstables.hh b/sstables/sstables.hh index 7d67891b4b..afd71095b8 100644 --- a/sstables/sstables.hh +++ b/sstables/sstables.hh @@ -1017,10 +1017,6 @@ public: // swallows all errors and just reports them to the log. future<> remove_table_directory_if_has_no_snapshots(fs::path table_dir); -// similar to sstable::unlink, but works on a TOC file name -// Caller may pass sync_dir::no for batching multiple deletes in the same directory, -// and make sure the directory is sync'ed on or after the last call. -future<> remove_by_toc_name(sstring sstable_toc_name, storage::sync_dir sync = storage::sync_dir::yes); // makes sure the TOC file is temporary by moving existing TOC file or otherwise // checking the temporary-TOC already exists // resolves into temporary-TOC file name or empty string if neither TOC nor temp.