From 82425fd24a22a7d0641eb98d0bf512ca4b6533ab Mon Sep 17 00:00:00 2001 From: "Raphael S. Carvalho" Date: Thu, 13 Aug 2015 12:37:24 -0300 Subject: [PATCH] sstables: initial work on handling a partially written sstable The solution was proposed by Nadav. When writing a new sstable, write all usual files, write the TOC to a temporary file, and then rename it, which is atomic. Files not belonging to any TOC are invalid, so we ensure that partially written sstables aren't reused. Avi also proposed using fsync on the sstable directory to guarantee that the files reached the disk before sealing the sstable. Subsequently, we should add code to avoid loading an sstable whose TOC is either temporary or doesn't exist. Temporary TOC files should also be deleted. Signed-off-by: Raphael S. Carvalho --- sstables/sstables.cc | 26 +++++++++++++++++++++++--- sstables/sstables.hh | 3 ++- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/sstables/sstables.cc b/sstables/sstables.cc index 3e35b439fb..75be849dd1 100644 --- a/sstables/sstables.cc +++ b/sstables/sstables.cc @@ -679,7 +679,9 @@ future<> sstable::read_toc() { } void sstable::write_toc() { - auto file_path = filename(sstable::component_type::TOC); + // Create TOC file with the string 'tmp-' prepended to it, meaning TOC + // is a temporary file. + auto file_path = temporary_filename(sstable::component_type::TOC); sstlog.debug("Writing TOC file {} ", file_path); @@ -695,6 +697,16 @@ void sstable::write_toc() { } w.flush().get(); w.close().get(); + + file dir_f = engine().open_directory(_dir).get0(); + // Guarantee that every component of this sstable reached the disk. + dir_f.flush().get(); + // Rename TOC because it's no longer temporary. + engine().rename_file(file_path, filename(sstable::component_type::TOC)).get(); + // Guarantee that the changes above reached the disk. + dir_f.flush().get(); + dir_f.close().get(); + // If this point was reached, sstable should be safe in disk. 
} void write_crc(const sstring file_path, checksum& c) { @@ -1368,8 +1380,12 @@ const sstring sstable::filename(component_type f) { return filename(_dir, _ks, _cf, _version, _generation, _format, f); } +const sstring sstable::temporary_filename(component_type f) { + return filename(_dir, _ks, _cf, _version, _generation, _format, f, true); +} + const sstring sstable::filename(sstring dir, sstring ks, sstring cf, version_types version, unsigned long generation, - format_types format, component_type component) { + format_types format, component_type component, bool temporary) { static std::unordered_map, enum_hash> strmap = { { sstable::version_types::ka, [] (entry_descriptor d) { @@ -1380,7 +1396,11 @@ const sstring sstable::filename(sstring dir, sstring ks, sstring cf, version_typ } }; - return dir + "/" + strmap[version](entry_descriptor(ks, cf, version, generation, format, component)); + if (temporary) { + return dir + "/tmp-" + strmap[version](entry_descriptor(ks, cf, version, generation, format, component)); + } else { + return dir + "/" + strmap[version](entry_descriptor(ks, cf, version, generation, format, component)); + } } entry_descriptor entry_descriptor::make_descriptor(sstring fname) { diff --git a/sstables/sstables.hh b/sstables/sstables.hh index 92299191da..e51a5f199c 100644 --- a/sstables/sstables.hh +++ b/sstables/sstables.hh @@ -167,7 +167,7 @@ public: static version_types version_from_sstring(sstring& s); static format_types format_from_sstring(sstring& s); static const sstring filename(sstring dir, sstring ks, sstring cf, version_types version, unsigned long generation, - format_types format, component_type component); + format_types format, component_type component, bool temporary = false); future<> load(); @@ -281,6 +281,7 @@ private: const bool has_component(component_type f); const sstring filename(component_type f); + const sstring temporary_filename(component_type f); template future<> read_simple(T& comp);