From 29102641ddf4632bddf34bbd35de9637ea526dee Mon Sep 17 00:00:00 2001 From: Jay Zhuang Date: Wed, 3 Nov 2021 12:20:19 -0700 Subject: [PATCH] Skip directory fsync for filesystem btrfs (#8903) Summary: Directory fsync might be expensive on btrfs and it may not be needed. Here are 4 directory fsync cases: 1. creating a new file: dir-fsync is not needed on btrfs, as long as the new file itself is synced. 2. renaming a file: dir-fsync is not needed if the renamed file is synced. So an API `FsyncAfterFileRename(filename, ...)` is provided to sync the file on btrfs. By default, it just calls dir-fsync. 3. deleting files: dir-fsync is forced by set `IOOptions.force_dir_fsync = true` 4. renaming multiple files (like backup and checkpoint): dir-fsync is forced, the same as above. Pull Request resolved: https://github.com/facebook/rocksdb/pull/8903 Test Plan: run tests on btrfs and non btrfs Reviewed By: ajkr Differential Revision: D30885059 Pulled By: jay-zhuang fbshipit-source-id: dd2730b31580b0bcaedffc318a762d7dbf25de4a --- HISTORY.md | 2 + Makefile | 3 + db/compaction/compaction_job.cc | 11 +- db/db_impl/db_impl.cc | 12 ++- db/db_impl/db_impl.h | 3 + db/db_impl/db_impl_compaction_flush.cc | 8 +- db/db_impl/db_impl_debug.cc | 8 ++ db/db_impl/db_impl_files.cc | 8 +- db/db_impl/db_impl_open.cc | 6 +- db/db_impl/db_impl_write.cc | 4 +- db/db_secondary_test.cc | 2 +- db/deletefile_test.cc | 55 ++++++++-- db/external_sst_file_ingestion_job.cc | 4 +- db/flush_job.cc | 4 +- env/composite_env.cc | 2 +- env/file_system.cc | 12 +++ env/io_posix.cc | 52 ++++++++- env/io_posix.h | 7 +- file/delete_scheduler.cc | 4 +- file/filename.cc | 15 ++- include/rocksdb/file_system.h | 47 ++++++++- microbench/db_basic_bench.cc | 134 ++++++++++++++++++++++++ util/timer_test.cc | 1 + utilities/backupable/backupable_db.cc | 34 +++--- utilities/blob_db/blob_db_impl.cc | 9 +- utilities/blob_db/blob_db_impl.h | 2 +- utilities/checkpoint/checkpoint_impl.cc | 18 ++-- utilities/fault_injection_fs.cc | 23 ++++ utilities/fault_injection_fs.h | 4 + 29 files changed, 440 insertions(+), 54 deletions(-) create mode 100644 microbench/db_basic_bench.cc diff --git a/HISTORY.md b/HISTORY.md index 499d5025d..e915ec8e9 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -20,6 +20,8 @@ * Made FileSystem extend the Customizable class and added a CreateFromString method. Implementations need to be registered with the ObjectRegistry and to implement a Name() method in order to be created via this method. * Clarified in API comments that RocksDB is not exception safe for callbacks and custom extensions. An exception propagating into RocksDB can lead to undefined behavior, including data loss, unreported corruption, deadlocks, and more. * Marked `WriteBufferManager` as `final` because it is not intended for extension. +* Add API `FSDirectory::FsyncWithDirOptions()`, which provides extra information like directory fsync reason in `DirFsyncOptions`. File system like btrfs is using that to skip directory fsync for creating a new file, or when renaming a file, fsync the target file instead of the directory, which improves the `DB::Open()` speed by ~20%. +* `DB::Open()` is not going be blocked by obsolete file purge if `DBOptions::avoid_unnecessary_blocking_io` is set to true. ## 6.26.0 (2021-10-20) ### Bug Fixes diff --git a/Makefile b/Makefile index c3aa71f12..fb16a213e 100644 --- a/Makefile +++ b/Makefile @@ -1926,6 +1926,9 @@ clipping_iterator_test: $(OBJ_DIR)/db/compaction/clipping_iterator_test.o $(TEST ribbon_bench: $(OBJ_DIR)/microbench/ribbon_bench.o $(LIBRARY) $(AM_LINK) +db_basic_bench: $(OBJ_DIR)/microbench/db_basic_bench.o $(LIBRARY) + $(AM_LINK) + cache_reservation_manager_test: $(OBJ_DIR)/cache/cache_reservation_manager_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) #------------------------------------------------- diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index 1d5ccc1d5..4958af3c9 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -763,12 +763,16 @@ Status CompactionJob::Run() { constexpr IODebugContext* dbg = nullptr; if (output_directory_) { - io_s = output_directory_->Fsync(IOOptions(), dbg); + io_s = output_directory_->FsyncWithDirOptions( + IOOptions(), dbg, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); } if (io_s.ok() && wrote_new_blob_files && blob_output_directory_ && blob_output_directory_ != output_directory_) { - io_s = blob_output_directory_->Fsync(IOOptions(), dbg); + io_s = blob_output_directory_->FsyncWithDirOptions( + IOOptions(), dbg, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); } } if (io_status_.ok()) { @@ -2442,7 +2446,8 @@ Status CompactionServiceCompactionJob::Run() { constexpr IODebugContext* dbg = nullptr; if (output_directory_) { - io_s = output_directory_->Fsync(IOOptions(), dbg); + io_s = output_directory_->FsyncWithDirOptions(IOOptions(), dbg, + DirFsyncOptions()); } } if (io_status_.ok()) { diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index ce4ad580f..8e3e0decc 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -1389,7 +1389,9 @@ Status DBImpl::SyncWAL() { IOStatusCheck(io_s); } if (status.ok() && need_log_dir_sync) { - status = directories_.GetWalDir()->Fsync(IOOptions(), nullptr); + status = directories_.GetWalDir()->FsyncWithDirOptions( + IOOptions(), nullptr, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); } TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:2"); @@ -4288,6 +4290,14 @@ Status DBImpl::RenameTempFileToOptionsFile(const std::string& file_name) { if (s.ok()) { // Retry if the file name happen to conflict with an existing one. s = GetEnv()->RenameFile(file_name, options_file_name); + std::unique_ptr dir_obj; + if (s.ok()) { + s = fs_->NewDirectory(GetName(), IOOptions(), &dir_obj, nullptr); + } + if (s.ok()) { + s = dir_obj->FsyncWithDirOptions(IOOptions(), nullptr, + DirFsyncOptions(options_file_name)); + } } if (s.ok()) { InstrumentedMutexLock l(&mutex_); diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index c07eea23b..5dc8fa8f8 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -974,6 +974,9 @@ class DBImpl : public DB { // is only for the special test of CancelledCompactions Status TEST_WaitForCompact(bool waitUnscheduled = false); + // Wait for any background purge + Status TEST_WaitForPurge(); + // Get the background error status Status TEST_GetBGError(); diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index 6d7b0478a..d2a49fbbf 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -120,7 +120,9 @@ IOStatus DBImpl::SyncClosedLogs(JobContext* job_context) { } } if (io_s.ok()) { - io_s = directories_.GetWalDir()->Fsync(IOOptions(), nullptr); + io_s = directories_.GetWalDir()->FsyncWithDirOptions( + IOOptions(), nullptr, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); } mutex_.Lock(); @@ -532,7 +534,9 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( // Sync on all distinct output directories. for (auto dir : distinct_output_dirs) { if (dir != nullptr) { - Status error_status = dir->Fsync(IOOptions(), nullptr); + Status error_status = dir->FsyncWithDirOptions( + IOOptions(), nullptr, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); if (!error_status.ok()) { s = error_status; break; diff --git a/db/db_impl/db_impl_debug.cc b/db/db_impl/db_impl_debug.cc index 821925f92..2bdc66688 100644 --- a/db/db_impl/db_impl_debug.cc +++ b/db/db_impl/db_impl_debug.cc @@ -184,6 +184,14 @@ Status DBImpl::TEST_WaitForCompact(bool wait_unscheduled) { return error_handler_.GetBGError(); } +Status DBImpl::TEST_WaitForPurge() { + InstrumentedMutexLock l(&mutex_); + while (bg_purge_scheduled_ && error_handler_.GetBGError().ok()) { + bg_cv_.Wait(); + } + return error_handler_.GetBGError(); +} + Status DBImpl::TEST_GetBGError() { InstrumentedMutexLock l(&mutex_); return error_handler_.GetBGError(); diff --git a/db/db_impl/db_impl_files.cc b/db/db_impl/db_impl_files.cc index 6ae170c69..6007d2ffb 100644 --- a/db/db_impl/db_impl_files.cc +++ b/db/db_impl/db_impl_files.cc @@ -655,7 +655,13 @@ void DBImpl::DeleteObsoleteFiles() { mutex_.Unlock(); if (job_context.HaveSomethingToDelete()) { - PurgeObsoleteFiles(job_context); + bool defer_purge = immutable_db_options_.avoid_unnecessary_blocking_io; + PurgeObsoleteFiles(job_context, defer_purge); + if (defer_purge) { + mutex_.Lock(); + SchedulePurge(); + mutex_.Unlock(); + } } job_context.Clean(); mutex_.Lock(); diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index 157c3304c..2caca7b3e 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -1694,10 +1694,6 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, if (impl->two_write_queues_) { impl->log_write_mutex_.Unlock(); } - - impl->DeleteObsoleteFiles(); - s = impl->directories_.GetDbDir()->Fsync(IOOptions(), nullptr); - TEST_SYNC_POINT("DBImpl::Open:AfterDeleteFilesAndSyncDir"); } if (s.ok()) { // In WritePrepared there could be gap in sequence numbers. This breaks @@ -1770,6 +1766,8 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, *dbptr = impl; impl->opened_successfully_ = true; + impl->DeleteObsoleteFiles(); + TEST_SYNC_POINT("DBImpl::Open:AfterDeleteFiles"); impl->MaybeScheduleFlushOrCompaction(); } else { persist_options_status.PermitUncheckedError(); diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc index 1eba26405..a8fdbda96 100644 --- a/db/db_impl/db_impl_write.cc +++ b/db/db_impl/db_impl_write.cc @@ -1142,7 +1142,9 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group, // We only sync WAL directory the first time WAL syncing is // requested, so that in case users never turn on WAL sync, // we can avoid the disk I/O in the write code path. - io_s = directories_.GetWalDir()->Fsync(IOOptions(), nullptr); + io_s = directories_.GetWalDir()->FsyncWithDirOptions( + IOOptions(), nullptr, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); } } diff --git a/db/db_secondary_test.cc b/db/db_secondary_test.cc index 35a7bd2cf..4f1a31dcf 100644 --- a/db/db_secondary_test.cc +++ b/db/db_secondary_test.cc @@ -690,7 +690,7 @@ TEST_F(DBSecondaryTest, SwitchToNewManifestDuringOpen) { SyncPoint::GetInstance()->LoadDependency( {{"ReactiveVersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:0", "VersionSet::ProcessManifestWrites:BeforeNewManifest"}, - {"DBImpl::Open:AfterDeleteFilesAndSyncDir", + {"DBImpl::Open:AfterDeleteFiles", "ReactiveVersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:" "1"}}); SyncPoint::GetInstance()->EnableProcessing(); diff --git a/db/deletefile_test.cc b/db/deletefile_test.cc index 924d71e50..7b2031781 100644 --- a/db/deletefile_test.cc +++ b/db/deletefile_test.cc @@ -117,9 +117,15 @@ class DeleteFileTest : public DBTestBase { manifest_cnt += (type == kDescriptorFile); } } - ASSERT_EQ(required_log, log_cnt); - ASSERT_EQ(required_sst, sst_cnt); - ASSERT_EQ(required_manifest, manifest_cnt); + if (required_log >= 0) { + ASSERT_EQ(required_log, log_cnt); + } + if (required_sst >= 0) { + ASSERT_EQ(required_sst, sst_cnt); + } + if (required_manifest >= 0) { + ASSERT_EQ(required_manifest, manifest_cnt); + } } static void DoSleep(void* arg) { @@ -264,6 +270,41 @@ TEST_F(DeleteFileTest, BackgroundPurgeIteratorTest) { CheckFileTypeCounts(dbname_, 0, 1, 1); } +TEST_F(DeleteFileTest, PurgeDuringOpen) { + Options options = CurrentOptions(); + CheckFileTypeCounts(dbname_, -1, 0, -1); + Close(); + std::unique_ptr file; + ASSERT_OK(options.env->NewWritableFile(dbname_ + "/000002.sst", &file, + EnvOptions())); + ASSERT_OK(file->Close()); + CheckFileTypeCounts(dbname_, -1, 1, -1); + options.avoid_unnecessary_blocking_io = false; + options.create_if_missing = false; + Reopen(options); + CheckFileTypeCounts(dbname_, -1, 0, -1); + Close(); + + // test background purge + options.avoid_unnecessary_blocking_io = true; + options.create_if_missing = false; + ASSERT_OK(options.env->NewWritableFile(dbname_ + "/000002.sst", &file, + EnvOptions())); + ASSERT_OK(file->Close()); + CheckFileTypeCounts(dbname_, -1, 1, -1); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->LoadDependency( + {{"DeleteFileTest::PurgeDuringOpen:1", "DBImpl::BGWorkPurge:start"}}); + SyncPoint::GetInstance()->EnableProcessing(); + Reopen(options); + // the obsolete file is not deleted until the background purge job is ran + CheckFileTypeCounts(dbname_, -1, 1, -1); + TEST_SYNC_POINT("DeleteFileTest::PurgeDuringOpen:1"); + ASSERT_OK(dbfull()->TEST_WaitForPurge()); + CheckFileTypeCounts(dbname_, -1, 0, -1); +} + TEST_F(DeleteFileTest, BackgroundPurgeCFDropTest) { Options options = CurrentOptions(); SetOptions(&options); @@ -310,6 +351,11 @@ TEST_F(DeleteFileTest, BackgroundPurgeCFDropTest) { do_test(false); } + options.avoid_unnecessary_blocking_io = true; + options.create_if_missing = false; + Reopen(options); + ASSERT_OK(dbfull()->TEST_WaitForPurge()); + SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->LoadDependency( @@ -317,9 +363,6 @@ TEST_F(DeleteFileTest, BackgroundPurgeCFDropTest) { "DBImpl::BGWorkPurge:start"}}); SyncPoint::GetInstance()->EnableProcessing(); - options.avoid_unnecessary_blocking_io = true; - options.create_if_missing = false; - Reopen(options); { SCOPED_TRACE("avoid_unnecessary_blocking_io = true"); do_test(true); diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index 437b40424..2d617b2a3 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -168,7 +168,9 @@ Status ExternalSstFileIngestionJob::Prepare( TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncDir"); if (status.ok()) { for (auto path_id : ingestion_path_ids) { - status = directories_->GetDataDir(path_id)->Fsync(IOOptions(), nullptr); + status = directories_->GetDataDir(path_id)->FsyncWithDirOptions( + IOOptions(), nullptr, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); if (!status.ok()) { ROCKS_LOG_WARN(db_options_.info_log, "Failed to sync directory %" ROCKSDB_PRIszt diff --git a/db/flush_job.cc b/db/flush_job.cc index a6b93f2b2..c3a70d160 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -937,7 +937,9 @@ Status FlushJob::WriteLevel0Table() { meta_.marked_for_compaction ? " (needs compaction)" : ""); if (s.ok() && output_file_directory_ != nullptr && sync_output_directory_) { - s = output_file_directory_->Fsync(IOOptions(), nullptr); + s = output_file_directory_->FsyncWithDirOptions( + IOOptions(), nullptr, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); } TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table", &mems_); db_mutex_->Lock(); diff --git a/env/composite_env.cc b/env/composite_env.cc index 0d70855af..98cd0d6f4 100644 --- a/env/composite_env.cc +++ b/env/composite_env.cc @@ -272,7 +272,7 @@ class CompositeDirectoryWrapper : public Directory { Status Fsync() override { IOOptions io_opts; IODebugContext dbg; - return target_->Fsync(io_opts, &dbg); + return target_->FsyncWithDirOptions(io_opts, &dbg, DirFsyncOptions()); } size_t GetUniqueId(char* id, size_t max_size) const override { return target_->GetUniqueId(id, max_size); diff --git a/env/file_system.cc b/env/file_system.cc index 90edc4b27..fbbfe6b12 100644 --- a/env/file_system.cc +++ b/env/file_system.cc @@ -241,4 +241,16 @@ std::string FileSystemWrapper::SerializeOptions( } } #endif // ROCKSDB_LITE + +DirFsyncOptions::DirFsyncOptions() { reason = kDefault; } + +DirFsyncOptions::DirFsyncOptions(std::string file_renamed_new_name) { + reason = kFileRenamed; + renamed_new_name = file_renamed_new_name; +} + +DirFsyncOptions::DirFsyncOptions(FsyncReason fsync_reason) { + assert(fsync_reason != kFileRenamed); + reason = fsync_reason; +} } // namespace ROCKSDB_NAMESPACE diff --git a/env/io_posix.cc b/env/io_posix.cc index ccdb33825..8b09cce57 100644 --- a/env/io_posix.cc +++ b/env/io_posix.cc @@ -1534,17 +1534,61 @@ PosixMemoryMappedFileBuffer::~PosixMemoryMappedFileBuffer() { /* * PosixDirectory */ +#if !defined(BTRFS_SUPER_MAGIC) +// The magic number for BTRFS is fixed, if it's not defined, define it here +#define BTRFS_SUPER_MAGIC 0x9123683E +#endif +PosixDirectory::PosixDirectory(int fd) : fd_(fd) { + is_btrfs_ = false; +#ifdef OS_LINUX + struct statfs buf; + int ret = fstatfs(fd, &buf); + is_btrfs_ = (ret == 0 && buf.f_type == BTRFS_SUPER_MAGIC); +#endif +} PosixDirectory::~PosixDirectory() { close(fd_); } -IOStatus PosixDirectory::Fsync(const IOOptions& /*opts*/, - IODebugContext* /*dbg*/) { +IOStatus PosixDirectory::Fsync(const IOOptions& opts, IODebugContext* dbg) { + return FsyncWithDirOptions(opts, dbg, DirFsyncOptions()); +} + +IOStatus PosixDirectory::FsyncWithDirOptions( + const IOOptions& /*opts*/, IODebugContext* /*dbg*/, + const DirFsyncOptions& dir_fsync_options) { + IOStatus s = IOStatus::OK(); #ifndef OS_AIX + if (is_btrfs_) { + // skip dir fsync for new file creation, which is not needed for btrfs + if (dir_fsync_options.reason == DirFsyncOptions::kNewFileSynced) { + return s; + } + // skip dir fsync for renaming file, only need to sync new file + if (dir_fsync_options.reason == DirFsyncOptions::kFileRenamed) { + std::string new_name = dir_fsync_options.renamed_new_name; + assert(!new_name.empty()); + int fd; + do { + IOSTATS_TIMER_GUARD(open_nanos); + fd = open(new_name.c_str(), O_RDONLY); + } while (fd < 0 && errno == EINTR); + if (fd < 0) { + s = IOError("While open renaming file", new_name, errno); + } else if (fsync(fd) < 0) { + s = IOError("While fsync renaming file", new_name, errno); + } + if (close(fd) < 0) { + s = IOError("While closing file after fsync", new_name, errno); + } + return s; + } + // fallback to dir-fsync for kDefault, kDirRenamed and kFileDeleted + } if (fsync(fd_) == -1) { - return IOError("While fsync", "a directory", errno); + s = IOError("While fsync", "a directory", errno); } #endif - return IOStatus::OK(); + return s; } } // namespace ROCKSDB_NAMESPACE #endif diff --git a/env/io_posix.h b/env/io_posix.h index 2d8e83c9d..94f579c50 100644 --- a/env/io_posix.h +++ b/env/io_posix.h @@ -391,12 +391,17 @@ struct PosixMemoryMappedFileBuffer : public MemoryMappedFileBuffer { class PosixDirectory : public FSDirectory { public: - explicit PosixDirectory(int fd) : fd_(fd) {} + explicit PosixDirectory(int fd); ~PosixDirectory(); virtual IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override; + virtual IOStatus FsyncWithDirOptions( + const IOOptions&, IODebugContext*, + const DirFsyncOptions& dir_fsync_options) override; + private: int fd_; + bool is_btrfs_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/file/delete_scheduler.cc b/file/delete_scheduler.cc index 5f9994074..4933c15b5 100644 --- a/file/delete_scheduler.cc +++ b/file/delete_scheduler.cc @@ -357,7 +357,9 @@ Status DeleteScheduler::DeleteTrashFile(const std::string& path_in_trash, s = fs_->NewDirectory(dir_to_sync, IOOptions(), &dir_obj, nullptr); } if (s.ok()) { - s = dir_obj->Fsync(IOOptions(), nullptr); + s = dir_obj->FsyncWithDirOptions( + IOOptions(), nullptr, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kFileDeleted)); TEST_SYNC_POINT_CALLBACK( "DeleteScheduler::DeleteTrashFile::AfterSyncDir", reinterpret_cast(const_cast(&dir_to_sync))); diff --git a/file/filename.cc b/file/filename.cc index 4f22b5da3..358f88cf0 100644 --- a/file/filename.cc +++ b/file/filename.cc @@ -405,7 +405,8 @@ IOStatus SetCurrentFile(FileSystem* fs, const std::string& dbname, } if (s.ok()) { if (directory_to_fsync != nullptr) { - s = directory_to_fsync->Fsync(IOOptions(), nullptr); + s = directory_to_fsync->FsyncWithDirOptions( + IOOptions(), nullptr, DirFsyncOptions(CurrentFileName(dbname))); } } else { fs->DeleteFile(tmp, IOOptions(), nullptr) @@ -428,9 +429,19 @@ Status SetIdentityFile(Env* env, const std::string& dbname, assert(!id.empty()); // Reserve the filename dbname/000000.dbtmp for the temporary identity file std::string tmp = TempFileName(dbname, 0); + std::string identify_file_name = IdentityFileName(dbname); Status s = WriteStringToFile(env, id, tmp, true); if (s.ok()) { - s = env->RenameFile(tmp, IdentityFileName(dbname)); + s = env->RenameFile(tmp, identify_file_name); + } + std::unique_ptr dir_obj; + if (s.ok()) { + s = env->GetFileSystem()->NewDirectory(dbname, IOOptions(), &dir_obj, + nullptr); + } + if (s.ok()) { + s = dir_obj->FsyncWithDirOptions(IOOptions(), nullptr, + DirFsyncOptions(identify_file_name)); } if (!s.ok()) { env->DeleteFile(tmp).PermitUncheckedError(); diff --git a/include/rocksdb/file_system.h b/include/rocksdb/file_system.h index 0c409a988..6cab02487 100644 --- a/include/rocksdb/file_system.h +++ b/include/rocksdb/file_system.h @@ -100,7 +100,36 @@ struct IOOptions { // such as NewRandomAccessFile and NewWritableFile. std::unordered_map property_bag; - IOOptions() : timeout(0), prio(IOPriority::kIOLow), type(IOType::kUnknown) {} + // Force directory fsync, some file systems like btrfs may skip directory + // fsync, set this to force the fsync + bool force_dir_fsync; + + IOOptions() : IOOptions(false) {} + + explicit IOOptions(bool force_dir_fsync_) + : timeout(std::chrono::microseconds::zero()), + prio(IOPriority::kIOLow), + type(IOType::kUnknown), + force_dir_fsync(force_dir_fsync_) {} +}; + +struct DirFsyncOptions { + enum FsyncReason : uint8_t { + kNewFileSynced, + kFileRenamed, + kDirRenamed, + kFileDeleted, + kDefault, + } reason; + + std::string renamed_new_name; // for kFileRenamed + // add other options for other FsyncReason + + DirFsyncOptions(); + + explicit DirFsyncOptions(std::string file_renamed_new_name); + + explicit DirFsyncOptions(FsyncReason fsync_reason); }; // File scope options that control how a file is opened/created and accessed @@ -1111,6 +1140,15 @@ class FSDirectory { // Fsync directory. Can be called concurrently from multiple threads. virtual IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) = 0; + // FsyncWithDirOptions after renaming a file. Depends on the filesystem, it + // may fsync directory or just the renaming file (e.g. btrfs). By default, it + // just calls directory fsync. + virtual IOStatus FsyncWithDirOptions( + const IOOptions& options, IODebugContext* dbg, + const DirFsyncOptions& /*dir_fsync_options*/) { + return Fsync(options, dbg); + } + virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const { return 0; } @@ -1623,6 +1661,13 @@ class FSDirectoryWrapper : public FSDirectory { IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override { return target_->Fsync(options, dbg); } + + IOStatus FsyncWithDirOptions( + const IOOptions& options, IODebugContext* dbg, + const DirFsyncOptions& dir_fsync_options) override { + return target_->FsyncWithDirOptions(options, dbg, dir_fsync_options); + } + size_t GetUniqueId(char* id, size_t max_size) const override { return target_->GetUniqueId(id, max_size); } diff --git a/microbench/db_basic_bench.cc b/microbench/db_basic_bench.cc new file mode 100644 index 000000000..eb4ca73ad --- /dev/null +++ b/microbench/db_basic_bench.cc @@ -0,0 +1,134 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +// this is a simple micro-benchmark for compare ribbon filter vs. other filter +// for more comprehensive, please check the dedicate util/filter_bench. +#include + +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +static void DBOpen(benchmark::State& state) { + // create DB + DB* db; + Options options; + auto env = Env::Default(); + std::string db_path; + auto s = env->GetTestDirectory(&db_path); + if (!s.ok()) { + state.SkipWithError(s.ToString().c_str()); + return; + } + std::string db_name = db_path + "/bench_dbopen"; + + DestroyDB(db_name, options); + + options.create_if_missing = true; + s = DB::Open(options, db_name, &db); + if (!s.ok()) { + state.SkipWithError(s.ToString().c_str()); + return; + } + db->Close(); + + options.create_if_missing = false; + + auto rnd = Random(12345); + + for (auto _ : state) { + s = DB::Open(options, db_name, &db); + if (!s.ok()) { + state.SkipWithError(s.ToString().c_str()); + } + state.PauseTiming(); + auto wo = WriteOptions(); + for (int i = 0; i < 2; i++) { + for (int j = 0; j < 100; j++) { + s = db->Put(wo, rnd.RandomString(10), rnd.RandomString(100)); + if (!s.ok()) { + state.SkipWithError(s.ToString().c_str()); + } + } + s = db->Flush(FlushOptions()); + } + if (!s.ok()) { + state.SkipWithError(s.ToString().c_str()); + } + s = db->Close(); + if (!s.ok()) { + state.SkipWithError(s.ToString().c_str()); + } + state.ResumeTiming(); + } + DestroyDB(db_name, options); +} + +BENCHMARK(DBOpen)->Iterations(200); // specify iteration number as the db size + // is impacted by iteration number + +static void DBClose(benchmark::State& state) { + // create DB + DB* db; + Options options; + auto env = Env::Default(); + std::string db_path; + auto s = env->GetTestDirectory(&db_path); + if (!s.ok()) { + state.SkipWithError(s.ToString().c_str()); + return; + } + std::string db_name = db_path + "/bench_dbclose"; + + DestroyDB(db_name, options); + + options.create_if_missing = true; + s = DB::Open(options, db_name, &db); + if (!s.ok()) { + state.SkipWithError(s.ToString().c_str()); + return; + } + db->Close(); + + options.create_if_missing = false; + + auto rnd = Random(12345); + + for (auto _ : state) { + state.PauseTiming(); + s = DB::Open(options, db_name, &db); + if (!s.ok()) { + state.SkipWithError(s.ToString().c_str()); + } + auto wo = WriteOptions(); + for (int i = 0; i < 2; i++) { + for (int j = 0; j < 100; j++) { + s = db->Put(wo, rnd.RandomString(10), rnd.RandomString(100)); + if (!s.ok()) { + state.SkipWithError(s.ToString().c_str()); + } + } + s = db->Flush(FlushOptions()); + } + if (!s.ok()) { + state.SkipWithError(s.ToString().c_str()); + } + state.ResumeTiming(); + s = db->Close(); + if (!s.ok()) { + state.SkipWithError(s.ToString().c_str()); + } + } + DestroyDB(db_name, options); +} + +BENCHMARK(DBClose)->Iterations(200); // specify iteration number as the db size + // is impacted by iteration number + +} // namespace ROCKSDB_NAMESPACE + +BENCHMARK_MAIN(); diff --git a/util/timer_test.cc b/util/timer_test.cc index a845e8ed7..9256fcd45 100644 --- a/util/timer_test.cc +++ b/util/timer_test.cc @@ -6,6 +6,7 @@ #include "util/timer.h" #include "db/db_test_util.h" +#include "rocksdb/file_system.h" #include "test_util/mock_time_env.h" namespace ROCKSDB_NAMESPACE { diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index 9249fbfdf..e49525c5c 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -1438,19 +1438,24 @@ IOStatus BackupEngineImpl::CreateNewBackupWithMetadata( io_options_, &backup_private_directory, nullptr) .PermitUncheckedError(); if (backup_private_directory != nullptr) { - io_s = backup_private_directory->Fsync(io_options_, nullptr); + io_s = backup_private_directory->FsyncWithDirOptions(io_options_, nullptr, + DirFsyncOptions()); } if (io_s.ok() && private_directory_ != nullptr) { - io_s = private_directory_->Fsync(io_options_, nullptr); + io_s = private_directory_->FsyncWithDirOptions(io_options_, nullptr, + DirFsyncOptions()); } if (io_s.ok() && meta_directory_ != nullptr) { - io_s = meta_directory_->Fsync(io_options_, nullptr); + io_s = meta_directory_->FsyncWithDirOptions(io_options_, nullptr, + DirFsyncOptions()); } if (io_s.ok() && shared_directory_ != nullptr) { - io_s = shared_directory_->Fsync(io_options_, nullptr); + io_s = shared_directory_->FsyncWithDirOptions(io_options_, nullptr, + DirFsyncOptions()); } if (io_s.ok() && backup_directory_ != nullptr) { - io_s = backup_directory_->Fsync(io_options_, nullptr); + io_s = backup_directory_->FsyncWithDirOptions(io_options_, nullptr, + DirFsyncOptions()); } } @@ -1832,15 +1837,17 @@ IOStatus BackupEngineImpl::RestoreDBFromBackup( } } - // When enabled, the first Fsync is to ensure all files are fully persisted - // before renaming CURRENT.tmp + // When enabled, the first FsyncWithDirOptions is to ensure all files are + // fully persisted before renaming CURRENT.tmp if (io_s.ok() && db_dir_for_fsync) { ROCKS_LOG_INFO(options_.info_log, "Restore: fsync\n"); - io_s = db_dir_for_fsync->Fsync(io_options_, nullptr); + io_s = db_dir_for_fsync->FsyncWithDirOptions(io_options_, nullptr, + DirFsyncOptions()); } if (io_s.ok() && wal_dir_for_fsync) { - io_s = wal_dir_for_fsync->Fsync(io_options_, nullptr); + io_s = wal_dir_for_fsync->FsyncWithDirOptions(io_options_, nullptr, + DirFsyncOptions()); } if (io_s.ok() && !temporary_current_file.empty()) { @@ -1851,11 +1858,12 @@ IOStatus BackupEngineImpl::RestoreDBFromBackup( } if (io_s.ok() && db_dir_for_fsync && !temporary_current_file.empty()) { - // Second Fsync is to ensure the final atomic rename of DB restore is - // fully persisted even if power goes out right after restore operation - // returns success + // Second FsyncWithDirOptions is to ensure the final atomic rename of DB + // restore is fully persisted even if power goes out right after restore + // operation returns success assert(db_dir_for_fsync); - io_s = db_dir_for_fsync->Fsync(io_options_, nullptr); + io_s = db_dir_for_fsync->FsyncWithDirOptions( + io_options_, nullptr, DirFsyncOptions(final_current_file)); } ROCKS_LOG_INFO(options_.info_log, "Restoring done -- %s\n", diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc index e3e0befe4..55c655152 100644 --- a/utilities/blob_db/blob_db_impl.cc +++ b/utilities/blob_db/blob_db_impl.cc @@ -177,7 +177,8 @@ Status BlobDBImpl::Open(std::vector* handles) { "Failed to create blob_dir %s, status: %s", blob_dir_.c_str(), s.ToString().c_str()); } - s = env_->NewDirectory(blob_dir_, &dir_ent_); + s = env_->GetFileSystem()->NewDirectory(blob_dir_, IOOptions(), &dir_ent_, + nullptr); if (!s.ok()) { ROCKS_LOG_ERROR(db_options_.info_log, "Failed to open blob_dir %s, status: %s", blob_dir_.c_str(), @@ -1913,7 +1914,7 @@ Status BlobDBImpl::SyncBlobFiles() { } } - s = dir_ent_->Fsync(); + s = dir_ent_->FsyncWithDirOptions(IOOptions(), nullptr, DirFsyncOptions()); if (!s.ok()) { ROCKS_LOG_ERROR(db_options_.info_log, "Failed to sync blob directory, status: %s", @@ -2005,7 +2006,9 @@ std::pair BlobDBImpl::DeleteObsoleteFiles(bool aborted) { // directory change. Fsync if (file_deleted) { - Status s = dir_ent_->Fsync(); + Status s = dir_ent_->FsyncWithDirOptions( + IOOptions(), nullptr, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kFileDeleted)); if (!s.ok()) { ROCKS_LOG_ERROR(db_options_.info_log, "Failed to sync dir %s: %s", blob_dir_.c_str(), s.ToString().c_str()); diff --git a/utilities/blob_db/blob_db_impl.h b/utilities/blob_db/blob_db_impl.h index ea7554e8f..3282011f3 100644 --- a/utilities/blob_db/blob_db_impl.h +++ b/utilities/blob_db/blob_db_impl.h @@ -419,7 +419,7 @@ class BlobDBImpl : public BlobDB { std::string blob_dir_; // pointer to directory - std::unique_ptr dir_ent_; + std::unique_ptr dir_ent_; // Read Write Mutex, which protects all the data structures // HEAVILY TRAFFICKED diff --git a/utilities/checkpoint/checkpoint_impl.cc b/utilities/checkpoint/checkpoint_impl.cc index a1f2e87a2..f2536667f 100644 --- a/utilities/checkpoint/checkpoint_impl.cc +++ b/utilities/checkpoint/checkpoint_impl.cc @@ -160,10 +160,13 @@ Status CheckpointImpl::CreateCheckpoint(const std::string& checkpoint_dir, s = db_->GetEnv()->RenameFile(full_private_path, checkpoint_dir); } if (s.ok()) { - std::unique_ptr checkpoint_directory; - s = db_->GetEnv()->NewDirectory(checkpoint_dir, &checkpoint_directory); + std::unique_ptr checkpoint_directory; + s = db_->GetFileSystem()->NewDirectory(checkpoint_dir, IOOptions(), + &checkpoint_directory, nullptr); if (s.ok() && checkpoint_directory != nullptr) { - s = checkpoint_directory->Fsync(); + s = checkpoint_directory->FsyncWithDirOptions( + IOOptions(), nullptr, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kDirRenamed)); } } @@ -348,11 +351,14 @@ Status CheckpointImpl::ExportColumnFamily( if (s.ok()) { // Fsync export directory. moved_to_user_specified_dir = true; - std::unique_ptr dir_ptr; - s = db_->GetEnv()->NewDirectory(export_dir, &dir_ptr); + std::unique_ptr dir_ptr; + s = db_->GetFileSystem()->NewDirectory(export_dir, IOOptions(), &dir_ptr, + nullptr); if (s.ok()) { assert(dir_ptr != nullptr); - s = dir_ptr->Fsync(); + s = dir_ptr->FsyncWithDirOptions( + IOOptions(), nullptr, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kDirRenamed)); } } diff --git a/utilities/fault_injection_fs.cc b/utilities/fault_injection_fs.cc index aef645e6c..a07476bcd 100644 --- a/utilities/fault_injection_fs.cc +++ b/utilities/fault_injection_fs.cc @@ -108,6 +108,29 @@ IOStatus TestFSDirectory::Fsync(const IOOptions& options, IODebugContext* dbg) { return s; } +IOStatus TestFSDirectory::FsyncWithDirOptions( + const IOOptions& options, IODebugContext* dbg, + const DirFsyncOptions& dir_fsync_options) { + if (!fs_->IsFilesystemActive()) { + return fs_->GetError(); + } + { + IOStatus in_s = fs_->InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + fs_->SyncDir(dirname_); + IOStatus s = dir_->FsyncWithDirOptions(options, dbg, dir_fsync_options); + { + IOStatus in_s = fs_->InjectMetadataWriteError(); + if (!in_s.ok()) { + return in_s; + } + } + return s; +} + TestFSWritableFile::TestFSWritableFile(const std::string& fname, const FileOptions& file_opts, std::unique_ptr&& f, diff --git a/utilities/fault_injection_fs.h b/utilities/fault_injection_fs.h index 868b9809c..ae9eda952 100644 --- a/utilities/fault_injection_fs.h +++ b/utilities/fault_injection_fs.h @@ -177,6 +177,10 @@ class TestFSDirectory : public FSDirectory { virtual IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override; + virtual IOStatus FsyncWithDirOptions( + const IOOptions& options, IODebugContext* dbg, + const DirFsyncOptions& dir_fsync_options) override; + private: FaultInjectionTestFS* fs_; std::string dirname_;