From ae0f9c3339c094937d02384c72f4783fffa85517 Mon Sep 17 00:00:00 2001 From: akankshamahajan Date: Mon, 3 Oct 2022 10:59:45 -0700 Subject: [PATCH] Add new property in IOOptions to skip recursing through directories and list only files during GetChildren. (#10668) Summary: Add new property "do_not_recurse" in IOOptions for underlying file system to skip iteration of directories during DB::Open if there are no sub directories and list only files. By default this property is set to false. This property is set true currently in the code where RocksDB is sure only files are needed during DB::Open. Provided support in PosixFileSystem to use "do_not_recurse". TestPlan: - Existing tests Pull Request resolved: https://github.com/facebook/rocksdb/pull/10668 Reviewed By: anand1976 Differential Revision: D39471683 Pulled By: akankshamahajan15 fbshipit-source-id: 90e32f0b86d5346d53bc2714d3a0e7002590527f --- HISTORY.md | 5 ++++- db/db_impl/db_impl.cc | 34 +++++++++++++++++++++++++++------ db/db_impl/db_impl_files.cc | 14 ++++++++++---- db/db_impl/db_impl_open.cc | 31 ++++++++++++++++++++++++------ db/db_impl/db_impl_secondary.cc | 6 +++++- env/fs_posix.cc | 12 ++++++++++-- file/delete_scheduler.cc | 6 +++++- include/rocksdb/file_system.h | 7 ++++++- 8 files changed, 93 insertions(+), 22 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 3ede0b544..2cb3b6201 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -5,13 +5,16 @@ ### Bug Fixes * Fix a bug in io_uring_prep_cancel in AbortIO API for posix which expects sqe->addr to match with read request submitted and wrong paramter was being passed. -* Fixed a regression in iterator performance when the entire DB is a single memtable introduced in #10449. The fix is in #10705 and #10716. +* Fixed a regression in iterator performance when the entire DB is a single memtable introduced in #10449. The fix is in #10705 and #10716. * Fixed an optimistic transaction validation bug caused by DBImpl::GetLatestSequenceForKey() returning non-latest seq for merge (#10724). * Fixed a bug in iterator refresh which could segfault for DeleteRange users (#10739). ### Performance Improvements * Try to align the compaction output file boundaries to the next level ones, which can reduce more than 10% compaction load for the default level compaction. The feature is enabled by default, to disable, set `AdvancedColumnFamilyOptions.level_compaction_dynamic_file_size` to false. As a side effect, it can create SSTs larger than the target_file_size (capped at 2x target_file_size) or smaller files. +### New Features +* Add a new option IOOptions.do_not_recurse that can be used by underlying file systems to skip recursing through sub directories and list only files in GetChildren API. + ## 7.7.0 (09/18/2022) ### Bug Fixes * Fixed a hang when an operation such as `GetLiveFiles` or `CreateNewBackup` is asked to trigger and wait for memtable flush on a read-only DB. Such indirect requests for memtable flush are now ignored on a read-only DB. diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 7cd8c30d6..a88dac2de 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -4403,10 +4403,14 @@ Status DBImpl::CheckConsistency() { } files_by_directory[md.db_path].push_back(fname); } + + IOOptions io_opts; + io_opts.do_not_recurse = true; for (const auto& dir_files : files_by_directory) { std::string directory = dir_files.first; std::vector existing_files; - Status s = env_->GetChildren(directory, &existing_files); + Status s = fs_->GetChildren(directory, io_opts, &existing_files, + /*IODebugContext*=*/nullptr); if (!s.ok()) { corruption_messages += "Can't list files in " + directory + ": " + s.ToString() + "\n"; @@ -4588,8 +4592,12 @@ Status DestroyDB(const std::string& dbname, const Options& options, // Reset the logger because it holds a handle to the // log file and prevents cleanup and directory removal soptions.info_log.reset(); + IOOptions io_opts; // Ignore error in case directory does not exist - env->GetChildren(dbname, &filenames).PermitUncheckedError(); + soptions.fs + ->GetChildren(dbname, io_opts, &filenames, + /*IODebugContext*=*/nullptr) + .PermitUncheckedError(); FileLock* lock; const std::string lockname = LockFileName(dbname); @@ -4629,8 +4637,12 @@ Status DestroyDB(const std::string& dbname, const Options& options, paths.insert(cf_path.path); } } + for (const auto& path : paths) { - if (env->GetChildren(path, &filenames).ok()) { + if (soptions.fs + ->GetChildren(path, io_opts, &filenames, + /*IODebugContext*=*/nullptr) + .ok()) { for (const auto& fname : filenames) { if (ParseFileName(fname, &number, &type) && (type == kTableFile || @@ -4652,7 +4664,11 @@ Status DestroyDB(const std::string& dbname, const Options& options, std::string archivedir = ArchivalDirectory(dbname); bool wal_dir_exists = false; if (!soptions.IsWalDirSameAsDBPath(dbname)) { - wal_dir_exists = env->GetChildren(soptions.wal_dir, &walDirFiles).ok(); + wal_dir_exists = + soptions.fs + ->GetChildren(soptions.wal_dir, io_opts, &walDirFiles, + /*IODebugContext*=*/nullptr) + .ok(); archivedir = ArchivalDirectory(soptions.wal_dir); } @@ -4660,7 +4676,10 @@ Status DestroyDB(const std::string& dbname, const Options& options, // processed and removed before those otherwise we have issues // removing them std::vector archiveFiles; - if (env->GetChildren(archivedir, &archiveFiles).ok()) { + if (soptions.fs + ->GetChildren(archivedir, io_opts, &archiveFiles, + /*IODebugContext*=*/nullptr) + .ok()) { // Delete archival files. for (const auto& file : archiveFiles) { if (ParseFileName(file, &number, &type) && type == kWalFile) { @@ -4801,7 +4820,10 @@ Status DBImpl::DeleteObsoleteOptionsFiles() { // to the oldest. std::map options_filenames; Status s; - s = GetEnv()->GetChildren(GetName(), &filenames); + IOOptions io_opts; + io_opts.do_not_recurse = true; + s = fs_->GetChildren(GetName(), io_opts, &filenames, + /*IODebugContext*=*/nullptr); if (!s.ok()) { return s; } diff --git a/db/db_impl/db_impl_files.cc b/db/db_impl/db_impl_files.cc index da87aa5cd..058df4da7 100644 --- a/db/db_impl/db_impl_files.cc +++ b/db/db_impl/db_impl_files.cc @@ -193,11 +193,14 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, } } + IOOptions io_opts; + io_opts.do_not_recurse = true; for (auto& path : paths) { // set of all files in the directory. We'll exclude files that are still // alive in the subsequent processings. std::vector files; - Status s = env_->GetChildren(path, &files); + Status s = immutable_db_options_.fs->GetChildren( + path, io_opts, &files, /*IODebugContext*=*/nullptr); s.PermitUncheckedError(); // TODO: What should we do on error? for (const std::string& file : files) { uint64_t number; @@ -223,7 +226,9 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, // Add log files in wal_dir if (!immutable_db_options_.IsWalDirSameAsDBPath(dbname_)) { std::vector log_files; - Status s = env_->GetChildren(immutable_db_options_.wal_dir, &log_files); + Status s = immutable_db_options_.fs->GetChildren( + immutable_db_options_.wal_dir, io_opts, &log_files, + /*IODebugContext*=*/nullptr); s.PermitUncheckedError(); // TODO: What should we do on error? for (const std::string& log_file : log_files) { job_context->full_scan_candidate_files.emplace_back( @@ -235,8 +240,9 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, if (!immutable_db_options_.db_log_dir.empty() && immutable_db_options_.db_log_dir != dbname_) { std::vector info_log_files; - Status s = - env_->GetChildren(immutable_db_options_.db_log_dir, &info_log_files); + Status s = immutable_db_options_.fs->GetChildren( + immutable_db_options_.db_log_dir, io_opts, &info_log_files, + /*IODebugContext*=*/nullptr); s.PermitUncheckedError(); // TODO: What should we do on error? for (std::string& log_file : info_log_files) { job_context->full_scan_candidate_files.emplace_back( diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index 498c92b39..199a066b3 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -163,8 +163,11 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src, // DeleteScheduler::CleanupDirectory on the same dir later, it will be // safe std::vector filenames; + IOOptions io_opts; + io_opts.do_not_recurse = true; auto wal_dir = immutable_db_options.GetWalDir(); - Status s = result.env->GetChildren(wal_dir, &filenames); + Status s = immutable_db_options.fs->GetChildren( + wal_dir, io_opts, &filenames, /*IODebugContext*=*/nullptr); s.PermitUncheckedError(); //**TODO: What to do on error? for (std::string& filename : filenames) { if (filename.find(".log.trash", filename.length() - @@ -432,7 +435,10 @@ Status DBImpl::Recover( s = env_->FileExists(current_fname); } else { s = Status::NotFound(); - Status io_s = env_->GetChildren(dbname_, &files_in_dbname); + IOOptions io_opts; + io_opts.do_not_recurse = true; + Status io_s = immutable_db_options_.fs->GetChildren( + dbname_, io_opts, &files_in_dbname, /*IODebugContext*=*/nullptr); if (!io_s.ok()) { s = io_s; files_in_dbname.clear(); @@ -498,7 +504,10 @@ Status DBImpl::Recover( } } else if (immutable_db_options_.best_efforts_recovery) { assert(files_in_dbname.empty()); - Status s = env_->GetChildren(dbname_, &files_in_dbname); + IOOptions io_opts; + io_opts.do_not_recurse = true; + Status s = immutable_db_options_.fs->GetChildren( + dbname_, io_opts, &files_in_dbname, /*IODebugContext*=*/nullptr); if (s.IsNotFound()) { return Status::InvalidArgument(dbname_, "does not exist (open for read only)"); @@ -570,7 +579,10 @@ Status DBImpl::Recover( // produced by an older version of rocksdb. auto wal_dir = immutable_db_options_.GetWalDir(); if (!immutable_db_options_.best_efforts_recovery) { - s = env_->GetChildren(wal_dir, &files_in_wal_dir); + IOOptions io_opts; + io_opts.do_not_recurse = true; + s = immutable_db_options_.fs->GetChildren( + wal_dir, io_opts, &files_in_wal_dir, /*IODebugContext*=*/nullptr); } if (s.IsNotFound()) { return Status::InvalidArgument("wal_dir not found", wal_dir); @@ -678,7 +690,10 @@ Status DBImpl::Recover( } else if (normalized_dbname == normalized_wal_dir) { filenames = std::move(files_in_wal_dir); } else { - s = env_->GetChildren(GetName(), &filenames); + IOOptions io_opts; + io_opts.do_not_recurse = true; + s = immutable_db_options_.fs->GetChildren( + GetName(), io_opts, &filenames, /*IODebugContext*=*/nullptr); } } if (s.ok()) { @@ -2022,9 +2037,13 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, // Remove duplicate paths. std::sort(paths.begin(), paths.end()); paths.erase(std::unique(paths.begin(), paths.end()), paths.end()); + IOOptions io_opts; + io_opts.do_not_recurse = true; for (auto& path : paths) { std::vector existing_files; - impl->immutable_db_options_.env->GetChildren(path, &existing_files) + impl->immutable_db_options_.fs + ->GetChildren(path, io_opts, &existing_files, + /*IODebugContext*=*/nullptr) .PermitUncheckedError(); //**TODO: What do to on error? for (auto& file_name : existing_files) { uint64_t file_number; diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc index 40e85ee85..f009352c2 100644 --- a/db/db_impl/db_impl_secondary.cc +++ b/db/db_impl/db_impl_secondary.cc @@ -100,7 +100,11 @@ Status DBImplSecondary::FindNewLogNumbers(std::vector* logs) { assert(logs != nullptr); std::vector filenames; Status s; - s = env_->GetChildren(immutable_db_options_.GetWalDir(), &filenames); + IOOptions io_opts; + io_opts.do_not_recurse = true; + s = immutable_db_options_.fs->GetChildren(immutable_db_options_.GetWalDir(), + io_opts, &filenames, + /*IODebugContext*=*/nullptr); if (s.IsNotFound()) { return Status::InvalidArgument("Failed to open wal_dir", immutable_db_options_.GetWalDir()); diff --git a/env/fs_posix.cc b/env/fs_posix.cc index a6f7b9c08..e179a421d 100644 --- a/env/fs_posix.cc +++ b/env/fs_posix.cc @@ -575,7 +575,7 @@ class PosixFileSystem : public FileSystem { } } - IOStatus GetChildren(const std::string& dir, const IOOptions& /*opts*/, + IOStatus GetChildren(const std::string& dir, const IOOptions& opts, std::vector* result, IODebugContext* /*dbg*/) override { result->clear(); @@ -595,12 +595,20 @@ class PosixFileSystem : public FileSystem { // reset errno before calling readdir() errno = 0; struct dirent* entry; + while ((entry = readdir(d)) != nullptr) { // filter out '.' and '..' directory entries // which appear only on some platforms const bool ignore = entry->d_type == DT_DIR && - (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0); + (strcmp(entry->d_name, ".") == 0 || + strcmp(entry->d_name, "..") == 0 +#ifndef ASSERT_STATUS_CHECKED + // In case of ASSERT_STATUS_CHECKED, GetChildren support older + // version of API for debugging purpose. + || opts.do_not_recurse +#endif + ); if (!ignore) { result->push_back(entry->d_name); } diff --git a/file/delete_scheduler.cc b/file/delete_scheduler.cc index 4933c15b5..300bf0f8f 100644 --- a/file/delete_scheduler.cc +++ b/file/delete_scheduler.cc @@ -141,7 +141,11 @@ Status DeleteScheduler::CleanupDirectory(Env* env, SstFileManagerImpl* sfm, Status s; // Check if there are any files marked as trash in this path std::vector files_in_path; - s = env->GetChildren(path, &files_in_path); + const auto& fs = env->GetFileSystem(); + IOOptions io_opts; + io_opts.do_not_recurse = true; + s = fs->GetChildren(path, io_opts, &files_in_path, + /*IODebugContext*=*/nullptr); if (!s.ok()) { return s; } diff --git a/include/rocksdb/file_system.h b/include/rocksdb/file_system.h index ee8362eab..547faa5a4 100644 --- a/include/rocksdb/file_system.h +++ b/include/rocksdb/file_system.h @@ -112,6 +112,10 @@ struct IOOptions { // fsync, set this to force the fsync bool force_dir_fsync; + // Can be used by underlying file systems to skip recursing through sub + // directories and list only files in GetChildren API. + bool do_not_recurse; + IOOptions() : IOOptions(false) {} explicit IOOptions(bool force_dir_fsync_) @@ -119,7 +123,8 @@ struct IOOptions { prio(IOPriority::kIOLow), rate_limiter_priority(Env::IO_TOTAL), type(IOType::kUnknown), - force_dir_fsync(force_dir_fsync_) {} + force_dir_fsync(force_dir_fsync_), + do_not_recurse(false) {} }; struct DirFsyncOptions {