diff --git a/HISTORY.md b/HISTORY.md index 3ede0b544..2cb3b6201 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -5,13 +5,16 @@ ### Bug Fixes * Fix a bug in io_uring_prep_cancel in AbortIO API for posix which expects sqe->addr to match with read request submitted and wrong paramter was being passed. -* Fixed a regression in iterator performance when the entire DB is a single memtable introduced in #10449. The fix is in #10705 and #10716. +* Fixed a regression in iterator performance when the entire DB is a single memtable introduced in #10449. The fix is in #10705 and #10716. * Fixed an optimistic transaction validation bug caused by DBImpl::GetLatestSequenceForKey() returning non-latest seq for merge (#10724). * Fixed a bug in iterator refresh which could segfault for DeleteRange users (#10739). ### Performance Improvements * Try to align the compaction output file boundaries to the next level ones, which can reduce more than 10% compaction load for the default level compaction. The feature is enabled by default, to disable, set `AdvancedColumnFamilyOptions.level_compaction_dynamic_file_size` to false. As a side effect, it can create SSTs larger than the target_file_size (capped at 2x target_file_size) or smaller files. +### New Features +* Add a new option IOOptions.do_not_recurse that can be used by underlying file systems to skip recursing through sub directories and list only files in GetChildren API. + ## 7.7.0 (09/18/2022) ### Bug Fixes * Fixed a hang when an operation such as `GetLiveFiles` or `CreateNewBackup` is asked to trigger and wait for memtable flush on a read-only DB. Such indirect requests for memtable flush are now ignored on a read-only DB. diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 7cd8c30d6..a88dac2de 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -4403,10 +4403,14 @@ Status DBImpl::CheckConsistency() { } files_by_directory[md.db_path].push_back(fname); } + + IOOptions io_opts; + io_opts.do_not_recurse = true; for (const auto& dir_files : files_by_directory) { std::string directory = dir_files.first; std::vector existing_files; - Status s = env_->GetChildren(directory, &existing_files); + Status s = fs_->GetChildren(directory, io_opts, &existing_files, + /*IODebugContext*=*/nullptr); if (!s.ok()) { corruption_messages += "Can't list files in " + directory + ": " + s.ToString() + "\n"; @@ -4588,8 +4592,12 @@ Status DestroyDB(const std::string& dbname, const Options& options, // Reset the logger because it holds a handle to the // log file and prevents cleanup and directory removal soptions.info_log.reset(); + IOOptions io_opts; // Ignore error in case directory does not exist - env->GetChildren(dbname, &filenames).PermitUncheckedError(); + soptions.fs + ->GetChildren(dbname, io_opts, &filenames, + /*IODebugContext*=*/nullptr) + .PermitUncheckedError(); FileLock* lock; const std::string lockname = LockFileName(dbname); @@ -4629,8 +4637,12 @@ Status DestroyDB(const std::string& dbname, const Options& options, paths.insert(cf_path.path); } } + for (const auto& path : paths) { - if (env->GetChildren(path, &filenames).ok()) { + if (soptions.fs + ->GetChildren(path, io_opts, &filenames, + /*IODebugContext*=*/nullptr) + .ok()) { for (const auto& fname : filenames) { if (ParseFileName(fname, &number, &type) && (type == kTableFile || @@ -4652,7 +4664,11 @@ Status DestroyDB(const std::string& dbname, const Options& options, std::string archivedir = ArchivalDirectory(dbname); bool wal_dir_exists = false; if (!soptions.IsWalDirSameAsDBPath(dbname)) { - wal_dir_exists = env->GetChildren(soptions.wal_dir, &walDirFiles).ok(); + wal_dir_exists = + soptions.fs + ->GetChildren(soptions.wal_dir, io_opts, &walDirFiles, + /*IODebugContext*=*/nullptr) + .ok(); archivedir = ArchivalDirectory(soptions.wal_dir); } @@ -4660,7 +4676,10 @@ Status DestroyDB(const std::string& dbname, const Options& options, // processed and removed before those otherwise we have issues // removing them std::vector archiveFiles; - if (env->GetChildren(archivedir, &archiveFiles).ok()) { + if (soptions.fs + ->GetChildren(archivedir, io_opts, &archiveFiles, + /*IODebugContext*=*/nullptr) + .ok()) { // Delete archival files. for (const auto& file : archiveFiles) { if (ParseFileName(file, &number, &type) && type == kWalFile) { @@ -4801,7 +4820,10 @@ Status DBImpl::DeleteObsoleteOptionsFiles() { // to the oldest. std::map options_filenames; Status s; - s = GetEnv()->GetChildren(GetName(), &filenames); + IOOptions io_opts; + io_opts.do_not_recurse = true; + s = fs_->GetChildren(GetName(), io_opts, &filenames, + /*IODebugContext*=*/nullptr); if (!s.ok()) { return s; } diff --git a/db/db_impl/db_impl_files.cc b/db/db_impl/db_impl_files.cc index da87aa5cd..058df4da7 100644 --- a/db/db_impl/db_impl_files.cc +++ b/db/db_impl/db_impl_files.cc @@ -193,11 +193,14 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, } } + IOOptions io_opts; + io_opts.do_not_recurse = true; for (auto& path : paths) { // set of all files in the directory. We'll exclude files that are still // alive in the subsequent processings. std::vector files; - Status s = env_->GetChildren(path, &files); + Status s = immutable_db_options_.fs->GetChildren( + path, io_opts, &files, /*IODebugContext*=*/nullptr); s.PermitUncheckedError(); // TODO: What should we do on error? for (const std::string& file : files) { uint64_t number; @@ -223,7 +226,9 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, // Add log files in wal_dir if (!immutable_db_options_.IsWalDirSameAsDBPath(dbname_)) { std::vector log_files; - Status s = env_->GetChildren(immutable_db_options_.wal_dir, &log_files); + Status s = immutable_db_options_.fs->GetChildren( + immutable_db_options_.wal_dir, io_opts, &log_files, + /*IODebugContext*=*/nullptr); s.PermitUncheckedError(); // TODO: What should we do on error? for (const std::string& log_file : log_files) { job_context->full_scan_candidate_files.emplace_back( @@ -235,8 +240,9 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, if (!immutable_db_options_.db_log_dir.empty() && immutable_db_options_.db_log_dir != dbname_) { std::vector info_log_files; - Status s = - env_->GetChildren(immutable_db_options_.db_log_dir, &info_log_files); + Status s = immutable_db_options_.fs->GetChildren( + immutable_db_options_.db_log_dir, io_opts, &info_log_files, + /*IODebugContext*=*/nullptr); s.PermitUncheckedError(); // TODO: What should we do on error? for (std::string& log_file : info_log_files) { job_context->full_scan_candidate_files.emplace_back( diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index 498c92b39..199a066b3 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -163,8 +163,11 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src, // DeleteScheduler::CleanupDirectory on the same dir later, it will be // safe std::vector filenames; + IOOptions io_opts; + io_opts.do_not_recurse = true; auto wal_dir = immutable_db_options.GetWalDir(); - Status s = result.env->GetChildren(wal_dir, &filenames); + Status s = immutable_db_options.fs->GetChildren( + wal_dir, io_opts, &filenames, /*IODebugContext*=*/nullptr); s.PermitUncheckedError(); //**TODO: What to do on error? for (std::string& filename : filenames) { if (filename.find(".log.trash", filename.length() - @@ -432,7 +435,10 @@ Status DBImpl::Recover( s = env_->FileExists(current_fname); } else { s = Status::NotFound(); - Status io_s = env_->GetChildren(dbname_, &files_in_dbname); + IOOptions io_opts; + io_opts.do_not_recurse = true; + Status io_s = immutable_db_options_.fs->GetChildren( + dbname_, io_opts, &files_in_dbname, /*IODebugContext*=*/nullptr); if (!io_s.ok()) { s = io_s; files_in_dbname.clear(); @@ -498,7 +504,10 @@ Status DBImpl::Recover( } } else if (immutable_db_options_.best_efforts_recovery) { assert(files_in_dbname.empty()); - Status s = env_->GetChildren(dbname_, &files_in_dbname); + IOOptions io_opts; + io_opts.do_not_recurse = true; + Status s = immutable_db_options_.fs->GetChildren( + dbname_, io_opts, &files_in_dbname, /*IODebugContext*=*/nullptr); if (s.IsNotFound()) { return Status::InvalidArgument(dbname_, "does not exist (open for read only)"); @@ -570,7 +579,10 @@ Status DBImpl::Recover( // produced by an older version of rocksdb. auto wal_dir = immutable_db_options_.GetWalDir(); if (!immutable_db_options_.best_efforts_recovery) { - s = env_->GetChildren(wal_dir, &files_in_wal_dir); + IOOptions io_opts; + io_opts.do_not_recurse = true; + s = immutable_db_options_.fs->GetChildren( + wal_dir, io_opts, &files_in_wal_dir, /*IODebugContext*=*/nullptr); } if (s.IsNotFound()) { return Status::InvalidArgument("wal_dir not found", wal_dir); @@ -678,7 +690,10 @@ Status DBImpl::Recover( } else if (normalized_dbname == normalized_wal_dir) { filenames = std::move(files_in_wal_dir); } else { - s = env_->GetChildren(GetName(), &filenames); + IOOptions io_opts; + io_opts.do_not_recurse = true; + s = immutable_db_options_.fs->GetChildren( + GetName(), io_opts, &filenames, /*IODebugContext*=*/nullptr); } } if (s.ok()) { @@ -2022,9 +2037,13 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, // Remove duplicate paths. std::sort(paths.begin(), paths.end()); paths.erase(std::unique(paths.begin(), paths.end()), paths.end()); + IOOptions io_opts; + io_opts.do_not_recurse = true; for (auto& path : paths) { std::vector existing_files; - impl->immutable_db_options_.env->GetChildren(path, &existing_files) + impl->immutable_db_options_.fs + ->GetChildren(path, io_opts, &existing_files, + /*IODebugContext*=*/nullptr) .PermitUncheckedError(); //**TODO: What do to on error? for (auto& file_name : existing_files) { uint64_t file_number; diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc index 40e85ee85..f009352c2 100644 --- a/db/db_impl/db_impl_secondary.cc +++ b/db/db_impl/db_impl_secondary.cc @@ -100,7 +100,11 @@ Status DBImplSecondary::FindNewLogNumbers(std::vector* logs) { assert(logs != nullptr); std::vector filenames; Status s; - s = env_->GetChildren(immutable_db_options_.GetWalDir(), &filenames); + IOOptions io_opts; + io_opts.do_not_recurse = true; + s = immutable_db_options_.fs->GetChildren(immutable_db_options_.GetWalDir(), + io_opts, &filenames, + /*IODebugContext*=*/nullptr); if (s.IsNotFound()) { return Status::InvalidArgument("Failed to open wal_dir", immutable_db_options_.GetWalDir()); diff --git a/env/fs_posix.cc b/env/fs_posix.cc index a6f7b9c08..e179a421d 100644 --- a/env/fs_posix.cc +++ b/env/fs_posix.cc @@ -575,7 +575,7 @@ class PosixFileSystem : public FileSystem { } } - IOStatus GetChildren(const std::string& dir, const IOOptions& /*opts*/, + IOStatus GetChildren(const std::string& dir, const IOOptions& opts, std::vector* result, IODebugContext* /*dbg*/) override { result->clear(); @@ -595,12 +595,20 @@ class PosixFileSystem : public FileSystem { // reset errno before calling readdir() errno = 0; struct dirent* entry; + while ((entry = readdir(d)) != nullptr) { // filter out '.' and '..' directory entries // which appear only on some platforms const bool ignore = entry->d_type == DT_DIR && - (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0); + (strcmp(entry->d_name, ".") == 0 || + strcmp(entry->d_name, "..") == 0 +#ifndef ASSERT_STATUS_CHECKED + // In case of ASSERT_STATUS_CHECKED, GetChildren support older + // version of API for debugging purpose. + || opts.do_not_recurse +#endif + ); if (!ignore) { result->push_back(entry->d_name); } diff --git a/file/delete_scheduler.cc b/file/delete_scheduler.cc index 4933c15b5..300bf0f8f 100644 --- a/file/delete_scheduler.cc +++ b/file/delete_scheduler.cc @@ -141,7 +141,11 @@ Status DeleteScheduler::CleanupDirectory(Env* env, SstFileManagerImpl* sfm, Status s; // Check if there are any files marked as trash in this path std::vector files_in_path; - s = env->GetChildren(path, &files_in_path); + const auto& fs = env->GetFileSystem(); + IOOptions io_opts; + io_opts.do_not_recurse = true; + s = fs->GetChildren(path, io_opts, &files_in_path, + /*IODebugContext*=*/nullptr); if (!s.ok()) { return s; } diff --git a/include/rocksdb/file_system.h b/include/rocksdb/file_system.h index ee8362eab..547faa5a4 100644 --- a/include/rocksdb/file_system.h +++ b/include/rocksdb/file_system.h @@ -112,6 +112,10 @@ struct IOOptions { // fsync, set this to force the fsync bool force_dir_fsync; + // Can be used by underlying file systems to skip recursing through sub + // directories and list only files in GetChildren API. + bool do_not_recurse; + IOOptions() : IOOptions(false) {} explicit IOOptions(bool force_dir_fsync_) @@ -119,7 +123,8 @@ struct IOOptions { prio(IOPriority::kIOLow), rate_limiter_priority(Env::IO_TOTAL), type(IOType::kUnknown), - force_dir_fsync(force_dir_fsync_) {} + force_dir_fsync(force_dir_fsync_), + do_not_recurse(false) {} }; struct DirFsyncOptions {