Avoid lots of calls to Env::GetFileSize() in SstFileManagerImpl when opening DB (#6363)

Summary:
Before this PR, DB::Open() calls GetFileSize() once for each sst file in the DB. This can take a long time if there are tens of thousands of sst files (e.g. in thousands of column families), and even longer if Env is talking to some remote service rather than the local filesystem. This PR makes DB::Open() use sst file sizes that are already known from the manifest (typically almost all files in the DB) and only call GetFileSize() for non-sst or obsolete files. Note that GetFileSize() is also called and checked against the manifest in CheckConsistency(), so the calls in SstFileManagerImpl were completely redundant.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6363

Test Plan: deployed to a test cluster, looked at a dump of Env calls (from a custom instrumented Env) - no more thousands of GetFileSize()s.

Differential Revision: D19702509

Pulled By: al13n321

fbshipit-source-id: 99f8110620cb2e9d0c092dfcdbb11f3af4ff8b73
main
Mike Kolupaev 4 years ago committed by Facebook Github Bot
parent 3a073234da
commit 1ed7d9b1b5
  1. 30
      db/db_impl/db_impl_open.cc
  2. 8
      file/sst_file_manager_impl.cc
  3. 5
      file/sst_file_manager_impl.h

@ -1553,6 +1553,27 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
if (s.ok() && sfm) {
// Notify SstFileManager about all sst files that already exist in
// db_paths[0] and cf_paths[0] when the DB is opened.
// SstFileManagerImpl needs to know sizes of the files. For files whose size
// we already know (sst files that appear in manifest - typically that's the
// vast majority of all files), we'll pass the size to SstFileManager.
// For all other files SstFileManager will query the size from filesystem.
std::vector<LiveFileMetaData> metadata;
impl->mutex_.Lock();
impl->versions_->GetLiveFilesMetaData(&metadata);
impl->mutex_.Unlock();
std::unordered_map<std::string, uint64_t> known_file_sizes;
for (const auto& md : metadata) {
std::string name = md.name;
if (!name.empty() && name[0] == '/') {
name = name.substr(1);
}
known_file_sizes[name] = md.size;
}
std::vector<std::string> paths;
paths.emplace_back(impl->immutable_db_options_.db_paths[0].path);
for (auto& cf : column_families) {
@ -1572,7 +1593,14 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
std::string file_path = path + "/" + file_name;
if (ParseFileName(file_name, &file_number, &file_type) &&
file_type == kTableFile) {
sfm->OnAddFile(file_path);
if (known_file_sizes.count(file_name)) {
// We're assuming that each sst file name exists in at most one of
// the paths.
sfm->OnAddFile(file_path, known_file_sizes.at(file_name),
/* compaction */ false);
} else {
sfm->OnAddFile(file_path);
}
}
}
}

@ -71,6 +71,14 @@ Status SstFileManagerImpl::OnAddFile(const std::string& file_path,
return s;
}
// Variant of OnAddFile() that takes the file size from the caller instead of
// looking it up in this function. The arguments are handed to OnAddFileImpl()
// unchanged, and the call always returns Status::OK().
Status SstFileManagerImpl::OnAddFile(const std::string& file_path,
                                     uint64_t file_size, bool compaction) {
  // Acquire mu_ before calling OnAddFileImpl(); the guard stays in scope
  // through the sync point below.
  MutexLock guard(&mu_);
  OnAddFileImpl(file_path, file_size, compaction);
  TEST_SYNC_POINT("SstFileManagerImpl::OnAddFile");
  return Status::OK();
}
Status SstFileManagerImpl::OnDeleteFile(const std::string& file_path) {
{
MutexLock l(&mu_);

@ -38,6 +38,11 @@ class SstFileManagerImpl : public SstFileManager {
// DB will call OnAddFile whenever a new sst file is added.
Status OnAddFile(const std::string& file_path, bool compaction = false);
// Overload where size of the file is provided by the caller rather than
// queried from the filesystem. This is an optimization.
//   file_path:  path of the sst file being added.
//   file_size:  size of the file as already known to the caller (DBImpl::Open
//               passes the size it obtained from the live-file metadata).
//   compaction: forwarded unchanged to OnAddFileImpl. Unlike the overload
//               above, this parameter has no default value.
// The implementation always returns Status::OK().
Status OnAddFile(const std::string& file_path, uint64_t file_size,
bool compaction);
// DB will call OnDeleteFile whenever an sst file is deleted.
Status OnDeleteFile(const std::string& file_path);

Loading…
Cancel
Save