Optimize GetApproximateSizes() to use fewer CPU cycles.

Summary:
CPU profiling reveals GetApproximateSizes() as a performance bottleneck. The current implementation is sub-optimal: it scans every file in every level to compute the result.

We can take advantage of the fact that all levels above 0 are sorted in the increasing order of key ranges and use binary search to locate the starting index. This can reduce the number of comparisons required to compute the result.

Test Plan: We have good test coverage. Run the tests.

Reviewers: sdong, igor, rven, dynamike

Subscribers: dynamike, maykov, dhruba, leveldb

Differential Revision: https://reviews.facebook.net/D37755
main
krad 9 years ago
parent fd96b55402
commit d4540654e9
  1. 10
      db/db_impl.cc
  2. 119
      db/version_set.cc
  3. 12
      db/version_set.h

@ -3587,7 +3587,6 @@ void DBImpl::ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd,
void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family, void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family,
const Range* range, int n, uint64_t* sizes) { const Range* range, int n, uint64_t* sizes) {
// TODO(opt): better implementation
Version* v; Version* v;
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family); auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
auto cfd = cfh->cfd(); auto cfd = cfh->cfd();
@ -3599,12 +3598,9 @@ void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family,
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
// Convert user_key into a corresponding internal key. // Convert user_key into a corresponding internal key.
InternalKey k1, k2; InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek);
k1.SetMaxPossibleForUserKey(range[i].start); InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek);
k2.SetMaxPossibleForUserKey(range[i].limit); sizes[i] = versions_->ApproximateSize(v, k1.Encode(), k2.Encode());
uint64_t start = versions_->ApproximateOffsetOf(v, k1);
uint64_t limit = versions_->ApproximateOffsetOf(v, k2);
sizes[i] = (limit >= start ? limit - start : 0);
} }
{ {

@ -2802,40 +2802,101 @@ bool VersionSet::ManifestContains(uint64_t manifest_file_num,
return result; return result;
} }
uint64_t VersionSet::ApproximateSize(Version* v, const Slice& start,
                                     const Slice& end) {
  // Returns the approximate number of bytes of table data covered by the
  // internal-key range [start, end) in version "v".
  //
  // pre-condition: the range must be well formed
  assert(v->cfd_->internal_comparator().Compare(start, end) <= 0);

  uint64_t size = 0;
  const auto* vstorage = v->storage_info();

  for (int level = 0; level < vstorage->num_non_empty_levels(); level++) {
    const LevelFilesBrief& files_brief = vstorage->LevelFilesBrief(level);
    if (!files_brief.num_files) {
      // empty level, skip exploration
      continue;
    }

    if (!level) {
      // level-0 files may overlap and are not sorted by key range, so they
      // cannot be binary-searched; handle them with an explicit full scan
      size += ApproximateSizeLevel0(v, files_brief, start, end);
      continue;
    }

    assert(level > 0);
    assert(files_brief.num_files > 0);

    // Levels above 0 are sorted in increasing key-range order: binary-search
    // for the first file that may contain the starting key.
    const uint64_t idx_start =
        FindFileInRange(v->cfd_->internal_comparator(), files_brief, start,
                        /*start=*/0, files_brief.num_files - 1);
    assert(idx_start < files_brief.num_files);

    // Scan files from the starting position until the ending position
    // inferred from the sorted order.
    for (uint64_t i = idx_start; i < files_brief.num_files; i++) {
      uint64_t val = ApproximateSize(v, files_brief.files[i], end);
      if (!val) {
        // this file lies entirely after "end"; by the sorted order, every
        // later file in this level does too
        break;
      }
      size += val;

      if (i == idx_start) {
        // subtract the bytes that precede the starting key within the
        // first overlapping file
        val = ApproximateSize(v, files_brief.files[i], start);
        assert(size >= val);
        size -= val;
      }
    }
  }

  return size;
}
uint64_t VersionSet::ApproximateSizeLevel0(Version* v,
                                           const LevelFilesBrief& files_brief,
                                           const Slice& key_start,
                                           const Slice& key_end) {
  // Level-0 files are not sorted by key range, so every file in the level
  // must be inspected to estimate the bytes the range would scan.
  uint64_t total = 0;
  for (size_t idx = 0; idx < files_brief.num_files; idx++) {
    const FdWithKeyRange& file = files_brief.files[idx];
    const uint64_t offset_start = ApproximateSize(v, file, key_start);
    const uint64_t offset_end = ApproximateSize(v, file, key_end);
    assert(offset_end >= offset_start);
    total += offset_end - offset_start;
  }
  return total;
}
uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f,
                                     const Slice& key) {
  // Returns the approximate number of bytes in file "f" that precede "key":
  //   - the whole file size if the file lies entirely before "key",
  //   - 0 if the file lies entirely after "key",
  //   - otherwise the approximate offset of "key" within the table.
  // pre-condition
  assert(v);

  uint64_t result = 0;
  if (v->cfd_->internal_comparator().Compare(f.largest_key, key) <= 0) {
    // Entire file is before "key", so just add the file size
    result = f.fd.GetFileSize();
  } else if (v->cfd_->internal_comparator().Compare(f.smallest_key, key) > 0) {
    // Entire file is after "key", so ignore
    result = 0;
  } else {
    // "key" falls in the range for this table. Add the
    // approximate offset of "key" within the table.
    TableReader* table_reader_ptr;
    Iterator* iter = v->cfd_->table_cache()->NewIterator(
        ReadOptions(), env_options_, v->cfd_->internal_comparator(), f.fd,
        &table_reader_ptr);
    if (table_reader_ptr != nullptr) {
      result = table_reader_ptr->ApproximateOffsetOf(key);
    }
    // NOTE(review): iter is only created to populate table_reader_ptr via the
    // table cache; it is deleted immediately, never iterated.
    delete iter;
  }
  return result;
}

@ -618,9 +618,8 @@ class VersionSet {
// Add all files listed in any live version to *live. // Add all files listed in any live version to *live.
void AddLiveFiles(std::vector<FileDescriptor>* live_list); void AddLiveFiles(std::vector<FileDescriptor>* live_list);
// Return the approximate offset in the database of the data for // Return the approximate size of data to be scanned for range [start, end)
// "key" as of version "v". uint64_t ApproximateSize(Version* v, const Slice& start, const Slice& end);
uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key);
// Return the size of the current manifest file // Return the size of the current manifest file
uint64_t manifest_file_size() const { return manifest_file_size_; } uint64_t manifest_file_size() const { return manifest_file_size_; }
@ -657,6 +656,13 @@ class VersionSet {
} }
}; };
// ApproximateSize helper
uint64_t ApproximateSizeLevel0(Version* v, const LevelFilesBrief& files_brief,
const Slice& start, const Slice& end);
uint64_t ApproximateSize(Version* v, const FdWithKeyRange& f,
const Slice& key);
// Save current contents to *log // Save current contents to *log
Status WriteSnapshot(log::Writer* log); Status WriteSnapshot(log::Writer* log);

Loading…
Cancel
Save