Improve CPU Efficiency of ApproximateSize (part 2) (#5609)

Summary: In some cases we don't need a really accurate number; something like 10% off is fine, and we can create a new option for that use case. In this case, we can calculate the size of the full files first, and avoid estimation inside SST files if the full files already gave us a huge number. For example, if we have already covered 100GB of data, we should be able to skip partial dives into 10 SST files of 30MB. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5609 Differential Revision: D16433481 Pulled By: elipoz fbshipit-source-id: 5830b31e1c656d0fd3a00d7fd2678ddc8f6e601b
6 years ago · 4834dab578
parent b538e756c2
commit 4834dab578
7 changed files with 208 additions and 75 deletions
--- a/HISTORY.md
+++ b/HISTORY.md
@ -22,6 +22,7 @@
 * Add argument `--secondary_path` to ldb to open the database as the secondary instance. This would keep the original DB intact.
 * Compression dictionary blocks are now prefetched and pinned in the cache (based on the customer's settings) the same way as index and filter blocks.
 * Added DBOptions::log_readahead_size which specifies the number of bytes to prefetch when reading the log. This is mostly useful for reading a remotely located log, as it can save the number of round-trips. If 0 (default), then the prefetching is disabled.
 * Added a new option, `files_size_error_margin`, to SizeApproximationOptions (used with DB::GetApproximateSizes). When approximating the total size of the files that store a key range, allow approximation with an error margin of up to total_files_size * files_size_error_margin. This permits some shortcuts in file size approximation, resulting in better performance, while guaranteeing the resulting error is within a reasonable margin.
 ### Performance Improvements
 * Reduce iterator key comparison for upper/lower bound check.
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@ -520,7 +520,8 @@ void CompactionJob::GenSubcompactionBoundaries() {
    // to the index block and may incur I/O cost in the process. Unlock db
    // mutex to reduce contention
    db_mutex_->Unlock();
-    uint64_t size = versions_->ApproximateSize(v, a, b, start_lvl, out_lvl + 1,
+    uint64_t size = versions_->ApproximateSize(SizeApproximationOptions(), v, a,
                                               b, start_lvl, out_lvl + 1,
                                               TableReaderCaller::kCompaction);
    db_mutex_->Lock();
    ranges.emplace_back(a, b, size);
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@ -2808,8 +2808,8 @@ Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options,
    sizes[i] = 0;
    if (options.include_files) {
      sizes[i] += versions_->ApproximateSize(
-          v, k1.Encode(), k2.Encode(), /*start_level=*/0, /*end_level=*/-1,
+          options, v, k1.Encode(), k2.Encode(), /*start_level=*/0,
-          TableReaderCaller::kUserApproximateSize);
+          /*end_level=*/-1, TableReaderCaller::kUserApproximateSize);
    }
    if (options.include_memtabtles) {
      sizes[i] += sv->mem->ApproximateStats(k1.Encode(), k2.Encode()).size;
--- a/db/db_test.cc
+++ b/db/db_test.cc
@ -1257,6 +1257,7 @@ TEST_F(DBTest, ApproximateSizesMemTable) {
  options.compression = kNoCompression;
  options.create_if_missing = true;
  DestroyAndReopen(options);
  auto default_cf = db_->DefaultColumnFamily();
  const int N = 128;
  Random rnd(301);
@ -1268,9 +1269,10 @@ TEST_F(DBTest, ApproximateSizesMemTable) {
  std::string start = Key(50);
  std::string end = Key(60);
  Range r(start, end);
-  uint8_t include_both = DB::SizeApproximationFlags::INCLUDE_FILES |
+  SizeApproximationOptions size_approx_options;
-                         DB::SizeApproximationFlags::INCLUDE_MEMTABLES;
+  size_approx_options.include_memtabtles = true;
-  db_->GetApproximateSizes(&r, 1, &size, include_both);
+  size_approx_options.include_files = true;
  db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
  ASSERT_GT(size, 6000);
  ASSERT_LT(size, 204800);
  // Zero if not including mem table
@ -1280,7 +1282,7 @@ TEST_F(DBTest, ApproximateSizesMemTable) {
  start = Key(500);
  end = Key(600);
  r = Range(start, end);
-  db_->GetApproximateSizes(&r, 1, &size, include_both);
+  db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
  ASSERT_EQ(size, 0);
  for (int i = 0; i < N; i++) {
@ -1290,19 +1292,20 @@ TEST_F(DBTest, ApproximateSizesMemTable) {
  start = Key(500);
  end = Key(600);
  r = Range(start, end);
-  db_->GetApproximateSizes(&r, 1, &size, include_both);
+  db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
  ASSERT_EQ(size, 0);
  start = Key(100);
  end = Key(1020);
  r = Range(start, end);
-  db_->GetApproximateSizes(&r, 1, &size, include_both);
+  db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
  ASSERT_GT(size, 6000);
  options.max_write_buffer_number = 8;
  options.min_write_buffer_number_to_merge = 5;
  options.write_buffer_size = 1024 * N;  // Not very large
  DestroyAndReopen(options);
  default_cf = db_->DefaultColumnFamily();
  int keys[N * 3];
  for (int i = 0; i < N; i++) {
@ -1319,26 +1322,27 @@ TEST_F(DBTest, ApproximateSizesMemTable) {
  start = Key(100);
  end = Key(300);
  r = Range(start, end);
-  db_->GetApproximateSizes(&r, 1, &size, include_both);
+  db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
  ASSERT_EQ(size, 0);
  start = Key(1050);
  end = Key(1080);
  r = Range(start, end);
-  db_->GetApproximateSizes(&r, 1, &size, include_both);
+  db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
  ASSERT_GT(size, 6000);
  start = Key(2100);
  end = Key(2300);
  r = Range(start, end);
-  db_->GetApproximateSizes(&r, 1, &size, include_both);
+  db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
  ASSERT_EQ(size, 0);
  start = Key(1050);
  end = Key(1080);
  r = Range(start, end);
  uint64_t size_with_mt, size_without_mt;
-  db_->GetApproximateSizes(&r, 1, &size_with_mt, include_both);
+  db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1,
                           &size_with_mt);
  ASSERT_GT(size_with_mt, 6000);
  db_->GetApproximateSizes(&r, 1, &size_without_mt);
  ASSERT_EQ(size_without_mt, 0);
@ -1352,10 +1356,80 @@ TEST_F(DBTest, ApproximateSizesMemTable) {
  start = Key(1050);
  end = Key(1080);
  r = Range(start, end);
-  db_->GetApproximateSizes(&r, 1, &size_with_mt, include_both);
+  db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1,
                           &size_with_mt);
  db_->GetApproximateSizes(&r, 1, &size_without_mt);
  ASSERT_GT(size_with_mt, size_without_mt);
  ASSERT_GT(size_without_mt, 6000);
  // Check that include_memtabtles flag works as expected
  size_approx_options.include_memtabtles = false;
  db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
  ASSERT_EQ(size, size_without_mt);
  // Check that files_size_error_margin works as expected, when the heuristic
  // conditions are not met
  start = Key(1);
  end = Key(1000 + N - 2);
  r = Range(start, end);
  size_approx_options.files_size_error_margin = -1.0;  // disabled
  db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
  uint64_t size2;
  size_approx_options.files_size_error_margin = 0.5;  // enabled, but not used
  db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2);
  ASSERT_EQ(size, size2);
 }
 // Verifies that enabling SizeApproximationOptions::files_size_error_margin
 // keeps the file-size estimate close to the estimate computed with the
 // heuristic disabled: with a margin of 0.2, the approximated size must lie
 // strictly between (1 - 0.2) and (1 + 0.2) times the baseline size.
 TEST_F(DBTest, ApproximateSizesFilesWithErrorMargin) {
  Options options = CurrentOptions();
  options.write_buffer_size = 1024 * 1024;
  // NOTE(review): compression is presumably disabled so that on-disk sizes
  // track the size of the written values -- confirm.
  options.compression = kNoCompression;
  options.create_if_missing = true;
  options.target_file_size_base = 1024 * 1024;
  DestroyAndReopen(options);
  const auto default_cf = db_->DefaultColumnFamily();
  const int N = 64000;
  Random rnd(301);
  // First batch: keys Key(0) .. Key(N - 1)
  for (int i = 0; i < N; i++) {
    ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
  }
  // Flush everything to files
  Flush();
  // Compact the entire key space into the next level
  db_->CompactRange(CompactRangeOptions(), default_cf, nullptr, nullptr);
  // Write more keys
  // Second batch: keys Key(N) .. Key(N + N / 4 - 1)
  for (int i = N; i < (N + N / 4); i++) {
    ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
  }
  // Flush everything to files again
  Flush();
  // Wait for compaction to finish
  ASSERT_OK(dbfull()->TEST_WaitForCompact());
  // Query range from Key(0) to Key(2 * N); the largest index written above
  // is N + N / 4 - 1, which is below 2 * N.
  const std::string start = Key(0);
  const std::string end = Key(2 * N);
  const Range r(start, end);
  // Only files are measured; memtables are excluded from both estimates
  SizeApproximationOptions size_approx_options;
  size_approx_options.include_memtabtles = false;
  size_approx_options.include_files = true;
  size_approx_options.files_size_error_margin = -1.0;  // disabled
  // Get the precise size without any approximation heuristic
  uint64_t size;
  db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
  ASSERT_NE(size, 0);
  // Get the size with an approximation heuristic
  uint64_t size2;
  const double error_margin = 0.2;
  size_approx_options.files_size_error_margin = error_margin;
  db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2);
  // The heuristic result must stay within the requested relative margin of
  // the baseline computed above.
  ASSERT_LT(size2, size * (1 + error_margin));
  ASSERT_GT(size2, size * (1 - error_margin));
 }
 TEST_F(DBTest, GetApproximateMemTableStats) {
--- a/db/version_set.cc
+++ b/db/version_set.cc
@ -4872,84 +4872,134 @@ Status VersionSet::WriteSnapshot(log::Writer* log) {
 // (a,b) then (b,c) then (c,d). Knowing this, an optimization is possible where
 // we avoid doing binary search for the keys b and c twice and instead somehow
 // maintain state of where they first appear in the files.
-uint64_t VersionSet::ApproximateSize(Version* v, const Slice& start,
+uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options,
                                     Version* v, const Slice& start,
                                     const Slice& end, int start_level,
                                     int end_level, TableReaderCaller caller) {
  const auto& icmp = v->cfd_->internal_comparator();
  // pre-condition
-  assert(v->cfd_->internal_comparator().Compare(start, end) <= 0);
+  assert(icmp.Compare(start, end) <= 0);
-  uint64_t size = 0;
+  uint64_t total_full_size = 0;
  const auto* vstorage = v->storage_info();
-  end_level = end_level == -1
+  const int num_non_empty_levels = vstorage->num_non_empty_levels();
-                  ? vstorage->num_non_empty_levels()
+  end_level = (end_level == -1) ? num_non_empty_levels
-                  : std::min(end_level, vstorage->num_non_empty_levels());
+                                : std::min(end_level, num_non_empty_levels);
  assert(start_level <= end_level);
-  for (int level = start_level; level < end_level; level++) {
+  // Outline of the optimization that uses options.files_size_error_margin.
  // When approximating the total size of the files that store a key range,
  // we first sum up the sizes of the files that fully fall into the range.
  // Then we sum up the sizes of all the files that may intersect with the range
  // (this includes all files in L0 as well). Then, if total_intersecting_size
  // is smaller than total_full_size * options.files_size_error_margin, we can
  // infer that the intersecting files make a negligible contribution to the
  // total size, and we can approximate the storage required for the keys in
  // range as total_full_size plus half of total_intersecting_size.
  // E.g., if the value of files_size_error_margin is 0.1, then the error of the
  // approximation is limited to only ~10% of the total size of files that fully
  // fall into the keys range. In such case, this helps to avoid a costly
  // process of binary searching the intersecting files that is required only
  // for a more precise calculation of the total size.
  autovector<FdWithKeyRange*, 32> first_files;
  autovector<FdWithKeyRange*, 16> last_files;
  // scan all the levels
  for (int level = start_level; level < end_level; ++level) {
    const LevelFilesBrief& files_brief = vstorage->LevelFilesBrief(level);
-    if (!files_brief.num_files) {
+    if (files_brief.num_files == 0) {
      // empty level, skip exploration
      continue;
    }
-    if (!level) {
+    if (level == 0) {
-      // level 0 data is sorted order, handle the use case explicitly
+      // level 0 files are not in sorted order, we need to iterate through
-      size += ApproximateSizeLevel0(v, files_brief, start, end, caller);
+      // the list to compute the total bytes that require scanning,
      // so handle the case explicitly (similarly to first_files case)
      for (size_t i = 0; i < files_brief.num_files; i++) {
        first_files.push_back(&files_brief.files[i]);
      }
      continue;
    }
    assert(level > 0);
    assert(files_brief.num_files > 0);
-    // identify the file position for starting key
+    // identify the file position for start key
-    const uint64_t idx_start = FindFileInRange(
+    const int idx_start =
-        v->cfd_->internal_comparator(), files_brief, start,
+        FindFileInRange(icmp, files_brief, start, 0,
-        /*start=*/0, static_cast<uint32_t>(files_brief.num_files - 1));
+                        static_cast<uint32_t>(files_brief.num_files - 1));
-    assert(idx_start < files_brief.num_files);
+    assert(static_cast<size_t>(idx_start) < files_brief.num_files);
    // scan all files from the starting position until the ending position
    // inferred from the sorted order
    for (uint64_t i = idx_start; i < files_brief.num_files; i++) {
      uint64_t val;
      val = ApproximateSize(v, files_brief.files[i], end, caller);
      if (!val) {
        // the files after this will not have the range
        break;
      }
-      size += val;
+    // identify the file position for end key
    int idx_end = idx_start;
    if (icmp.Compare(files_brief.files[idx_end].largest_key, end) < 0) {
      idx_end =
          FindFileInRange(icmp, files_brief, end, idx_start,
                          static_cast<uint32_t>(files_brief.num_files - 1));
    }
    assert(idx_end >= idx_start &&
           static_cast<size_t>(idx_end) < files_brief.num_files);
-      if (i == idx_start) {
+    // scan all files from the starting index to the ending index
-        // subtract the bytes needed to be scanned to get to the starting
+    // (inferred from the sorted order)
-        // key
+
-        val = ApproximateSize(v, files_brief.files[i], start, caller);
+    // first scan all the intermediate full files (excluding first and last)
-        assert(size >= val);
+    for (int i = idx_start + 1; i < idx_end; ++i) {
-        size -= val;
+      uint64_t file_size = files_brief.files[i].fd.GetFileSize();
-      }
+      // The entire file falls into the range, so we can just take its size.
      assert(file_size ==
             ApproximateSize(v, files_brief.files[i], end, caller));
      total_full_size += file_size;
    }
    // save the first and the last files (which may be the same file), so we
    // can scan them later.
    first_files.push_back(&files_brief.files[idx_start]);
    if (idx_start != idx_end) {
      // we need to estimate size for both files, only if they are different
      last_files.push_back(&files_brief.files[idx_end]);
    }
  }
-  return size;
+  // The sum of all file sizes that intersect the [start, end] keys range.
-}
+  uint64_t total_intersecting_size = 0;
  for (const auto* file_ptr : first_files) {
    total_intersecting_size += file_ptr->fd.GetFileSize();
  }
  for (const auto* file_ptr : last_files) {
    total_intersecting_size += file_ptr->fd.GetFileSize();
  }
-uint64_t VersionSet::ApproximateSizeLevel0(Version* v,
+  // Now scan all the first & last files at each level, and estimate their size.
-                                           const LevelFilesBrief& files_brief,
+  // If the total_intersecting_size is less than X% of the total_full_size - we
-                                           const Slice& key_start,
+  // want to approximate the result in order to avoid the costly binary search
-                                           const Slice& key_end,
+  // inside ApproximateSize. We use half of file size as an approximation below.
-                                           TableReaderCaller caller) {
+
-  // level 0 files are not in sorted order, we need to iterate through
+  const double margin = options.files_size_error_margin;
-  // the list to compute the total bytes that require scanning
+  if (margin > 0 && total_intersecting_size <
-  uint64_t size = 0;
+                        static_cast<uint64_t>(total_full_size * margin)) {
-  for (size_t i = 0; i < files_brief.num_files; i++) {
+    total_full_size += total_intersecting_size / 2;
-    const uint64_t start =
+  } else {
-        ApproximateSize(v, files_brief.files[i], key_start, caller);
+    // Estimate for all the first files, at each level
-    const uint64_t end =
+    for (const auto file_ptr : first_files) {
-        ApproximateSize(v, files_brief.files[i], key_end, caller);
+      total_full_size += ApproximateSize(v, *file_ptr, end, caller);
-    assert(end >= start);
+      // subtract the bytes needed to be scanned to get to the starting key
-    size += end - start;
+      uint64_t val = ApproximateSize(v, *file_ptr, start, caller);
      assert(total_full_size >= val);
      total_full_size -= val;
    }
    // Estimate for all the last files, at each level
    for (const auto file_ptr : last_files) {
      total_full_size += ApproximateSize(v, *file_ptr, end, caller);
    }
  }
-  return size;
+
  return total_full_size;
 }
 uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f,
@ -4957,12 +5007,13 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f,
                                     TableReaderCaller caller) {
  // pre-condition
  assert(v);
  const auto& icmp = v->cfd_->internal_comparator();
  uint64_t result = 0;
-  if (v->cfd_->internal_comparator().Compare(f.largest_key, key) <= 0) {
+  if (icmp.Compare(f.largest_key, key) <= 0) {
    // Entire file is before "key", so just add the file size
    result = f.fd.GetFileSize();
-  } else if (v->cfd_->internal_comparator().Compare(f.smallest_key, key) > 0) {
+  } else if (icmp.Compare(f.smallest_key, key) > 0) {
    // Entire file is after "key", so ignore
    result = 0;
  } else {
@ -4971,7 +5022,7 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f,
    TableCache* table_cache = v->cfd_->table_cache();
    if (table_cache != nullptr) {
      result = table_cache->ApproximateOffsetOf(
-          key, f.file_metadata->fd, caller, v->cfd()->internal_comparator(),
+          key, f.file_metadata->fd, caller, icmp,
          v->GetMutableCFOptions().prefix_extractor.get());
    }
  }
--- a/db/version_set.h
+++ b/db/version_set.h
@ -983,7 +983,8 @@ class VersionSet {
  // Return the approximate size of data to be scanned for range [start, end)
  // in levels [start_level, end_level). If end_level == -1 it will search
  // through all non-empty levels
-  uint64_t ApproximateSize(Version* v, const Slice& start, const Slice& end,
+  uint64_t ApproximateSize(const SizeApproximationOptions& options, Version* v,
                           const Slice& start, const Slice& end,
                           int start_level, int end_level,
                           TableReaderCaller caller);
@ -1033,11 +1034,6 @@ class VersionSet {
    }
  };
  // ApproximateSize helper
  uint64_t ApproximateSizeLevel0(Version* v, const LevelFilesBrief& files_brief,
                                 const Slice& start, const Slice& end,
                                 TableReaderCaller caller);
  uint64_t ApproximateSize(Version* v, const FdWithKeyRange& f,
                           const Slice& key, TableReaderCaller caller);
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@ -1514,6 +1514,16 @@ struct SizeApproximationOptions {
  // Defines whether the returned size should include data serialized to disk.
  // If set to false, include_memtabtles must be true.
  bool include_files = true;
  // When approximating the total size of the files that store a key range
  // using DB::GetApproximateSizes, allow approximation with an error margin of
  // up to total_files_size * files_size_error_margin. This permits some
  // shortcuts in file size approximation, resulting in better performance,
  // while guaranteeing the resulting error is within a reasonable margin.
  // E.g., if the value is 0.1, then the error margin of the returned files size
  // approximation will be within 10%.
  // If the value is non-positive - a more precise yet more CPU intensive
  // estimation is performed.
  double files_size_error_margin = -1.0;
 };
 }  // namespace rocksdb