From 4834dab578114b429163746acbcb93073bb5784f Mon Sep 17 00:00:00 2001
From: Eli Pozniansky
Date: Wed, 31 Jul 2019 08:46:48 -0700
Subject: [PATCH] Improve CPU Efficiency of ApproximateSize (part 2) (#5609)

Summary:
In some cases we don't have to get a really accurate number: something like
10% off is fine, and we can create a new option for that use case. We can then
calculate the size of the fully covered files first, and avoid the estimation
inside SST files if the full files already add up to a huge number. For
example, if we have already covered 100GB of data, we should be able to skip
partial dives into 10 SST files of 30MB each.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5609

Differential Revision: D16433481

Pulled By: elipoz

fbshipit-source-id: 5830b31e1c656d0fd3a00d7fd2678ddc8f6e601b
---
 HISTORY.md                      |   1 +
 db/compaction/compaction_job.cc |   3 +-
 db/db_impl/db_impl.cc           |   4 +-
 db/db_test.cc                   |  96 ++++++++++++++++---
 db/version_set.cc               | 161 +++++++++++++++++++++-----------
 db/version_set.h                |   8 +-
 include/rocksdb/options.h       |  10 ++
 7 files changed, 208 insertions(+), 75 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index 9e057250a..201cef2b1 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -22,6 +22,7 @@
 * Add argument `--secondary_path` to ldb to open the database as the secondary instance. This would keep the original DB intact.
 * Compression dictionary blocks are now prefetched and pinned in the cache (based on the customer's settings) the same way as index and filter blocks.
 * Added DBOptions::log_readahead_size which specifies the number of bytes to prefetch when reading the log. This is mostly useful for reading a remotely located log, as it can save the number of round-trips. If 0 (default), then the prefetching is disabled.
+* Added a new option to SizeApproximationOptions, used with DB::GetApproximateSizes. When approximating the total size of the files used to store a range of keys, allow an error margin of up to total_files_size * files_size_error_margin. This allows taking some shortcuts in the file size approximation, resulting in better performance, while guaranteeing that the resulting error stays within a reasonable margin.

 ### Performance Improvements
 * Reduce iterator key comparision for upper/lower bound check.
diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc
index db701d19d..663c8aa0a 100644
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@@ -520,7 +520,8 @@ void CompactionJob::GenSubcompactionBoundaries() {
       // to the index block and may incur I/O cost in the process.
      // Unlock db mutex to reduce contention
       db_mutex_->Unlock();
-      uint64_t size = versions_->ApproximateSize(v, a, b, start_lvl, out_lvl + 1,
+      uint64_t size = versions_->ApproximateSize(SizeApproximationOptions(), v, a,
+                                                 b, start_lvl, out_lvl + 1,
                                                  TableReaderCaller::kCompaction);
       db_mutex_->Lock();
       ranges.emplace_back(a, b, size);
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index 29b7f6f14..81c44388b 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -2808,8 +2808,8 @@ Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options,
     sizes[i] = 0;
     if (options.include_files) {
       sizes[i] += versions_->ApproximateSize(
-          v, k1.Encode(), k2.Encode(), /*start_level=*/0, /*end_level=*/-1,
-          TableReaderCaller::kUserApproximateSize);
+          options, v, k1.Encode(), k2.Encode(), /*start_level=*/0,
+          /*end_level=*/-1, TableReaderCaller::kUserApproximateSize);
     }
     if (options.include_memtabtles) {
       sizes[i] += sv->mem->ApproximateStats(k1.Encode(), k2.Encode()).size;
diff --git a/db/db_test.cc b/db/db_test.cc
index f247ddb80..f53afa17d 100644
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -1257,6 +1257,7 @@ TEST_F(DBTest, ApproximateSizesMemTable) {
   options.compression = kNoCompression;
   options.create_if_missing = true;
   DestroyAndReopen(options);
+  auto default_cf = db_->DefaultColumnFamily();

   const int N = 128;
   Random rnd(301);
@@ -1268,9 +1269,10 @@ TEST_F(DBTest, ApproximateSizesMemTable) {
   std::string start = Key(50);
   std::string end = Key(60);
   Range r(start, end);
-  uint8_t include_both = DB::SizeApproximationFlags::INCLUDE_FILES |
-                         DB::SizeApproximationFlags::INCLUDE_MEMTABLES;
-  db_->GetApproximateSizes(&r, 1, &size, include_both);
+  SizeApproximationOptions size_approx_options;
+  size_approx_options.include_memtabtles = true;
+  size_approx_options.include_files = true;
+  db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
   ASSERT_GT(size, 6000);
   ASSERT_LT(size, 204800);
   // Zero if not including mem table
@@ -1280,7 +1282,7 @@ TEST_F(DBTest, ApproximateSizesMemTable) {
   start = Key(500);
   end = Key(600);
   r = Range(start, end);
-  db_->GetApproximateSizes(&r, 1, &size, include_both);
+  db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
   ASSERT_EQ(size, 0);

   for (int i = 0; i < N; i++) {
@@ -1290,19 +1292,20 @@
   start = Key(500);
   end = Key(600);
   r = Range(start, end);
-  db_->GetApproximateSizes(&r, 1, &size, include_both);
+  db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
   ASSERT_EQ(size, 0);

   start = Key(100);
   end = Key(1020);
   r = Range(start, end);
-  db_->GetApproximateSizes(&r, 1, &size, include_both);
+  db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
   ASSERT_GT(size, 6000);

   options.max_write_buffer_number = 8;
   options.min_write_buffer_number_to_merge = 5;
   options.write_buffer_size = 1024 * N;  // Not very large
   DestroyAndReopen(options);
+  default_cf = db_->DefaultColumnFamily();

   int keys[N * 3];
   for (int i = 0; i < N; i++) {
@@ -1319,26 +1322,27 @@
   start = Key(100);
   end = Key(300);
   r = Range(start, end);
-  db_->GetApproximateSizes(&r, 1, &size, include_both);
+  db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
   ASSERT_EQ(size, 0);

   start = Key(1050);
   end = Key(1080);
   r = Range(start, end);
-  db_->GetApproximateSizes(&r, 1, &size, include_both);
+  db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
   ASSERT_GT(size, 6000);

   start = Key(2100);
   end = Key(2300);
   r = Range(start, end);
-  db_->GetApproximateSizes(&r, 1, &size, include_both);
+  db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
   ASSERT_EQ(size, 0);

   start = Key(1050);
   end = Key(1080);
   r = Range(start, end);
   uint64_t size_with_mt, size_without_mt;
-  db_->GetApproximateSizes(&r, 1, &size_with_mt, include_both);
+  db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1,
+                           &size_with_mt);
   ASSERT_GT(size_with_mt, 6000);
   db_->GetApproximateSizes(&r, 1, &size_without_mt);
   ASSERT_EQ(size_without_mt, 0);
@@ -1352,10 +1356,80 @@ TEST_F(DBTest, ApproximateSizesMemTable) {
   start = Key(1050);
   end = Key(1080);
   r = Range(start, end);
-  db_->GetApproximateSizes(&r, 1, &size_with_mt, include_both);
+  db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1,
+                           &size_with_mt);
   db_->GetApproximateSizes(&r, 1, &size_without_mt);
   ASSERT_GT(size_with_mt, size_without_mt);
   ASSERT_GT(size_without_mt, 6000);
+
+  // Check that the include_memtabtles flag works as expected
+  size_approx_options.include_memtabtles = false;
+  db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
+  ASSERT_EQ(size, size_without_mt);
+
+  // Check that files_size_error_margin works as expected when the heuristic
+  // conditions are not met
+  start = Key(1);
+  end = Key(1000 + N - 2);
+  r = Range(start, end);
+  size_approx_options.files_size_error_margin = -1.0;  // disabled
+  db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
+  uint64_t size2;
+  size_approx_options.files_size_error_margin = 0.5;  // enabled, but not used
+  db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2);
+  ASSERT_EQ(size, size2);
+}
+
+TEST_F(DBTest, ApproximateSizesFilesWithErrorMargin) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 1024 * 1024;
+  options.compression = kNoCompression;
+  options.create_if_missing = true;
+  options.target_file_size_base = 1024 * 1024;
+  DestroyAndReopen(options);
+  const auto default_cf = db_->DefaultColumnFamily();
+
+  const int N = 64000;
+  Random rnd(301);
+  for (int i = 0; i < N; i++) {
+    ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
+  }
+  // Flush everything to files
+  Flush();
+  // Compact the entire key space into the next level
+  db_->CompactRange(CompactRangeOptions(), default_cf, nullptr, nullptr);
+
+  // Write more keys
+  for (int i = N; i < (N + N / 4); i++) {
+    ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
+  }
+  // Flush everything to files again
+  Flush();
+
+  // Wait for compaction to finish
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  const std::string start = Key(0);
+  const std::string end = Key(2 * N);
+  const Range r(start, end);
+
+  SizeApproximationOptions size_approx_options;
+  size_approx_options.include_memtabtles = false;
+  size_approx_options.include_files = true;
+  size_approx_options.files_size_error_margin = -1.0;  // disabled
+
+  // Get the precise size without any approximation heuristic
+  uint64_t size;
+  db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
+  ASSERT_NE(size, 0);
+
+  // Get the size with an approximation heuristic
+  uint64_t size2;
+  const double error_margin = 0.2;
+  size_approx_options.files_size_error_margin = error_margin;
+  db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2);
+  ASSERT_LT(size2, size * (1 + error_margin));
+  ASSERT_GT(size2, size * (1 - error_margin));
 }

 TEST_F(DBTest, GetApproximateMemTableStats) {
diff --git a/db/version_set.cc b/db/version_set.cc
index 7d477a680..3a1f47790 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -4872,84 +4872,134 @@ Status VersionSet::WriteSnapshot(log::Writer* log) {
 // (a,b) then (b,c) then (c,d). Knowing this, an optimization is possible where
 // we avoid doing binary search for the keys b and c twice and instead somehow
 // maintain state of where they first appear in the files.
-uint64_t VersionSet::ApproximateSize(Version* v, const Slice& start,
+uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options,
+                                     Version* v, const Slice& start,
                                      const Slice& end, int start_level,
                                      int end_level, TableReaderCaller caller) {
+  const auto& icmp = v->cfd_->internal_comparator();
+
   // pre-condition
-  assert(v->cfd_->internal_comparator().Compare(start, end) <= 0);
+  assert(icmp.Compare(start, end) <= 0);

-  uint64_t size = 0;
+  uint64_t total_full_size = 0;
   const auto* vstorage = v->storage_info();
-  end_level = end_level == -1
-                  ? vstorage->num_non_empty_levels()
-                  : std::min(end_level, vstorage->num_non_empty_levels());
+  const int num_non_empty_levels = vstorage->num_non_empty_levels();
+  end_level = (end_level == -1) ? num_non_empty_levels
+                                : std::min(end_level, num_non_empty_levels);

   assert(start_level <= end_level);

-  for (int level = start_level; level < end_level; level++) {
+  // Outline of the optimization that uses options.files_size_error_margin.
+  // When approximating the total size of the files used to store a range of
+  // keys, we first sum up the sizes of the files that fully fall into the
+  // range. Then we sum up the sizes of all the files that may intersect with
+  // the range (this includes all files in L0 as well). If
+  // total_intersecting_size is smaller than
+  // total_full_size * options.files_size_error_margin, we can infer that the
+  // intersecting files have a sufficiently negligible contribution to the
+  // total size, and we can approximate their contribution as half of
+  // total_intersecting_size. E.g., if files_size_error_margin is 0.1, the
+  // error of the approximation is limited to ~10% of the total size of the
+  // files that fully fall into the key range. In such a case, this helps to
+  // avoid the costly binary search inside the intersecting files that is
+  // required only for a more precise calculation of the total size.
+
+  autovector<FdWithKeyRange*> first_files;
+  autovector<FdWithKeyRange*> last_files;
+
+  // scan all the levels
+  for (int level = start_level; level < end_level; ++level) {
     const LevelFilesBrief& files_brief = vstorage->LevelFilesBrief(level);
-    if (!files_brief.num_files) {
+    if (files_brief.num_files == 0) {
       // empty level, skip exploration
       continue;
     }

-    if (!level) {
-      // level 0 data is sorted order, handle the use case explicitly
-      size += ApproximateSizeLevel0(v, files_brief, start, end, caller);
+    if (level == 0) {
+      // level 0 files are not in sorted order, we need to iterate through
+      // the list to compute the total bytes that require scanning,
+      // so handle the case explicitly (similarly to the first_files case)
+      for (size_t i = 0; i < files_brief.num_files; i++) {
+        first_files.push_back(&files_brief.files[i]);
+      }
       continue;
     }

     assert(level > 0);
     assert(files_brief.num_files > 0);

-    // identify the file position for starting key
-    const uint64_t idx_start = FindFileInRange(
-        v->cfd_->internal_comparator(), files_brief, start,
-        /*start=*/0, static_cast<uint32_t>(files_brief.num_files - 1));
-    assert(idx_start < files_brief.num_files);
-
-    // scan all files from the starting position until the ending position
-    // inferred from the sorted order
-    for (uint64_t i = idx_start; i < files_brief.num_files; i++) {
-      uint64_t val;
-      val = ApproximateSize(v, files_brief.files[i], end, caller);
-      if (!val) {
-        // the files after this will not have the range
-        break;
-      }
+    // identify the file position for start key
+    const int idx_start =
+        FindFileInRange(icmp, files_brief, start, 0,
+                        static_cast<uint32_t>(files_brief.num_files - 1));
+    assert(static_cast<size_t>(idx_start) < files_brief.num_files);

-      size += val;
+    // identify the file position for end key
+    int idx_end = idx_start;
+    if (icmp.Compare(files_brief.files[idx_end].largest_key, end) < 0) {
+      idx_end =
+          FindFileInRange(icmp, files_brief, end, idx_start,
+                          static_cast<uint32_t>(files_brief.num_files - 1));
+    }
+    assert(idx_end >= idx_start &&
+           static_cast<size_t>(idx_end) < files_brief.num_files);

-      if (i == idx_start) {
-        // subtract the bytes needed to be scanned to get to the starting
-        // key
-        val = ApproximateSize(v, files_brief.files[i], start, caller);
-        assert(size >= val);
-        size -= val;
-      }
+    // scan all files from the starting index to the ending index
+    // (inferred from the sorted order)
+
+    // first scan all the intermediate full files (excluding the first and last)
+    for (int i = idx_start + 1; i < idx_end; ++i) {
+      uint64_t file_size = files_brief.files[i].fd.GetFileSize();
+      // The entire file falls into the range, so we can just take its size.
+      assert(file_size ==
+             ApproximateSize(v, files_brief.files[i], end, caller));
+      total_full_size += file_size;
+    }
+
+    // save the first and the last files (which may be the same file), so we
+    // can scan them later.
+    first_files.push_back(&files_brief.files[idx_start]);
+    if (idx_start != idx_end) {
+      // we need to estimate the size for both files, but only if they differ
+      last_files.push_back(&files_brief.files[idx_end]);
+    }
   }
-  return size;
-}
+  // The sum of all file sizes that intersect the [start, end] keys range.
+  uint64_t total_intersecting_size = 0;
+  for (const auto* file_ptr : first_files) {
+    total_intersecting_size += file_ptr->fd.GetFileSize();
+  }
+  for (const auto* file_ptr : last_files) {
+    total_intersecting_size += file_ptr->fd.GetFileSize();
+  }

-uint64_t VersionSet::ApproximateSizeLevel0(Version* v,
-                                           const LevelFilesBrief& files_brief,
-                                           const Slice& key_start,
-                                           const Slice& key_end,
-                                           TableReaderCaller caller) {
-  // level 0 files are not in sorted order, we need to iterate through
-  // the list to compute the total bytes that require scanning
-  uint64_t size = 0;
-  for (size_t i = 0; i < files_brief.num_files; i++) {
-    const uint64_t start =
-        ApproximateSize(v, files_brief.files[i], key_start, caller);
-    const uint64_t end =
-        ApproximateSize(v, files_brief.files[i], key_end, caller);
-    assert(end >= start);
-    size += end - start;
+  // Now scan all the first & last files at each level and estimate their size.
+  // If total_intersecting_size is less than X% of total_full_size, we want to
+  // approximate the result in order to avoid the costly binary search inside
+  // ApproximateSize. We use half of the file size as the approximation below.
+
+  const double margin = options.files_size_error_margin;
+  if (margin > 0 && total_intersecting_size <
+                        static_cast<uint64_t>(total_full_size * margin)) {
+    total_full_size += total_intersecting_size / 2;
+  } else {
+    // Estimate for all the first files, at each level
+    for (const auto file_ptr : first_files) {
+      total_full_size += ApproximateSize(v, *file_ptr, end, caller);
+      // subtract the bytes needed to be scanned to get to the starting key
+      uint64_t val = ApproximateSize(v, *file_ptr, start, caller);
+      assert(total_full_size >= val);
+      total_full_size -= val;
+    }
+
+    // Estimate for all the last files, at each level
+    for (const auto file_ptr : last_files) {
+      total_full_size += ApproximateSize(v, *file_ptr, end, caller);
+    }
   }
-  return size;
+
+  return total_full_size;
 }

 uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f,
@@ -4957,12 +5007,13 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f,
                                      TableReaderCaller caller) {
   // pre-condition
   assert(v);
+  const auto& icmp = v->cfd_->internal_comparator();

   uint64_t result = 0;
-  if (v->cfd_->internal_comparator().Compare(f.largest_key, key) <= 0) {
+  if (icmp.Compare(f.largest_key, key) <= 0) {
     // Entire file is before "key", so just add the file size
     result = f.fd.GetFileSize();
-  } else if (v->cfd_->internal_comparator().Compare(f.smallest_key, key) > 0) {
+  } else if (icmp.Compare(f.smallest_key, key) > 0) {
     // Entire file is after "key", so ignore
     result = 0;
   } else {
@@ -4971,7 +5022,7 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f,
     TableCache* table_cache = v->cfd_->table_cache();
     if (table_cache != nullptr) {
       result = table_cache->ApproximateOffsetOf(
-          key, f.file_metadata->fd, caller, v->cfd()->internal_comparator(),
+          key, f.file_metadata->fd, caller, icmp,
           v->GetMutableCFOptions().prefix_extractor.get());
     }
   }
diff --git a/db/version_set.h b/db/version_set.h
index ee94f5966..391bb902c 100644
--- a/db/version_set.h
+++ b/db/version_set.h
@@ -983,7 +983,8 @@ class VersionSet {
   // Return the approximate size of data to be scanned for range [start, end)
   // in levels [start_level, end_level).
  // If end_level == -1 it will search through all non-empty levels
-  uint64_t ApproximateSize(Version* v, const Slice& start, const Slice& end,
+  uint64_t ApproximateSize(const SizeApproximationOptions& options, Version* v,
+                           const Slice& start, const Slice& end,
                            int start_level, int end_level,
                            TableReaderCaller caller);

@@ -1033,11 +1034,6 @@ class VersionSet {
     }
   };

-  // ApproximateSize helper
-  uint64_t ApproximateSizeLevel0(Version* v, const LevelFilesBrief& files_brief,
-                                 const Slice& start, const Slice& end,
-                                 TableReaderCaller caller);
-
   uint64_t ApproximateSize(Version* v, const FdWithKeyRange& f,
                            const Slice& key, TableReaderCaller caller);

diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 5ae010b8f..bda44d441 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1514,6 +1514,16 @@ struct SizeApproximationOptions {
   // Defines whether the returned size should include data serialized to disk.
   // If set to false, include_memtabtles must be true.
   bool include_files = true;
+  // When approximating the total size of the files used to store a range of
+  // keys via DB::GetApproximateSizes, allow an error margin of up to
+  // total_files_size * files_size_error_margin. This allows taking some
+  // shortcuts in the file size approximation, resulting in better performance,
+  // while guaranteeing that the resulting error stays within a reasonable margin.
+  // E.g., if the value is 0.1, then the error margin of the returned file size
+  // approximation will be within 10%.
+  // If the value is non-positive, a more precise yet more CPU-intensive
+  // estimation is performed.
+  double files_size_error_margin = -1.0;
 };

 }  // namespace rocksdb
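Usage sketch (not part of the patch): the snippet below shows how a caller might opt into the new heuristic through the DB::GetApproximateSizes overload exercised in db_test.cc above. The option fields (include_files, include_memtabtles, files_size_error_margin) and the call signature are taken directly from the diff; the helper function name, the already-open rocksdb::DB* handle, and the 0.1 margin are illustrative assumptions only.

#include <cstdint>
#include "rocksdb/db.h"

// Hypothetical helper: estimate the on-disk size of [start, end), accepting
// roughly 10% error in exchange for skipping per-file binary searches.
uint64_t EstimateRangeSizeOnDisk(rocksdb::DB* db, const rocksdb::Slice& start,
                                 const rocksdb::Slice& end) {
  rocksdb::SizeApproximationOptions opts;
  opts.include_files = true;
  opts.include_memtabtles = false;     // field name is spelled this way upstream
  opts.files_size_error_margin = 0.1;  // allow ~10% error for better performance
  rocksdb::Range r(start, end);
  uint64_t size = 0;
  db->GetApproximateSizes(opts, db->DefaultColumnFamily(), &r, 1, &size);
  return size;
}

Leaving files_size_error_margin at its default (-1.0, i.e. non-positive) keeps the previous, more precise but more CPU-intensive estimation path.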