Improve CPU Efficiency of ApproximateSize (part 2) (#5609)

Summary:
In some cases we don't need a really accurate number; something like 10% off is fine, so we create a new option for that use case. With this option, we can calculate the size of the files that fall fully inside the range first, and avoid estimation inside the partially covered SST files if the full files already account for a huge number. For example, if we have already covered 100GB of data, we should be able to skip partial dives into 10 SST files of 30MB each.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5609

Differential Revision: D16433481

Pulled By: elipoz

fbshipit-source-id: 5830b31e1c656d0fd3a00d7fd2678ddc8f6e601b
main
Eli Pozniansky 6 years ago committed by Facebook Github Bot
parent b538e756c2
commit 4834dab578
  1. 1
      HISTORY.md
  2. 3
      db/compaction/compaction_job.cc
  3. 4
      db/db_impl/db_impl.cc
  4. 96
      db/db_test.cc
  5. 161
      db/version_set.cc
  6. 8
      db/version_set.h
  7. 10
      include/rocksdb/options.h

@ -22,6 +22,7 @@
* Add argument `--secondary_path` to ldb to open the database as the secondary instance. This would keep the original DB intact. * Add argument `--secondary_path` to ldb to open the database as the secondary instance. This would keep the original DB intact.
* Compression dictionary blocks are now prefetched and pinned in the cache (based on the customer's settings) the same way as index and filter blocks. * Compression dictionary blocks are now prefetched and pinned in the cache (based on the customer's settings) the same way as index and filter blocks.
* Added DBOptions::log_readahead_size which specifies the number of bytes to prefetch when reading the log. This is mostly useful for reading a remotely located log, as it can save the number of round-trips. If 0 (default), then the prefetching is disabled. * Added DBOptions::log_readahead_size which specifies the number of bytes to prefetch when reading the log. This is mostly useful for reading a remotely located log, as it can save the number of round-trips. If 0 (default), then the prefetching is disabled.
* Added new option `files_size_error_margin` in SizeApproximationOptions, used with DB::GetApproximateSizes. When approximating the total size of the files used to store a key range, allow approximation with an error margin of up to total_files_size * files_size_error_margin. This allows taking some shortcuts in the file size approximation, resulting in better performance, while guaranteeing the resulting error is within a reasonable margin.
### Performance Improvements ### Performance Improvements
* Reduce iterator key comparison for upper/lower bound check. * Reduce iterator key comparison for upper/lower bound check.

@ -520,7 +520,8 @@ void CompactionJob::GenSubcompactionBoundaries() {
// to the index block and may incur I/O cost in the process. Unlock db // to the index block and may incur I/O cost in the process. Unlock db
// mutex to reduce contention // mutex to reduce contention
db_mutex_->Unlock(); db_mutex_->Unlock();
uint64_t size = versions_->ApproximateSize(v, a, b, start_lvl, out_lvl + 1, uint64_t size = versions_->ApproximateSize(SizeApproximationOptions(), v, a,
b, start_lvl, out_lvl + 1,
TableReaderCaller::kCompaction); TableReaderCaller::kCompaction);
db_mutex_->Lock(); db_mutex_->Lock();
ranges.emplace_back(a, b, size); ranges.emplace_back(a, b, size);

@ -2808,8 +2808,8 @@ Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options,
sizes[i] = 0; sizes[i] = 0;
if (options.include_files) { if (options.include_files) {
sizes[i] += versions_->ApproximateSize( sizes[i] += versions_->ApproximateSize(
v, k1.Encode(), k2.Encode(), /*start_level=*/0, /*end_level=*/-1, options, v, k1.Encode(), k2.Encode(), /*start_level=*/0,
TableReaderCaller::kUserApproximateSize); /*end_level=*/-1, TableReaderCaller::kUserApproximateSize);
} }
if (options.include_memtabtles) { if (options.include_memtabtles) {
sizes[i] += sv->mem->ApproximateStats(k1.Encode(), k2.Encode()).size; sizes[i] += sv->mem->ApproximateStats(k1.Encode(), k2.Encode()).size;

@ -1257,6 +1257,7 @@ TEST_F(DBTest, ApproximateSizesMemTable) {
options.compression = kNoCompression; options.compression = kNoCompression;
options.create_if_missing = true; options.create_if_missing = true;
DestroyAndReopen(options); DestroyAndReopen(options);
auto default_cf = db_->DefaultColumnFamily();
const int N = 128; const int N = 128;
Random rnd(301); Random rnd(301);
@ -1268,9 +1269,10 @@ TEST_F(DBTest, ApproximateSizesMemTable) {
std::string start = Key(50); std::string start = Key(50);
std::string end = Key(60); std::string end = Key(60);
Range r(start, end); Range r(start, end);
uint8_t include_both = DB::SizeApproximationFlags::INCLUDE_FILES | SizeApproximationOptions size_approx_options;
DB::SizeApproximationFlags::INCLUDE_MEMTABLES; size_approx_options.include_memtabtles = true;
db_->GetApproximateSizes(&r, 1, &size, include_both); size_approx_options.include_files = true;
db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
ASSERT_GT(size, 6000); ASSERT_GT(size, 6000);
ASSERT_LT(size, 204800); ASSERT_LT(size, 204800);
// Zero if not including mem table // Zero if not including mem table
@ -1280,7 +1282,7 @@ TEST_F(DBTest, ApproximateSizesMemTable) {
start = Key(500); start = Key(500);
end = Key(600); end = Key(600);
r = Range(start, end); r = Range(start, end);
db_->GetApproximateSizes(&r, 1, &size, include_both); db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
ASSERT_EQ(size, 0); ASSERT_EQ(size, 0);
for (int i = 0; i < N; i++) { for (int i = 0; i < N; i++) {
@ -1290,19 +1292,20 @@ TEST_F(DBTest, ApproximateSizesMemTable) {
start = Key(500); start = Key(500);
end = Key(600); end = Key(600);
r = Range(start, end); r = Range(start, end);
db_->GetApproximateSizes(&r, 1, &size, include_both); db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
ASSERT_EQ(size, 0); ASSERT_EQ(size, 0);
start = Key(100); start = Key(100);
end = Key(1020); end = Key(1020);
r = Range(start, end); r = Range(start, end);
db_->GetApproximateSizes(&r, 1, &size, include_both); db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
ASSERT_GT(size, 6000); ASSERT_GT(size, 6000);
options.max_write_buffer_number = 8; options.max_write_buffer_number = 8;
options.min_write_buffer_number_to_merge = 5; options.min_write_buffer_number_to_merge = 5;
options.write_buffer_size = 1024 * N; // Not very large options.write_buffer_size = 1024 * N; // Not very large
DestroyAndReopen(options); DestroyAndReopen(options);
default_cf = db_->DefaultColumnFamily();
int keys[N * 3]; int keys[N * 3];
for (int i = 0; i < N; i++) { for (int i = 0; i < N; i++) {
@ -1319,26 +1322,27 @@ TEST_F(DBTest, ApproximateSizesMemTable) {
start = Key(100); start = Key(100);
end = Key(300); end = Key(300);
r = Range(start, end); r = Range(start, end);
db_->GetApproximateSizes(&r, 1, &size, include_both); db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
ASSERT_EQ(size, 0); ASSERT_EQ(size, 0);
start = Key(1050); start = Key(1050);
end = Key(1080); end = Key(1080);
r = Range(start, end); r = Range(start, end);
db_->GetApproximateSizes(&r, 1, &size, include_both); db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
ASSERT_GT(size, 6000); ASSERT_GT(size, 6000);
start = Key(2100); start = Key(2100);
end = Key(2300); end = Key(2300);
r = Range(start, end); r = Range(start, end);
db_->GetApproximateSizes(&r, 1, &size, include_both); db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
ASSERT_EQ(size, 0); ASSERT_EQ(size, 0);
start = Key(1050); start = Key(1050);
end = Key(1080); end = Key(1080);
r = Range(start, end); r = Range(start, end);
uint64_t size_with_mt, size_without_mt; uint64_t size_with_mt, size_without_mt;
db_->GetApproximateSizes(&r, 1, &size_with_mt, include_both); db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1,
&size_with_mt);
ASSERT_GT(size_with_mt, 6000); ASSERT_GT(size_with_mt, 6000);
db_->GetApproximateSizes(&r, 1, &size_without_mt); db_->GetApproximateSizes(&r, 1, &size_without_mt);
ASSERT_EQ(size_without_mt, 0); ASSERT_EQ(size_without_mt, 0);
@ -1352,10 +1356,80 @@ TEST_F(DBTest, ApproximateSizesMemTable) {
start = Key(1050); start = Key(1050);
end = Key(1080); end = Key(1080);
r = Range(start, end); r = Range(start, end);
db_->GetApproximateSizes(&r, 1, &size_with_mt, include_both); db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1,
&size_with_mt);
db_->GetApproximateSizes(&r, 1, &size_without_mt); db_->GetApproximateSizes(&r, 1, &size_without_mt);
ASSERT_GT(size_with_mt, size_without_mt); ASSERT_GT(size_with_mt, size_without_mt);
ASSERT_GT(size_without_mt, 6000); ASSERT_GT(size_without_mt, 6000);
// Check that include_memtabtles flag works as expected
size_approx_options.include_memtabtles = false;
db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
ASSERT_EQ(size, size_without_mt);
// Check that files_size_error_margin works as expected, when the heuristic
// conditions are not met
start = Key(1);
end = Key(1000 + N - 2);
r = Range(start, end);
size_approx_options.files_size_error_margin = -1.0; // disabled
db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
uint64_t size2;
size_approx_options.files_size_error_margin = 0.5; // enabled, but not used
db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2);
ASSERT_EQ(size, size2);
}
TEST_F(DBTest, ApproximateSizesFilesWithErrorMargin) {
Options options = CurrentOptions();
options.write_buffer_size = 1024 * 1024;
options.compression = kNoCompression;
options.create_if_missing = true;
options.target_file_size_base = 1024 * 1024;
DestroyAndReopen(options);
const auto default_cf = db_->DefaultColumnFamily();
const int N = 64000;
Random rnd(301);
for (int i = 0; i < N; i++) {
ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
}
// Flush everything to files
Flush();
// Compact the entire key space into the next level
db_->CompactRange(CompactRangeOptions(), default_cf, nullptr, nullptr);
// Write more keys
for (int i = N; i < (N + N / 4); i++) {
ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
}
// Flush everything to files again
Flush();
// Wait for compaction to finish
ASSERT_OK(dbfull()->TEST_WaitForCompact());
const std::string start = Key(0);
const std::string end = Key(2 * N);
const Range r(start, end);
SizeApproximationOptions size_approx_options;
size_approx_options.include_memtabtles = false;
size_approx_options.include_files = true;
size_approx_options.files_size_error_margin = -1.0; // disabled
// Get the precise size without any approximation heuristic
uint64_t size;
db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
ASSERT_NE(size, 0);
// Get the size with an approximation heuristic
uint64_t size2;
const double error_margin = 0.2;
size_approx_options.files_size_error_margin = error_margin;
db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2);
ASSERT_LT(size2, size * (1 + error_margin));
ASSERT_GT(size2, size * (1 - error_margin));
} }
TEST_F(DBTest, GetApproximateMemTableStats) { TEST_F(DBTest, GetApproximateMemTableStats) {

@ -4872,84 +4872,134 @@ Status VersionSet::WriteSnapshot(log::Writer* log) {
// (a,b) then (b,c) then (c,d). Knowing this, an optimization is possible where // (a,b) then (b,c) then (c,d). Knowing this, an optimization is possible where
// we avoid doing binary search for the keys b and c twice and instead somehow // we avoid doing binary search for the keys b and c twice and instead somehow
// maintain state of where they first appear in the files. // maintain state of where they first appear in the files.
uint64_t VersionSet::ApproximateSize(Version* v, const Slice& start, uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options,
Version* v, const Slice& start,
const Slice& end, int start_level, const Slice& end, int start_level,
int end_level, TableReaderCaller caller) { int end_level, TableReaderCaller caller) {
const auto& icmp = v->cfd_->internal_comparator();
// pre-condition // pre-condition
assert(v->cfd_->internal_comparator().Compare(start, end) <= 0); assert(icmp.Compare(start, end) <= 0);
uint64_t size = 0; uint64_t total_full_size = 0;
const auto* vstorage = v->storage_info(); const auto* vstorage = v->storage_info();
end_level = end_level == -1 const int num_non_empty_levels = vstorage->num_non_empty_levels();
? vstorage->num_non_empty_levels() end_level = (end_level == -1) ? num_non_empty_levels
: std::min(end_level, vstorage->num_non_empty_levels()); : std::min(end_level, num_non_empty_levels);
assert(start_level <= end_level); assert(start_level <= end_level);
for (int level = start_level; level < end_level; level++) { // Outline of the optimization that uses options.files_size_error_margin.
// When approximating the files total size that is used to store a keys range,
// we first sum up the sizes of the files that fully fall into the range.
// Then we sum up the sizes of all the files that may intersect with the range
// (this includes all files in L0 as well). Then, if total_intersecting_size
// is smaller than total_full_size * options.files_size_error_margin - we can
// infer that the intersecting files have a sufficiently negligible
// contribution to the total size, and we can approximate the storage required
// for the keys in range as just half of the intersecting_files_size.
// E.g., if the value of files_size_error_margin is 0.1, then the error of the
// approximation is limited to only ~10% of the total size of files that fully
// fall into the keys range. In such case, this helps to avoid a costly
// process of binary searching the intersecting files that is required only
// for a more precise calculation of the total size.
autovector<FdWithKeyRange*, 32> first_files;
autovector<FdWithKeyRange*, 16> last_files;
// scan all the levels
for (int level = start_level; level < end_level; ++level) {
const LevelFilesBrief& files_brief = vstorage->LevelFilesBrief(level); const LevelFilesBrief& files_brief = vstorage->LevelFilesBrief(level);
if (!files_brief.num_files) { if (files_brief.num_files == 0) {
// empty level, skip exploration // empty level, skip exploration
continue; continue;
} }
if (!level) { if (level == 0) {
// level 0 data is sorted order, handle the use case explicitly // level 0 files are not in sorted order, we need to iterate through
size += ApproximateSizeLevel0(v, files_brief, start, end, caller); // the list to compute the total bytes that require scanning,
// so handle the case explicitly (similarly to first_files case)
for (size_t i = 0; i < files_brief.num_files; i++) {
first_files.push_back(&files_brief.files[i]);
}
continue; continue;
} }
assert(level > 0); assert(level > 0);
assert(files_brief.num_files > 0); assert(files_brief.num_files > 0);
// identify the file position for starting key // identify the file position for start key
const uint64_t idx_start = FindFileInRange( const int idx_start =
v->cfd_->internal_comparator(), files_brief, start, FindFileInRange(icmp, files_brief, start, 0,
/*start=*/0, static_cast<uint32_t>(files_brief.num_files - 1)); static_cast<uint32_t>(files_brief.num_files - 1));
assert(idx_start < files_brief.num_files); assert(static_cast<size_t>(idx_start) < files_brief.num_files);
// scan all files from the starting position until the ending position
// inferred from the sorted order
for (uint64_t i = idx_start; i < files_brief.num_files; i++) {
uint64_t val;
val = ApproximateSize(v, files_brief.files[i], end, caller);
if (!val) {
// the files after this will not have the range
break;
}
size += val; // identify the file position for end key
int idx_end = idx_start;
if (icmp.Compare(files_brief.files[idx_end].largest_key, end) < 0) {
idx_end =
FindFileInRange(icmp, files_brief, end, idx_start,
static_cast<uint32_t>(files_brief.num_files - 1));
}
assert(idx_end >= idx_start &&
static_cast<size_t>(idx_end) < files_brief.num_files);
if (i == idx_start) { // scan all files from the starting index to the ending index
// subtract the bytes needed to be scanned to get to the starting // (inferred from the sorted order)
// key
val = ApproximateSize(v, files_brief.files[i], start, caller); // first scan all the intermediate full files (excluding first and last)
assert(size >= val); for (int i = idx_start + 1; i < idx_end; ++i) {
size -= val; uint64_t file_size = files_brief.files[i].fd.GetFileSize();
} // The entire file falls into the range, so we can just take its size.
assert(file_size ==
ApproximateSize(v, files_brief.files[i], end, caller));
total_full_size += file_size;
}
// save the first and the last files (which may be the same file), so we
// can scan them later.
first_files.push_back(&files_brief.files[idx_start]);
if (idx_start != idx_end) {
// we need to estimate size for both files, only if they are different
last_files.push_back(&files_brief.files[idx_end]);
} }
} }
return size; // The sum of all file sizes that intersect the [start, end] keys range.
} uint64_t total_intersecting_size = 0;
for (const auto* file_ptr : first_files) {
total_intersecting_size += file_ptr->fd.GetFileSize();
}
for (const auto* file_ptr : last_files) {
total_intersecting_size += file_ptr->fd.GetFileSize();
}
uint64_t VersionSet::ApproximateSizeLevel0(Version* v, // Now scan all the first & last files at each level, and estimate their size.
const LevelFilesBrief& files_brief, // If the total_intersecting_size is less than X% of the total_full_size - we
const Slice& key_start, // want to approximate the result in order to avoid the costly binary search
const Slice& key_end, // inside ApproximateSize. We use half of file size as an approximation below.
TableReaderCaller caller) {
// level 0 files are not in sorted order, we need to iterate through const double margin = options.files_size_error_margin;
// the list to compute the total bytes that require scanning if (margin > 0 && total_intersecting_size <
uint64_t size = 0; static_cast<uint64_t>(total_full_size * margin)) {
for (size_t i = 0; i < files_brief.num_files; i++) { total_full_size += total_intersecting_size / 2;
const uint64_t start = } else {
ApproximateSize(v, files_brief.files[i], key_start, caller); // Estimate for all the first files, at each level
const uint64_t end = for (const auto file_ptr : first_files) {
ApproximateSize(v, files_brief.files[i], key_end, caller); total_full_size += ApproximateSize(v, *file_ptr, end, caller);
assert(end >= start); // subtract the bytes needed to be scanned to get to the starting key
size += end - start; uint64_t val = ApproximateSize(v, *file_ptr, start, caller);
assert(total_full_size >= val);
total_full_size -= val;
}
// Estimate for all the last files, at each level
for (const auto file_ptr : last_files) {
total_full_size += ApproximateSize(v, *file_ptr, end, caller);
}
} }
return size;
return total_full_size;
} }
uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f, uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f,
@ -4957,12 +5007,13 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f,
TableReaderCaller caller) { TableReaderCaller caller) {
// pre-condition // pre-condition
assert(v); assert(v);
const auto& icmp = v->cfd_->internal_comparator();
uint64_t result = 0; uint64_t result = 0;
if (v->cfd_->internal_comparator().Compare(f.largest_key, key) <= 0) { if (icmp.Compare(f.largest_key, key) <= 0) {
// Entire file is before "key", so just add the file size // Entire file is before "key", so just add the file size
result = f.fd.GetFileSize(); result = f.fd.GetFileSize();
} else if (v->cfd_->internal_comparator().Compare(f.smallest_key, key) > 0) { } else if (icmp.Compare(f.smallest_key, key) > 0) {
// Entire file is after "key", so ignore // Entire file is after "key", so ignore
result = 0; result = 0;
} else { } else {
@ -4971,7 +5022,7 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f,
TableCache* table_cache = v->cfd_->table_cache(); TableCache* table_cache = v->cfd_->table_cache();
if (table_cache != nullptr) { if (table_cache != nullptr) {
result = table_cache->ApproximateOffsetOf( result = table_cache->ApproximateOffsetOf(
key, f.file_metadata->fd, caller, v->cfd()->internal_comparator(), key, f.file_metadata->fd, caller, icmp,
v->GetMutableCFOptions().prefix_extractor.get()); v->GetMutableCFOptions().prefix_extractor.get());
} }
} }

@ -983,7 +983,8 @@ class VersionSet {
// Return the approximate size of data to be scanned for range [start, end) // Return the approximate size of data to be scanned for range [start, end)
// in levels [start_level, end_level). If end_level == -1 it will search // in levels [start_level, end_level). If end_level == -1 it will search
// through all non-empty levels // through all non-empty levels
uint64_t ApproximateSize(Version* v, const Slice& start, const Slice& end, uint64_t ApproximateSize(const SizeApproximationOptions& options, Version* v,
const Slice& start, const Slice& end,
int start_level, int end_level, int start_level, int end_level,
TableReaderCaller caller); TableReaderCaller caller);
@ -1033,11 +1034,6 @@ class VersionSet {
} }
}; };
// ApproximateSize helper
uint64_t ApproximateSizeLevel0(Version* v, const LevelFilesBrief& files_brief,
const Slice& start, const Slice& end,
TableReaderCaller caller);
uint64_t ApproximateSize(Version* v, const FdWithKeyRange& f, uint64_t ApproximateSize(Version* v, const FdWithKeyRange& f,
const Slice& key, TableReaderCaller caller); const Slice& key, TableReaderCaller caller);

@ -1514,6 +1514,16 @@ struct SizeApproximationOptions {
// Defines whether the returned size should include data serialized to disk. // Defines whether the returned size should include data serialized to disk.
// If set to false, include_memtabtles must be true. // If set to false, include_memtabtles must be true.
bool include_files = true; bool include_files = true;
// When approximating the files total size that is used to store a keys range
// using DB::GetApproximateSizes, allow approximation with an error margin of
// up to total_files_size * files_size_error_margin. This allows to take some
// shortcuts in files size approximation, resulting in better performance,
// while guaranteeing the resulting error is within a reasonable margin.
// E.g., if the value is 0.1, then the error margin of the returned files size
// approximation will be within 10%.
// If the value is non-positive - a more precise yet more CPU intensive
// estimation is performed.
double files_size_error_margin = -1.0;
}; };
} // namespace rocksdb } // namespace rocksdb

Loading…
Cancel
Save