Optimize GetApproximateSizes() to use fewer CPU cycles.

Summary:
CPU profiling reveals GetApproximateSizes() as a performance bottleneck. The current implementation is sub-optimal: it scans every file in every level to compute the result.

We can take advantage of the fact that all levels above 0 are sorted in the increasing order of key ranges and use binary search to locate the starting index. This can reduce the number of comparisons required to compute the result.

Test Plan: We have good test coverage. Run the tests.

Reviewers: sdong, igor, rven, dynamike

Subscribers: dynamike, maykov, dhruba, leveldb

Differential Revision: https://reviews.facebook.net/D37755
main
krad 9 years ago
parent fd96b55402
commit d4540654e9
  1. 10
      db/db_impl.cc
  2. 119
      db/version_set.cc
  3. 12
      db/version_set.h

@ -3587,7 +3587,6 @@ void DBImpl::ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd,
void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family, void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family,
const Range* range, int n, uint64_t* sizes) { const Range* range, int n, uint64_t* sizes) {
// TODO(opt): better implementation
Version* v; Version* v;
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family); auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
auto cfd = cfh->cfd(); auto cfd = cfh->cfd();
@ -3599,12 +3598,9 @@ void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family,
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
// Convert user_key into a corresponding internal key. // Convert user_key into a corresponding internal key.
InternalKey k1, k2; InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek);
k1.SetMaxPossibleForUserKey(range[i].start); InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek);
k2.SetMaxPossibleForUserKey(range[i].limit); sizes[i] = versions_->ApproximateSize(v, k1.Encode(), k2.Encode());
uint64_t start = versions_->ApproximateOffsetOf(v, k1);
uint64_t limit = versions_->ApproximateOffsetOf(v, k2);
sizes[i] = (limit >= start ? limit - start : 0);
} }
{ {

@ -2802,40 +2802,101 @@ bool VersionSet::ManifestContains(uint64_t manifest_file_num,
return result; return result;
} }
uint64_t VersionSet::ApproximateSize(Version* v, const Slice& start,
                                     const Slice& end) {
  // Returns the approximate number of bytes of table data covered by the
  // internal-key range [start, end) in version "v".
  //
  // pre-condition: the range must be well formed
  assert(v->cfd_->internal_comparator().Compare(start, end) <= 0);

  uint64_t size = 0;
  const auto* vstorage = v->storage_info();

  for (int level = 0; level < vstorage->num_non_empty_levels(); level++) {
    const LevelFilesBrief& files_brief = vstorage->LevelFilesBrief(level);
    if (!files_brief.num_files) {
      // empty level, skip exploration
      continue;
    }

    if (!level) {
      // level-0 files may overlap and are not sorted by key range, so they
      // cannot be binary-searched; handle them with an explicit full scan
      size += ApproximateSizeLevel0(v, files_brief, start, end);
      continue;
    }

    assert(level > 0);
    assert(files_brief.num_files > 0);

    // Levels above 0 are sorted in increasing key-range order: binary-search
    // for the first file that may contain the starting key.
    const uint64_t idx_start =
        FindFileInRange(v->cfd_->internal_comparator(), files_brief, start,
                        /*start=*/0, files_brief.num_files - 1);
    assert(idx_start < files_brief.num_files);

    // Scan files from the starting position until the ending position
    // inferred from the sorted order.
    for (uint64_t i = idx_start; i < files_brief.num_files; i++) {
      uint64_t val = ApproximateSize(v, files_brief.files[i], end);
      if (!val) {
        // this file lies entirely after "end"; by the sorted order, every
        // later file in this level does too
        break;
      }
      size += val;

      if (i == idx_start) {
        // subtract the bytes that precede the starting key within the
        // first overlapping file
        val = ApproximateSize(v, files_brief.files[i], start);
        assert(size >= val);
        size -= val;
      }
    }
  }

  return size;
}
uint64_t VersionSet::ApproximateSizeLevel0(Version* v,
                                           const LevelFilesBrief& files_brief,
                                           const Slice& key_start,
                                           const Slice& key_end) {
  // Level-0 files are not sorted by key range, so every file in the level
  // must be inspected to estimate the bytes the range would scan.
  uint64_t total = 0;
  for (size_t idx = 0; idx < files_brief.num_files; idx++) {
    const FdWithKeyRange& file = files_brief.files[idx];
    const uint64_t offset_start = ApproximateSize(v, file, key_start);
    const uint64_t offset_end = ApproximateSize(v, file, key_end);
    assert(offset_end >= offset_start);
    total += offset_end - offset_start;
  }
  return total;
}
uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f,
                                     const Slice& key) {
  // Returns the approximate number of bytes in file "f" that precede "key":
  //   - the whole file size if the file lies entirely before "key",
  //   - 0 if the file lies entirely after "key",
  //   - otherwise the approximate offset of "key" within the table.
  // pre-condition
  assert(v);

  uint64_t result = 0;
  if (v->cfd_->internal_comparator().Compare(f.largest_key, key) <= 0) {
    // Entire file is before "key", so just add the file size
    result = f.fd.GetFileSize();
  } else if (v->cfd_->internal_comparator().Compare(f.smallest_key, key) > 0) {
    // Entire file is after "key", so ignore
    result = 0;
  } else {
    // "key" falls in the range for this table. Add the
    // approximate offset of "key" within the table.
    TableReader* table_reader_ptr;
    Iterator* iter = v->cfd_->table_cache()->NewIterator(
        ReadOptions(), env_options_, v->cfd_->internal_comparator(), f.fd,
        &table_reader_ptr);
    if (table_reader_ptr != nullptr) {
      result = table_reader_ptr->ApproximateOffsetOf(key);
    }
    // NOTE(review): iter is only created to populate table_reader_ptr via the
    // table cache; it is deleted immediately, never iterated.
    delete iter;
  }
  return result;
}

@ -618,9 +618,8 @@ class VersionSet {
// Add all files listed in any live version to *live. // Add all files listed in any live version to *live.
void AddLiveFiles(std::vector<FileDescriptor>* live_list); void AddLiveFiles(std::vector<FileDescriptor>* live_list);
// Return the approximate offset in the database of the data for // Return the approximate size of data to be scanned for range [start, end)
// "key" as of version "v". uint64_t ApproximateSize(Version* v, const Slice& start, const Slice& end);
uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key);
// Return the size of the current manifest file // Return the size of the current manifest file
uint64_t manifest_file_size() const { return manifest_file_size_; } uint64_t manifest_file_size() const { return manifest_file_size_; }
@ -657,6 +656,13 @@ class VersionSet {
} }
}; };
// ApproximateSize helper
uint64_t ApproximateSizeLevel0(Version* v, const LevelFilesBrief& files_brief,
const Slice& start, const Slice& end);
uint64_t ApproximateSize(Version* v, const FdWithKeyRange& f,
const Slice& key);
// Save current contents to *log // Save current contents to *log
Status WriteSnapshot(log::Writer* log); Status WriteSnapshot(log::Writer* log);

Loading…
Cancel
Save