Avoid doing an exhaustive search when looking for overlapping files.

Summary:
Version::GetOverlappingInputs() is called multiple times in
the compaction code path. Each invocation does a binary search
for overlapping files in the specified key range.
This patch remembers the index of an overlapping file when
GetOverlappingInputs() is called the first time within
a compaction run. Succeeding calls to GetOverlappingInputs()
use the remembered index to avoid the binary search.

I measured that 1000 iterations of GetOverlappingInputs
take around 4500 microseconds without this patch. If I use
this patch with the hint on every invocation, then 1000
iterations take about 3900 microseconds.

Test Plan: make check OPT=-g

Reviewers: heyongqiang

Reviewed By: heyongqiang

CC: MarkCallaghan, emayanke, sheki

Differential Revision: https://reviews.facebook.net/D6513
main
Dhruba Borthakur 12 years ago
parent 4e413df3d0
commit 9b87a2bae8
  1. 65
      db/version_set.cc
  2. 10
      db/version_set.h

@ -453,11 +453,16 @@ int Version::PickLevelForMemTableOutput(
} }
// Store in "*inputs" all files in "level" that overlap [begin,end] // Store in "*inputs" all files in "level" that overlap [begin,end]
// If hint_index is not -1, it is the index of a file in the
// overlapping range.
// If file_index is non-NULL, it receives the index of a file in the overlapping range.
void Version::GetOverlappingInputs( void Version::GetOverlappingInputs(
int level, int level,
const InternalKey* begin, const InternalKey* begin,
const InternalKey* end, const InternalKey* end,
std::vector<FileMetaData*>* inputs) { std::vector<FileMetaData*>* inputs,
int hint_index,
int* file_index) {
inputs->clear(); inputs->clear();
Slice user_begin, user_end; Slice user_begin, user_end;
if (begin != NULL) { if (begin != NULL) {
@ -468,7 +473,8 @@ void Version::GetOverlappingInputs(
} }
const Comparator* user_cmp = vset_->icmp_.user_comparator(); const Comparator* user_cmp = vset_->icmp_.user_comparator();
if (begin != NULL && end != NULL && level > 0) { if (begin != NULL && end != NULL && level > 0) {
GetOverlappingInputsBinarySearch(level, user_begin, user_end, inputs); GetOverlappingInputsBinarySearch(level, user_begin, user_end, inputs,
hint_index, file_index);
return; return;
} }
for (size_t i = 0; i < files_[level].size(); ) { for (size_t i = 0; i < files_[level].size(); ) {
@ -493,6 +499,8 @@ void Version::GetOverlappingInputs(
inputs->clear(); inputs->clear();
i = 0; i = 0;
} }
} else if (file_index) {
*file_index = i;
} }
} }
} }
@ -506,14 +514,24 @@ void Version::GetOverlappingInputsBinarySearch(
int level, int level,
const Slice& user_begin, const Slice& user_begin,
const Slice& user_end, const Slice& user_end,
std::vector<FileMetaData*>* inputs) { std::vector<FileMetaData*>* inputs,
int hint_index,
int* file_index) {
assert(level > 0); assert(level > 0);
int min = 0; int min = 0;
int mid = 0; int mid = 0;
int max = files_[level].size() -1; int max = files_[level].size() -1;
bool foundOverlap = false; bool foundOverlap = false;
const Comparator* user_cmp = vset_->icmp_.user_comparator(); const Comparator* user_cmp = vset_->icmp_.user_comparator();
while (min <= max) {
// if the caller already knows the index of a file that has overlap,
// then we can skip the binary search.
if (hint_index != -1) {
mid = hint_index;
foundOverlap = true;
}
while (!foundOverlap && min <= max) {
mid = (min + max)/2; mid = (min + max)/2;
FileMetaData* f = files_[level][mid]; FileMetaData* f = files_[level][mid];
const Slice file_start = f->smallest.user_key(); const Slice file_start = f->smallest.user_key();
@ -532,6 +550,10 @@ void Version::GetOverlappingInputsBinarySearch(
if (!foundOverlap) { if (!foundOverlap) {
return; return;
} }
// returns the index where an overlap is found
if (file_index) {
*file_index = mid;
}
ExtendOverlappingInputs(level, user_begin, user_end, inputs, mid); ExtendOverlappingInputs(level, user_begin, user_end, inputs, mid);
} }
@ -546,13 +568,21 @@ void Version::ExtendOverlappingInputs(
std::vector<FileMetaData*>* inputs, std::vector<FileMetaData*>* inputs,
int midIndex) { int midIndex) {
// assert that the file at midIndex overlaps with the range
const Comparator* user_cmp = vset_->icmp_.user_comparator(); const Comparator* user_cmp = vset_->icmp_.user_comparator();
assert(midIndex < files_[level].size()); #ifndef NDEBUG
assert((user_cmp->Compare(files_[level][midIndex]->largest.user_key(), {
user_begin) >= 0) || // assert that the file at midIndex overlaps with the range
(user_cmp->Compare(files_[level][midIndex]->smallest.user_key(), assert(midIndex < files_[level].size());
user_end) <= 0)); FileMetaData* f = files_[level][midIndex];
const Slice fstart = f->smallest.user_key();
const Slice flimit = f->largest.user_key();
if (user_cmp->Compare(fstart, user_begin) >= 0) {
assert(user_cmp->Compare(fstart, user_end) <= 0);
} else {
assert(user_cmp->Compare(flimit, user_begin) >= 0);
}
}
#endif
// check backwards from 'mid' to lower indices // check backwards from 'mid' to lower indices
for (size_t i = midIndex; i < files_[level].size(); i--) { for (size_t i = midIndex; i < files_[level].size(); i--) {
@ -1487,12 +1517,14 @@ Compaction* VersionSet::PickCompaction() {
if (compact_pointer_[level].empty() || if (compact_pointer_[level].empty() ||
icmp_.Compare(f->largest.Encode(), compact_pointer_[level]) > 0) { icmp_.Compare(f->largest.Encode(), compact_pointer_[level]) > 0) {
c->inputs_[0].push_back(f); c->inputs_[0].push_back(f);
c->base_index_ = i;
break; break;
} }
} }
if (c->inputs_[0].empty()) { if (c->inputs_[0].empty()) {
// Wrap-around to the beginning of the key space // Wrap-around to the beginning of the key space
c->inputs_[0].push_back(current_->files_[level][0]); c->inputs_[0].push_back(current_->files_[level][0]);
c->base_index_ = 0;
} }
} else if (seek_compaction) { } else if (seek_compaction) {
level = current_->file_to_compact_level_; level = current_->file_to_compact_level_;
@ -1527,7 +1559,8 @@ void VersionSet::SetupOtherInputs(Compaction* c) {
InternalKey smallest, largest; InternalKey smallest, largest;
GetRange(c->inputs_[0], &smallest, &largest); GetRange(c->inputs_[0], &smallest, &largest);
current_->GetOverlappingInputs(level+1, &smallest, &largest, &c->inputs_[1]); current_->GetOverlappingInputs(level+1, &smallest, &largest, &c->inputs_[1],
c->parent_index_, &c->parent_index_);
// Get entire range covered by compaction // Get entire range covered by compaction
InternalKey all_start, all_limit; InternalKey all_start, all_limit;
@ -1537,7 +1570,8 @@ void VersionSet::SetupOtherInputs(Compaction* c) {
// changing the number of "level+1" files we pick up. // changing the number of "level+1" files we pick up.
if (!c->inputs_[1].empty()) { if (!c->inputs_[1].empty()) {
std::vector<FileMetaData*> expanded0; std::vector<FileMetaData*> expanded0;
current_->GetOverlappingInputs(level, &all_start, &all_limit, &expanded0); current_->GetOverlappingInputs(level, &all_start, &all_limit, &expanded0,
c->base_index_, NULL);
const int64_t inputs0_size = TotalFileSize(c->inputs_[0]); const int64_t inputs0_size = TotalFileSize(c->inputs_[0]);
const int64_t inputs1_size = TotalFileSize(c->inputs_[1]); const int64_t inputs1_size = TotalFileSize(c->inputs_[1]);
const int64_t expanded0_size = TotalFileSize(expanded0); const int64_t expanded0_size = TotalFileSize(expanded0);
@ -1548,7 +1582,8 @@ void VersionSet::SetupOtherInputs(Compaction* c) {
GetRange(expanded0, &new_start, &new_limit); GetRange(expanded0, &new_start, &new_limit);
std::vector<FileMetaData*> expanded1; std::vector<FileMetaData*> expanded1;
current_->GetOverlappingInputs(level+1, &new_start, &new_limit, current_->GetOverlappingInputs(level+1, &new_start, &new_limit,
&expanded1); &expanded1, c->parent_index_,
&c->parent_index_);
if (expanded1.size() == c->inputs_[1].size()) { if (expanded1.size() == c->inputs_[1].size()) {
Log(options_->info_log, Log(options_->info_log,
"Expanding@%d %d+%d (%ld+%ld bytes) to %d+%d (%ld+%ld bytes)\n", "Expanding@%d %d+%d (%ld+%ld bytes) to %d+%d (%ld+%ld bytes)\n",
@ -1632,7 +1667,9 @@ Compaction::Compaction(int level, uint64_t target_file_size,
seek_compaction_(seek_compaction), seek_compaction_(seek_compaction),
grandparent_index_(0), grandparent_index_(0),
seen_key_(false), seen_key_(false),
overlapped_bytes_(0) { overlapped_bytes_(0),
base_index_(-1),
parent_index_(-1) {
edit_ = new VersionEdit(number_levels_); edit_ = new VersionEdit(number_levels_);
level_ptrs_ = new size_t[number_levels_]; level_ptrs_ = new size_t[number_levels_];
for (int i = 0; i < number_levels_; i++) { for (int i = 0; i < number_levels_; i++) {

@ -87,13 +87,17 @@ class Version {
int level, int level,
const InternalKey* begin, // NULL means before all keys const InternalKey* begin, // NULL means before all keys
const InternalKey* end, // NULL means after all keys const InternalKey* end, // NULL means after all keys
std::vector<FileMetaData*>* inputs); std::vector<FileMetaData*>* inputs,
int hint_index = -1, // index of overlap file
int* file_index = NULL); // return index of overlap file
void GetOverlappingInputsBinarySearch( void GetOverlappingInputsBinarySearch(
int level, int level,
const Slice& begin, // NULL means before all keys const Slice& begin, // NULL means before all keys
const Slice& end, // NULL means after all keys const Slice& end, // NULL means after all keys
std::vector<FileMetaData*>* inputs); std::vector<FileMetaData*>* inputs,
int hint_index, // index of overlap file
int* file_index); // return index of overlap file
void ExtendOverlappingInputs( void ExtendOverlappingInputs(
int level, int level,
@ -430,6 +434,8 @@ class Compaction {
bool seen_key_; // Some output key has been seen bool seen_key_; // Some output key has been seen
int64_t overlapped_bytes_; // Bytes of overlap between current output int64_t overlapped_bytes_; // Bytes of overlap between current output
// and grandparent files // and grandparent files
int base_index_; // index of the file in files_[level_]
int parent_index_; // index of some file with same range in files_[level_+1]
// State for implementing IsBaseLevelForKey // State for implementing IsBaseLevelForKey

Loading…
Cancel
Save