Eliminate memcpy in Iterator::Prev() by pinning blocks for keys spanning multiple blocks

Summary:
This diff is stacked on top of this diff https://reviews.facebook.net/D56493
The current Iterator::Prev() implementation need to copy every value since the underlying Iterator may move after reading the value.
This can be optimized by making sure that the block containing the value is pinned until the Iterator move. which will improve the throughput by up to 1.5X

master
```
==> 1000000_Keys_100Byte.txt <==
readreverse  :       0.449 micros/op 2225887 ops/sec;  246.2 MB/s
readreverse  :       0.433 micros/op 2311508 ops/sec;  255.7 MB/s
readreverse  :       0.436 micros/op 2294335 ops/sec;  253.8 MB/s
readreverse  :       0.471 micros/op 2121295 ops/sec;  234.7 MB/s
readreverse  :       0.465 micros/op 2152227 ops/sec;  238.1 MB/s
readreverse  :       0.454 micros/op 2203011 ops/sec;  243.7 MB/s
readreverse  :       0.451 micros/op 2216095 ops/sec;  245.2 MB/s
readreverse  :       0.462 micros/op 2162447 ops/sec;  239.2 MB/s
readreverse  :       0.476 micros/op 2099151 ops/sec;  232.2 MB/s
readreverse  :       0.472 micros/op 2120710 ops/sec;  234.6 MB/s

avg : 242.34 MB/s

==> 1000000_Keys_1KB.txt <==
readreverse  :       1.013 micros/op 986793 ops/sec;  978.7 MB/s
readreverse  :       0.942 micros/op 1061136 ops/sec; 1052.5 MB/s
readreverse  :       0.951 micros/op 1051901 ops/sec; 1043.3 MB/s
readreverse  :       0.932 micros/op 1072894 ops/sec; 1064.1 MB/s
readreverse  :       1.024 micros/op 976720 ops/sec;  968.7 MB/s
readreverse  :       0.935 micros/op 1069169 ops/sec; 1060.4 MB/s
readreverse  :       1.012 micros/op 988132 ops/sec;  980.1 MB/s
readreverse  :       0.962 micros/op 1039579 ops/sec; 1031.1 MB/s
readreverse  :       0.991 micros/op 1008924 ops/sec; 1000.7 MB/s
readreverse  :       1.004 micros/op 996144 ops/sec;  988.0 MB/s

avg : 1016.76 MB/s

==> 1000000_Keys_10KB.txt <==
readreverse  :       4.167 micros/op 239952 ops/sec; 2346.9 MB/s
readreverse  :       4.070 micros/op 245713 ops/sec; 2403.3 MB/s
readreverse  :       4.572 micros/op 218733 ops/sec; 2139.4 MB/s
readreverse  :       4.497 micros/op 222388 ops/sec; 2175.2 MB/s
readreverse  :       4.203 micros/op 237920 ops/sec; 2327.1 MB/s
readreverse  :       4.206 micros/op 237756 ops/sec; 2325.5 MB/s
readreverse  :       4.181 micros/op 239149 ops/sec; 2339.1 MB/s
readreverse  :       4.157 micros/op 240552 ops/sec; 2352.8 MB/s
readreverse  :       4.187 micros/op 238848 ops/sec; 2336.1 MB/s
readreverse  :       4.106 micros/op 243575 ops/sec; 2382.4 MB/s

avg : 2312.78 MB/s

==> 100000_Keys_100KB.txt <==
readreverse  :      41.281 micros/op 24224 ops/sec; 2366.0 MB/s
readreverse  :      39.722 micros/op 25175 ops/sec; 2458.9 MB/s
readreverse  :      40.319 micros/op 24802 ops/sec; 2422.5 MB/s
readreverse  :      39.762 micros/op 25149 ops/sec; 2456.4 MB/s
readreverse  :      40.916 micros/op 24440 ops/sec; 2387.1 MB/s
readreverse  :      41.188 micros/op 24278 ops/sec; 2371.4 MB/s
readreverse  :      40.061 micros/op 24962 ops/sec; 2438.1 MB/s
readreverse  :      40.221 micros/op 24862 ops/sec; 2428.4 MB/s
readreverse  :      40.084 micros/op 24947 ops/sec; 2436.7 MB/s
readreverse  :      40.655 micros/op 24597 ops/sec; 2402.4 MB/s

avg : 2416.79 MB/s

==> 10000_Keys_1MB.txt <==
readreverse  :     298.038 micros/op 3355 ops/sec; 3355.3 MB/s
readreverse  :     335.001 micros/op 2985 ops/sec; 2985.1 MB/s
readreverse  :     286.956 micros/op 3484 ops/sec; 3484.9 MB/s
readreverse  :     329.954 micros/op 3030 ops/sec; 3030.8 MB/s
readreverse  :     306.428 micros/op 3263 ops/sec; 3263.5 MB/s
readreverse  :     330.749 micros/op 3023 ops/sec; 3023.5 MB/s
readreverse  :     328.903 micros/op 3040 ops/sec; 3040.5 MB/s
readreverse  :     324.853 micros/op 3078 ops/sec; 3078.4 MB/s
readreverse  :     320.488 micros/op 3120 ops/sec; 3120.3 MB/s
readreverse  :     320.536 micros/op 3119 ops/sec; 3119.8 MB/s

avg : 3150.21 MB/s
```

After memcpy elimination
```

==> 1000000_Keys_100Byte.txt <==
readreverse  :       0.395 micros/op 2529890 ops/sec;  279.9 MB/s
readreverse  :       0.368 micros/op 2715922 ops/sec;  300.5 MB/s
readreverse  :       0.384 micros/op 2603929 ops/sec;  288.1 MB/s
readreverse  :       0.375 micros/op 2663286 ops/sec;  294.6 MB/s
readreverse  :       0.357 micros/op 2802180 ops/sec;  310.0 MB/s
readreverse  :       0.363 micros/op 2757684 ops/sec;  305.1 MB/s
readreverse  :       0.372 micros/op 2689603 ops/sec;  297.5 MB/s
readreverse  :       0.379 micros/op 2638599 ops/sec;  291.9 MB/s
readreverse  :       0.375 micros/op 2663803 ops/sec;  294.7 MB/s
readreverse  :       0.375 micros/op 2665579 ops/sec;  294.9 MB/s

avg: 295.72 MB/s (1.22 X)

==> 1000000_Keys_1KB.txt <==
readreverse  :       0.879 micros/op 1138112 ops/sec; 1128.8 MB/s
readreverse  :       0.842 micros/op 1187998 ops/sec; 1178.3 MB/s
readreverse  :       0.837 micros/op 1194915 ops/sec; 1185.1 MB/s
readreverse  :       0.845 micros/op 1182983 ops/sec; 1173.3 MB/s
readreverse  :       0.877 micros/op 1140308 ops/sec; 1131.0 MB/s
readreverse  :       0.849 micros/op 1177581 ops/sec; 1168.0 MB/s
readreverse  :       0.915 micros/op 1093284 ops/sec; 1084.3 MB/s
readreverse  :       0.863 micros/op 1159418 ops/sec; 1149.9 MB/s
readreverse  :       0.895 micros/op 1117670 ops/sec; 1108.5 MB/s
readreverse  :       0.852 micros/op 1174116 ops/sec; 1164.5 MB/s

avg: 1147.17 MB/s (1.12 X)

==> 1000000_Keys_10KB.txt <==
readreverse  :       3.870 micros/op 258386 ops/sec; 2527.2 MB/s
readreverse  :       3.568 micros/op 280296 ops/sec; 2741.5 MB/s
readreverse  :       4.005 micros/op 249694 ops/sec; 2442.2 MB/s
readreverse  :       3.550 micros/op 281719 ops/sec; 2755.5 MB/s
readreverse  :       3.562 micros/op 280758 ops/sec; 2746.1 MB/s
readreverse  :       3.507 micros/op 285125 ops/sec; 2788.8 MB/s
readreverse  :       3.463 micros/op 288739 ops/sec; 2824.1 MB/s
readreverse  :       3.428 micros/op 291734 ops/sec; 2853.4 MB/s
readreverse  :       3.553 micros/op 281491 ops/sec; 2753.2 MB/s
readreverse  :       3.535 micros/op 282885 ops/sec; 2766.9 MB/s

avg : 2719.89 MB/s (1.17 X)

==> 100000_Keys_100KB.txt <==
readreverse  :      22.815 micros/op 43830 ops/sec; 4281.0 MB/s
readreverse  :      29.957 micros/op 33381 ops/sec; 3260.4 MB/s
readreverse  :      25.334 micros/op 39473 ops/sec; 3855.4 MB/s
readreverse  :      23.037 micros/op 43409 ops/sec; 4239.8 MB/s
readreverse  :      27.810 micros/op 35958 ops/sec; 3512.1 MB/s
readreverse  :      30.327 micros/op 32973 ops/sec; 3220.6 MB/s
readreverse  :      29.704 micros/op 33665 ops/sec; 3288.2 MB/s
readreverse  :      29.423 micros/op 33987 ops/sec; 3319.6 MB/s
readreverse  :      23.334 micros/op 42856 ops/sec; 4185.9 MB/s
readreverse  :      29.969 micros/op 33368 ops/sec; 3259.1 MB/s

avg : 3642.21 MB/s (1.5 X)

==> 10000_Keys_1MB.txt <==
readreverse  :     244.748 micros/op 4085 ops/sec; 4085.9 MB/s
readreverse  :     230.208 micros/op 4343 ops/sec; 4344.0 MB/s
readreverse  :     235.655 micros/op 4243 ops/sec; 4243.6 MB/s
readreverse  :     235.730 micros/op 4242 ops/sec; 4242.2 MB/s
readreverse  :     237.346 micros/op 4213 ops/sec; 4213.3 MB/s
readreverse  :     227.306 micros/op 4399 ops/sec; 4399.4 MB/s
readreverse  :     194.957 micros/op 5129 ops/sec; 5129.4 MB/s
readreverse  :     238.359 micros/op 4195 ops/sec; 4195.4 MB/s
readreverse  :     221.588 micros/op 4512 ops/sec; 4513.0 MB/s
readreverse  :     235.911 micros/op 4238 ops/sec; 4239.0 MB/s

avg : 4360.52 MB/s (1.38 X)
```

Test Plan: COMPILE_WITH_ASAN=1 make check -j64

Reviewers: andrewkr, yhchiang, sdong

Reviewed By: sdong

Subscribers: andrewkr, dhruba

Differential Revision: https://reviews.facebook.net/D56511
main
Islam AbdelRahman 9 years ago
parent 1b166928c7
commit 6e801b0bd1
  1. 51
      db/db_iter.cc
  2. 158
      db/db_iterator_test.cc

@ -131,9 +131,8 @@ class DBIter: public Iterator {
}
}
virtual ~DBIter() {
if (pin_thru_lifetime_) {
pinned_iters_mgr_.ReleasePinnedIterators();
}
// Release pinned data if any
pinned_iters_mgr_.ReleasePinnedIterators();
RecordTick(statistics_, NO_ITERATORS, -1);
local_stats_.BumpGlobalStatistics(statistics_);
if (!arena_mode_) {
@ -154,8 +153,13 @@ class DBIter: public Iterator {
}
virtual Slice value() const override {
assert(valid_);
return (direction_ == kForward && !current_entry_is_merged_) ?
iter_->value() : saved_value_;
if (current_entry_is_merged_) {
return saved_value_;
} else if (direction_ == kReverse) {
return pinned_value_;
} else {
return iter_->value();
}
}
virtual Status status() const override {
if (status_.ok()) {
@ -206,6 +210,21 @@ class DBIter: public Iterator {
bool ParseKey(ParsedInternalKey* key);
void MergeValuesNewToOld();
// Temporarily pin the blocks that we encounter until ReleaseTempPinnedData()
// is called
void TempPinData() {
if (!pin_thru_lifetime_) {
pinned_iters_mgr_.StartPinning();
}
}
// Release blocks pinned by TempPinData()
void ReleaseTempPinnedData() {
if (!pin_thru_lifetime_) {
pinned_iters_mgr_.ReleasePinnedIterators();
}
}
inline void ClearSavedValue() {
if (saved_value_.capacity() > 1048576) {
std::string empty;
@ -227,6 +246,7 @@ class DBIter: public Iterator {
Status status_;
IterKey saved_key_;
std::string saved_value_;
Slice pinned_value_;
Direction direction_;
bool valid_;
bool current_entry_is_merged_;
@ -266,6 +286,8 @@ void DBIter::Next() {
assert(valid_);
if (direction_ == kReverse) {
// We only pin blocks when doing kReverse
ReleaseTempPinnedData();
FindNextUserKey();
direction_ = kForward;
if (!iter_->Valid()) {
@ -472,6 +494,7 @@ void DBIter::Prev() {
if (direction_ == kForward) {
ReverseToBackward();
}
ReleaseTempPinnedData();
PrevInternal();
if (statistics_ != nullptr) {
local_stats_.prev_count_++;
@ -555,6 +578,7 @@ void DBIter::PrevInternal() {
bool DBIter::FindValueForCurrentKey() {
assert(iter_->Valid());
merge_context_.Clear();
current_entry_is_merged_ = false;
// last entry before merge (could be kTypeDeletion, kTypeSingleDeletion or
// kTypeValue)
ValueType last_not_merge_type = kTypeDeletion;
@ -575,7 +599,9 @@ bool DBIter::FindValueForCurrentKey() {
switch (last_key_entry_type) {
case kTypeValue:
merge_context_.Clear();
saved_value_ = iter_->value().ToString();
ReleaseTempPinnedData();
TempPinData();
pinned_value_ = iter_->value();
last_not_merge_type = kTypeValue;
break;
case kTypeDeletion:
@ -605,6 +631,7 @@ bool DBIter::FindValueForCurrentKey() {
valid_ = false;
return false;
case kTypeMerge:
current_entry_is_merged_ = true;
if (last_not_merge_type == kTypeDeletion) {
StopWatchNano timer(env_, statistics_ != nullptr);
PERF_TIMER_GUARD(merge_operator_time_nanos);
@ -615,12 +642,10 @@ bool DBIter::FindValueForCurrentKey() {
timer.ElapsedNanos());
} else {
assert(last_not_merge_type == kTypeValue);
std::string last_put_value = saved_value_;
Slice temp_slice(last_put_value);
{
StopWatchNano timer(env_, statistics_ != nullptr);
PERF_TIMER_GUARD(merge_operator_time_nanos);
user_merge_operator_->FullMerge(saved_key_.GetKey(), &temp_slice,
user_merge_operator_->FullMerge(saved_key_.GetKey(), &pinned_value_,
merge_context_.GetOperands(),
&saved_value_, logger_);
RecordTick(statistics_, MERGE_OPERATION_TOTAL_TIME,
@ -655,7 +680,9 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() {
if (ikey.type == kTypeValue || ikey.type == kTypeDeletion ||
ikey.type == kTypeSingleDeletion) {
if (ikey.type == kTypeValue) {
saved_value_ = iter_->value().ToString();
ReleaseTempPinnedData();
TempPinData();
pinned_value_ = iter_->value();
valid_ = true;
return true;
}
@ -665,6 +692,7 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() {
// kTypeMerge. We need to collect all kTypeMerge values and save them
// in operands
current_entry_is_merged_ = true;
merge_context_.Clear();
while (iter_->Valid() &&
user_comparator_->Equal(ikey.user_key, saved_key_.GetKey()) &&
@ -767,6 +795,7 @@ void DBIter::FindParseableKey(ParsedInternalKey* ikey, Direction direction) {
void DBIter::Seek(const Slice& target) {
StopWatch sw(env_, statistics_, DB_SEEK);
ReleaseTempPinnedData();
saved_key_.Clear();
// now savved_key is used to store internal key.
saved_key_.SetInternalKey(target, sequence_);
@ -809,6 +838,7 @@ void DBIter::SeekToFirst() {
max_skip_ = std::numeric_limits<uint64_t>::max();
}
direction_ = kForward;
ReleaseTempPinnedData();
ClearSavedValue();
{
@ -841,6 +871,7 @@ void DBIter::SeekToLast() {
max_skip_ = std::numeric_limits<uint64_t>::max();
}
direction_ = kReverse;
ReleaseTempPinnedData();
ClearSavedValue();
{

@ -1228,6 +1228,164 @@ TEST_F(DBIteratorTest, PinnedDataIteratorReadAfterUpdate) {
delete iter;
}
TEST_F(DBIteratorTest, IterPrevKeyCrossingBlocks) {
Options options = CurrentOptions();
BlockBasedTableOptions table_options;
table_options.block_size = 1; // every block will contain one entry
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
options.merge_operator = MergeOperators::CreateStringAppendTESTOperator();
options.disable_auto_compactions = true;
options.max_sequential_skip_in_iterations = 8;
DestroyAndReopen(options);
// Putting such deletes will force DBIter::Prev() to fallback to a Seek
for (int file_num = 0; file_num < 10; file_num++) {
ASSERT_OK(Delete("key4"));
ASSERT_OK(Flush());
}
// First File containing 5 blocks of puts
ASSERT_OK(Put("key1", "val1.0"));
ASSERT_OK(Put("key2", "val2.0"));
ASSERT_OK(Put("key3", "val3.0"));
ASSERT_OK(Put("key4", "val4.0"));
ASSERT_OK(Put("key5", "val5.0"));
ASSERT_OK(Flush());
// Second file containing 9 blocks of merge operands
ASSERT_OK(db_->Merge(WriteOptions(), "key1", "val1.1"));
ASSERT_OK(db_->Merge(WriteOptions(), "key1", "val1.2"));
ASSERT_OK(db_->Merge(WriteOptions(), "key2", "val2.1"));
ASSERT_OK(db_->Merge(WriteOptions(), "key2", "val2.2"));
ASSERT_OK(db_->Merge(WriteOptions(), "key2", "val2.3"));
ASSERT_OK(db_->Merge(WriteOptions(), "key3", "val3.1"));
ASSERT_OK(db_->Merge(WriteOptions(), "key3", "val3.2"));
ASSERT_OK(db_->Merge(WriteOptions(), "key3", "val3.3"));
ASSERT_OK(db_->Merge(WriteOptions(), "key3", "val3.4"));
ASSERT_OK(Flush());
{
ReadOptions ro;
ro.fill_cache = false;
Iterator* iter = db_->NewIterator(ro);
iter->SeekToLast();
ASSERT_EQ(iter->key().ToString(), "key5");
ASSERT_EQ(iter->value().ToString(), "val5.0");
iter->Prev();
ASSERT_EQ(iter->key().ToString(), "key4");
ASSERT_EQ(iter->value().ToString(), "val4.0");
iter->Prev();
ASSERT_EQ(iter->key().ToString(), "key3");
ASSERT_EQ(iter->value().ToString(), "val3.0,val3.1,val3.2,val3.3,val3.4");
iter->Prev();
ASSERT_EQ(iter->key().ToString(), "key2");
ASSERT_EQ(iter->value().ToString(), "val2.0,val2.1,val2.2,val2.3");
iter->Prev();
ASSERT_EQ(iter->key().ToString(), "key1");
ASSERT_EQ(iter->value().ToString(), "val1.0,val1.1,val1.2");
delete iter;
}
}
TEST_F(DBIteratorTest, IterPrevKeyCrossingBlocksRandomized) {
Options options = CurrentOptions();
options.merge_operator = MergeOperators::CreateStringAppendTESTOperator();
options.disable_auto_compactions = true;
options.level0_slowdown_writes_trigger = (1 << 30);
options.level0_stop_writes_trigger = (1 << 30);
options.max_sequential_skip_in_iterations = 8;
DestroyAndReopen(options);
const int kNumKeys = 500;
// Small number of merge operands to make sure that DBIter::Prev() dont
// fall back to Seek()
const int kNumMergeOperands = 3;
// Use value size that will make sure that every block contain 1 key
const int kValSize =
static_cast<int>(BlockBasedTableOptions().block_size) * 4;
// Percentage of keys that wont get merge operations
const int kNoMergeOpPercentage = 20;
// Percentage of keys that will be deleted
const int kDeletePercentage = 10;
// For half of the key range we will write multiple deletes first to
// force DBIter::Prev() to fall back to Seek()
for (int file_num = 0; file_num < 10; file_num++) {
for (int i = 0; i < kNumKeys; i += 2) {
ASSERT_OK(Delete(Key(i)));
}
ASSERT_OK(Flush());
}
Random rnd(301);
std::map<std::string, std::string> true_data;
std::string gen_key;
std::string gen_val;
for (int i = 0; i < kNumKeys; i++) {
gen_key = Key(i);
gen_val = RandomString(&rnd, kValSize);
ASSERT_OK(Put(gen_key, gen_val));
true_data[gen_key] = gen_val;
}
ASSERT_OK(Flush());
// Separate values and merge operands in different file so that we
// make sure that we dont merge them while flushing but actually
// merge them in the read path
for (int i = 0; i < kNumKeys; i++) {
if (rnd.OneIn(static_cast<int>(100.0 / kNoMergeOpPercentage))) {
// Dont give merge operations for some keys
continue;
}
for (int j = 0; j < kNumMergeOperands; j++) {
gen_key = Key(i);
gen_val = RandomString(&rnd, kValSize);
ASSERT_OK(db_->Merge(WriteOptions(), gen_key, gen_val));
true_data[gen_key] += "," + gen_val;
}
}
ASSERT_OK(Flush());
for (int i = 0; i < kNumKeys; i++) {
if (rnd.OneIn(static_cast<int>(100.0 / kDeletePercentage))) {
gen_key = Key(i);
ASSERT_OK(Delete(gen_key));
true_data.erase(gen_key);
}
}
ASSERT_OK(Flush());
{
ReadOptions ro;
ro.fill_cache = false;
Iterator* iter = db_->NewIterator(ro);
auto data_iter = true_data.rbegin();
for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
ASSERT_EQ(iter->key().ToString(), data_iter->first);
ASSERT_EQ(iter->value().ToString(), data_iter->second);
data_iter++;
}
ASSERT_EQ(data_iter, true_data.rend());
delete iter;
}
}
TEST_F(DBIteratorTest, IteratorWithLocalStatistics) {
Options options = CurrentOptions();
options.statistics = rocksdb::CreateDBStatistics();

Loading…
Cancel
Save