Fix backward iteration issue when user defined timestamp is enabled in BlobDB (#11258)

Summary:
During backward iteration, blob verification would fail because the user key (ts included) in `saved_key_` doesn't match the blob. This happens because during`FindValueForCurrentKey`, `saved_key_` is not updated when the user key(ts not included) is the same for all cases except when `timestamp_lb_` is specified. This breaks the blob verification logic when user defined timestamp is enabled and `timestamp_lb_` is not specified. Fix this by always updating `saved_key_` when a smaller user key (ts included) is seen.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/11258

Test Plan:
`make check`
`./db_blob_basic_test --gtest_filter=DBBlobWithTimestampTest.IterateBlobs`

Run db_bench (built with DEBUG_LEVEL=0) to demonstrate that no overhead is introduced with:

`./db_bench -user_timestamp_size=8  -db=/dev/shm/rocksdb -disable_wal=1 -benchmarks=fillseq,seekrandom[-W1-X6] -reverse_iterator=1 -seek_nexts=5`

Baseline:

- seekrandom [AVG    6 runs] : 72188 (± 1481) ops/sec;   37.2 (± 0.8) MB/sec

With this PR:

- seekrandom [AVG    6 runs] : 74171 (± 1427) ops/sec;   38.2 (± 0.7) MB/sec

Reviewed By: ltamasi

Differential Revision: D43675642

Pulled By: jowlyzhang

fbshipit-source-id: 8022ae8522d1f66548821855e6eed63640c14e04
oxigraph-8.1.1
Yu Zhang 2 years ago committed by Facebook GitHub Bot
parent cf09917c18
commit 8dfcfd4e90
  1. 2
      HISTORY.md
  2. 110
      db/blob/db_blob_basic_test.cc
  3. 15
      db/db_iter.cc
  4. 3
      db/db_iter.h

@ -4,7 +4,7 @@
* Compaction output file cutting logic now considers range tombstone start keys. For example, SST partitioner now may receive ParitionRequest for range tombstone start keys.
### Bug Fixes
* Fixed an issue for backward iteration when `ReadOptions::iter_start_ts` is specified in combination with BlobDB.
* Fixed an issue for backward iteration when user defined timestamp is enabled in combination with BlobDB.
### New Features
* Add statistics rocksdb.secondary.cache.filter.hits, rocksdb.secondary.cache.index.hits, and rocksdb.secondary.cache.filter.hits

@ -2044,27 +2044,115 @@ TEST_F(DBBlobWithTimestampTest, IterateBlobs) {
ReadOptions read_options;
std::vector<std::string> read_timestamps = {Timestamp(0, 0), Timestamp(3, 0)};
Slice ts_lower_bound(read_timestamps[0]);
Slice ts_upper_bound(read_timestamps[1]);
read_options.iter_start_ts = &ts_lower_bound;
read_options.timestamp = &ts_upper_bound;
auto check_iter_entry =
[](const Iterator* iter, const std::string& expected_key,
const std::string& expected_ts, const std::string& expected_value) {
const std::string& expected_ts, const std::string& expected_value,
bool key_is_internal = true) {
ASSERT_OK(iter->status());
std::string expected_ukey_and_ts;
expected_ukey_and_ts.assign(expected_key.data(), expected_key.size());
expected_ukey_and_ts.append(expected_ts.data(), expected_ts.size());
ParsedInternalKey parsed_ikey;
ASSERT_OK(ParseInternalKey(iter->key(), &parsed_ikey,
true /* log_err_key */));
ASSERT_EQ(parsed_ikey.user_key, expected_ukey_and_ts);
if (key_is_internal) {
std::string expected_ukey_and_ts;
expected_ukey_and_ts.assign(expected_key.data(), expected_key.size());
expected_ukey_and_ts.append(expected_ts.data(), expected_ts.size());
ParsedInternalKey parsed_ikey;
ASSERT_OK(ParseInternalKey(iter->key(), &parsed_ikey,
true /* log_err_key */));
ASSERT_EQ(parsed_ikey.user_key, expected_ukey_and_ts);
} else {
ASSERT_EQ(iter->key(), expected_key);
}
ASSERT_EQ(iter->timestamp(), expected_ts);
ASSERT_EQ(iter->value(), expected_value);
};
// Forward iterating one version of each key, get in this order:
// [("key0", Timestamp(2, 0), "blob01"),
// ("key1", Timestamp(2, 0), "blob11")...]
{
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
ASSERT_OK(iter->status());
iter->SeekToFirst();
for (int i = 0; i < num_blobs; i++) {
check_iter_entry(iter.get(), keys[i], write_timestamps[1],
blobs[i] + std::to_string(1), /*key_is_internal*/ false);
iter->Next();
}
}
// Forward iteration, then reverse to backward.
{
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
ASSERT_OK(iter->status());
iter->SeekToFirst();
for (int i = 0; i < num_blobs * 2 - 1; i++) {
if (i < num_blobs) {
check_iter_entry(iter.get(), keys[i], write_timestamps[1],
blobs[i] + std::to_string(1),
/*key_is_internal*/ false);
if (i != num_blobs - 1) {
iter->Next();
}
} else {
if (i != num_blobs) {
check_iter_entry(iter.get(), keys[num_blobs * 2 - 1 - i],
write_timestamps[1],
blobs[num_blobs * 2 - 1 - i] + std::to_string(1),
/*key_is_internal*/ false);
}
iter->Prev();
}
}
}
// Backward iterating one versions of each key, get in this order:
// [("key4", Timestamp(2, 0), "blob41"),
// ("key3", Timestamp(2, 0), "blob31")...]
{
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
ASSERT_OK(iter->status());
iter->SeekToLast();
for (int i = 0; i < num_blobs; i++) {
check_iter_entry(iter.get(), keys[num_blobs - 1 - i], write_timestamps[1],
blobs[num_blobs - 1 - i] + std::to_string(1),
/*key_is_internal*/ false);
iter->Prev();
}
}
// Backward iteration, then reverse to forward.
{
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
ASSERT_OK(iter->status());
iter->SeekToLast();
for (int i = 0; i < num_blobs * 2 - 1; i++) {
if (i < num_blobs) {
check_iter_entry(iter.get(), keys[num_blobs - 1 - i],
write_timestamps[1],
blobs[num_blobs - 1 - i] + std::to_string(1),
/*key_is_internal*/ false);
if (i != num_blobs - 1) {
iter->Prev();
}
} else {
if (i != num_blobs) {
check_iter_entry(iter.get(), keys[i - num_blobs], write_timestamps[1],
blobs[i - num_blobs] + std::to_string(1),
/*key_is_internal*/ false);
}
iter->Next();
}
}
}
Slice ts_lower_bound(read_timestamps[0]);
read_options.iter_start_ts = &ts_lower_bound;
// Forward iterating multiple versions of the same key, get in this order:
// [("key0", Timestamp(2, 0), "blob01"),
// ("key0", Timestamp(1, 0), "blob00"),

@ -877,9 +877,14 @@ bool DBIter::FindValueForCurrentKey() {
if (timestamp_lb_ != nullptr) {
// Only needed when timestamp_lb_ is not null
[[maybe_unused]] const bool ret = ParseKey(&ikey_);
saved_ikey_.assign(iter_.key().data(), iter_.key().size());
// Since the preceding ParseKey(&ikey) succeeds, so must this.
assert(ret);
saved_key_.SetInternalKey(ikey);
} else if (user_comparator_.Compare(ikey.user_key,
saved_key_.GetUserKey()) < 0) {
saved_key_.SetUserKey(
ikey.user_key,
!pin_thru_lifetime_ || !iter_.iter()->IsKeyPinned() /* copy */);
}
valid_entry_seen = true;
@ -964,7 +969,6 @@ bool DBIter::FindValueForCurrentKey() {
if (timestamp_lb_ == nullptr) {
valid_ = false;
} else {
saved_key_.SetInternalKey(saved_ikey_);
valid_ = true;
}
return true;
@ -1010,17 +1014,10 @@ bool DBIter::FindValueForCurrentKey() {
}
break;
case kTypeValue:
if (timestamp_lb_ != nullptr) {
saved_key_.SetInternalKey(saved_ikey_);
}
SetValueAndColumnsFromPlain(pinned_value_);
break;
case kTypeBlobIndex:
if (timestamp_lb_ != nullptr) {
saved_key_.SetInternalKey(saved_ikey_);
}
if (!SetBlobValueIfNeeded(saved_key_.GetUserKey(), pinned_value_)) {
return false;
}

@ -394,9 +394,6 @@ class DBIter final : public Iterator {
const Slice* const timestamp_lb_;
const size_t timestamp_size_;
std::string saved_timestamp_;
// Used only if timestamp_lb_ is not nullptr.
std::string saved_ikey_;
};
// Return a new iterator that converts internal keys (yielded by

Loading…
Cancel
Save