From 8dfcfd4e90c61b38691c9acd7ce49accd5d0a071 Mon Sep 17 00:00:00 2001 From: Yu Zhang Date: Wed, 1 Mar 2023 13:28:54 -0800 Subject: [PATCH] Fix backward iteration issue when user defined timestamp is enabled in BlobDB (#11258) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: During backward iteration, blob verification would fail because the user key (ts included) in `saved_key_` doesn't match the blob. This happens because during`FindValueForCurrentKey`, `saved_key_` is not updated when the user key(ts not included) is the same for all cases except when `timestamp_lb_` is specified. This breaks the blob verification logic when user defined timestamp is enabled and `timestamp_lb_` is not specified. Fix this by always updating `saved_key_` when a smaller user key (ts included) is seen. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11258 Test Plan: `make check` `./db_blob_basic_test --gtest_filter=DBBlobWithTimestampTest.IterateBlobs` Run db_bench (built with DEBUG_LEVEL=0) to demonstrate that no overhead is introduced with: `./db_bench -user_timestamp_size=8 -db=/dev/shm/rocksdb -disable_wal=1 -benchmarks=fillseq,seekrandom[-W1-X6] -reverse_iterator=1 -seek_nexts=5` Baseline: - seekrandom [AVG 6 runs] : 72188 (± 1481) ops/sec; 37.2 (± 0.8) MB/sec With this PR: - seekrandom [AVG 6 runs] : 74171 (± 1427) ops/sec; 38.2 (± 0.7) MB/sec Reviewed By: ltamasi Differential Revision: D43675642 Pulled By: jowlyzhang fbshipit-source-id: 8022ae8522d1f66548821855e6eed63640c14e04 --- HISTORY.md | 2 +- db/blob/db_blob_basic_test.cc | 110 ++++++++++++++++++++++++++++++---- db/db_iter.cc | 15 ++--- db/db_iter.h | 3 - 4 files changed, 106 insertions(+), 24 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 6a8594233..e1cf2816c 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -4,7 +4,7 @@ * Compaction output file cutting logic now considers range tombstone start keys. For example, SST partitioner now may receive ParitionRequest for range tombstone start keys. ### Bug Fixes -* Fixed an issue for backward iteration when `ReadOptions::iter_start_ts` is specified in combination with BlobDB. +* Fixed an issue for backward iteration when user defined timestamp is enabled in combination with BlobDB. ### New Features * Add statistics rocksdb.secondary.cache.filter.hits, rocksdb.secondary.cache.index.hits, and rocksdb.secondary.cache.filter.hits diff --git a/db/blob/db_blob_basic_test.cc b/db/blob/db_blob_basic_test.cc index 4eb5f2107..c6c6d2b93 100644 --- a/db/blob/db_blob_basic_test.cc +++ b/db/blob/db_blob_basic_test.cc @@ -2044,27 +2044,115 @@ TEST_F(DBBlobWithTimestampTest, IterateBlobs) { ReadOptions read_options; std::vector read_timestamps = {Timestamp(0, 0), Timestamp(3, 0)}; - Slice ts_lower_bound(read_timestamps[0]); Slice ts_upper_bound(read_timestamps[1]); - read_options.iter_start_ts = &ts_lower_bound; read_options.timestamp = &ts_upper_bound; auto check_iter_entry = [](const Iterator* iter, const std::string& expected_key, - const std::string& expected_ts, const std::string& expected_value) { + const std::string& expected_ts, const std::string& expected_value, + bool key_is_internal = true) { ASSERT_OK(iter->status()); - std::string expected_ukey_and_ts; - expected_ukey_and_ts.assign(expected_key.data(), expected_key.size()); - expected_ukey_and_ts.append(expected_ts.data(), expected_ts.size()); - - ParsedInternalKey parsed_ikey; - ASSERT_OK(ParseInternalKey(iter->key(), &parsed_ikey, - true /* log_err_key */)); - ASSERT_EQ(parsed_ikey.user_key, expected_ukey_and_ts); + if (key_is_internal) { + std::string expected_ukey_and_ts; + expected_ukey_and_ts.assign(expected_key.data(), expected_key.size()); + expected_ukey_and_ts.append(expected_ts.data(), expected_ts.size()); + + ParsedInternalKey parsed_ikey; + ASSERT_OK(ParseInternalKey(iter->key(), &parsed_ikey, + true /* log_err_key */)); + ASSERT_EQ(parsed_ikey.user_key, expected_ukey_and_ts); + } else { + ASSERT_EQ(iter->key(), expected_key); + } ASSERT_EQ(iter->timestamp(), expected_ts); ASSERT_EQ(iter->value(), expected_value); }; + // Forward iterating one version of each key, get in this order: + // [("key0", Timestamp(2, 0), "blob01"), + // ("key1", Timestamp(2, 0), "blob11")...] + { + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); + + iter->SeekToFirst(); + for (int i = 0; i < num_blobs; i++) { + check_iter_entry(iter.get(), keys[i], write_timestamps[1], + blobs[i] + std::to_string(1), /*key_is_internal*/ false); + iter->Next(); + } + } + + // Forward iteration, then reverse to backward. + { + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); + + iter->SeekToFirst(); + for (int i = 0; i < num_blobs * 2 - 1; i++) { + if (i < num_blobs) { + check_iter_entry(iter.get(), keys[i], write_timestamps[1], + blobs[i] + std::to_string(1), + /*key_is_internal*/ false); + if (i != num_blobs - 1) { + iter->Next(); + } + } else { + if (i != num_blobs) { + check_iter_entry(iter.get(), keys[num_blobs * 2 - 1 - i], + write_timestamps[1], + blobs[num_blobs * 2 - 1 - i] + std::to_string(1), + /*key_is_internal*/ false); + } + iter->Prev(); + } + } + } + + // Backward iterating one versions of each key, get in this order: + // [("key4", Timestamp(2, 0), "blob41"), + // ("key3", Timestamp(2, 0), "blob31")...] + { + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); + + iter->SeekToLast(); + for (int i = 0; i < num_blobs; i++) { + check_iter_entry(iter.get(), keys[num_blobs - 1 - i], write_timestamps[1], + blobs[num_blobs - 1 - i] + std::to_string(1), + /*key_is_internal*/ false); + iter->Prev(); + } + } + + // Backward iteration, then reverse to forward. + { + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); + + iter->SeekToLast(); + for (int i = 0; i < num_blobs * 2 - 1; i++) { + if (i < num_blobs) { + check_iter_entry(iter.get(), keys[num_blobs - 1 - i], + write_timestamps[1], + blobs[num_blobs - 1 - i] + std::to_string(1), + /*key_is_internal*/ false); + if (i != num_blobs - 1) { + iter->Prev(); + } + } else { + if (i != num_blobs) { + check_iter_entry(iter.get(), keys[i - num_blobs], write_timestamps[1], + blobs[i - num_blobs] + std::to_string(1), + /*key_is_internal*/ false); + } + iter->Next(); + } + } + } + + Slice ts_lower_bound(read_timestamps[0]); + read_options.iter_start_ts = &ts_lower_bound; // Forward iterating multiple versions of the same key, get in this order: // [("key0", Timestamp(2, 0), "blob01"), // ("key0", Timestamp(1, 0), "blob00"), diff --git a/db/db_iter.cc b/db/db_iter.cc index 5ba0b3a0a..32d2ba8a6 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -877,9 +877,14 @@ bool DBIter::FindValueForCurrentKey() { if (timestamp_lb_ != nullptr) { // Only needed when timestamp_lb_ is not null [[maybe_unused]] const bool ret = ParseKey(&ikey_); - saved_ikey_.assign(iter_.key().data(), iter_.key().size()); // Since the preceding ParseKey(&ikey) succeeds, so must this. assert(ret); + saved_key_.SetInternalKey(ikey); + } else if (user_comparator_.Compare(ikey.user_key, + saved_key_.GetUserKey()) < 0) { + saved_key_.SetUserKey( + ikey.user_key, + !pin_thru_lifetime_ || !iter_.iter()->IsKeyPinned() /* copy */); } valid_entry_seen = true; @@ -964,7 +969,6 @@ bool DBIter::FindValueForCurrentKey() { if (timestamp_lb_ == nullptr) { valid_ = false; } else { - saved_key_.SetInternalKey(saved_ikey_); valid_ = true; } return true; @@ -1010,17 +1014,10 @@ bool DBIter::FindValueForCurrentKey() { } break; case kTypeValue: - if (timestamp_lb_ != nullptr) { - saved_key_.SetInternalKey(saved_ikey_); - } - SetValueAndColumnsFromPlain(pinned_value_); break; case kTypeBlobIndex: - if (timestamp_lb_ != nullptr) { - saved_key_.SetInternalKey(saved_ikey_); - } if (!SetBlobValueIfNeeded(saved_key_.GetUserKey(), pinned_value_)) { return false; } diff --git a/db/db_iter.h b/db/db_iter.h index 0fb766fd2..e1663bb7e 100644 --- a/db/db_iter.h +++ b/db/db_iter.h @@ -394,9 +394,6 @@ class DBIter final : public Iterator { const Slice* const timestamp_lb_; const size_t timestamp_size_; std::string saved_timestamp_; - - // Used only if timestamp_lb_ is not nullptr. - std::string saved_ikey_; }; // Return a new iterator that converts internal keys (yielded by