From f007b8fdea38120fbb3868b354b22e93c6d97108 Mon Sep 17 00:00:00 2001 From: Yu Zhang Date: Wed, 22 Feb 2023 15:44:59 -0800 Subject: [PATCH] Support iter_start_ts in integrated BlobDB (#11244) Summary: Fixed an issue during backward iteration when `iter_start_ts` is set in an integrated BlobDB. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11244 Test Plan: ```make check ./db_blob_basic_test --gtest_filter="DBBlobWithTimestampTest.IterateBlobs" tools/db_crashtest.py --stress_cmd=./db_stress --cleanup_cmd='' --enable_ts whitebox --random_kill_odd 888887 --enable_blob_files=1``` Reviewed By: ltamasi Differential Revision: D43506726 Pulled By: jowlyzhang fbshipit-source-id: 2cdc19ebf8da909d8d43d621353905784949a9f0 --- HISTORY.md | 3 + db/blob/db_blob_basic_test.cc | 128 ++++++++++++++++++++++++++ db/db_iter.cc | 3 + db_stress_tool/db_stress_test_base.cc | 4 +- 4 files changed, 135 insertions(+), 3 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 551537e54..db2c08021 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -3,6 +3,9 @@ ### Behavior changes * Compaction output file cutting logic now considers range tombstone start keys. For example, SST partitioner now may receive ParitionRequest for range tombstone start keys. +### Bug Fixes +* Fixed an issue for backward iteration when `ReadOptions::iter_start_ts` is specified in combination with BlobDB. + ## 8.0.0 (02/19/2023) ### Behavior changes * `ReadOptions::verify_checksums=false` disables checksum verification for more reads of non-`CacheEntryRole::kDataBlock` blocks. diff --git a/db/blob/db_blob_basic_test.cc b/db/blob/db_blob_basic_test.cc index 690a19775..4eb5f2107 100644 --- a/db/blob/db_blob_basic_test.cc +++ b/db/blob/db_blob_basic_test.cc @@ -2011,6 +2011,134 @@ TEST_F(DBBlobWithTimestampTest, MultiGetMergeBlobWithPut) { ASSERT_EQ(values[2], "v2_0"); } +TEST_F(DBBlobWithTimestampTest, IterateBlobs) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + + DestroyAndReopen(options); + + int num_blobs = 5; + std::vector keys; + std::vector blobs; + + WriteOptions write_opts; + std::vector write_timestamps = {Timestamp(1, 0), + Timestamp(2, 0)}; + + // For each key in ["key0", ... "keyi", ...], write two versions: + // Timestamp(1, 0), "blobi0" + // Timestamp(2, 0), "blobi1" + for (int i = 0; i < num_blobs; i++) { + keys.push_back("key" + std::to_string(i)); + blobs.push_back("blob" + std::to_string(i)); + for (size_t j = 0; j < write_timestamps.size(); j++) { + ASSERT_OK(db_->Put(write_opts, keys[i], write_timestamps[j], + blobs[i] + std::to_string(j))); + } + } + ASSERT_OK(Flush()); + + ReadOptions read_options; + std::vector read_timestamps = {Timestamp(0, 0), Timestamp(3, 0)}; + Slice ts_lower_bound(read_timestamps[0]); + Slice ts_upper_bound(read_timestamps[1]); + read_options.iter_start_ts = &ts_lower_bound; + read_options.timestamp = &ts_upper_bound; + + auto check_iter_entry = + [](const Iterator* iter, const std::string& expected_key, + const std::string& expected_ts, const std::string& expected_value) { + ASSERT_OK(iter->status()); + std::string expected_ukey_and_ts; + expected_ukey_and_ts.assign(expected_key.data(), expected_key.size()); + expected_ukey_and_ts.append(expected_ts.data(), expected_ts.size()); + + ParsedInternalKey parsed_ikey; + ASSERT_OK(ParseInternalKey(iter->key(), &parsed_ikey, + true /* log_err_key */)); + ASSERT_EQ(parsed_ikey.user_key, expected_ukey_and_ts); + ASSERT_EQ(iter->timestamp(), expected_ts); + ASSERT_EQ(iter->value(), expected_value); + }; + + // Forward iterating multiple versions of the same key, get in this order: + // [("key0", Timestamp(2, 0), "blob01"), + // ("key0", Timestamp(1, 0), "blob00"), + // ("key1", Timestamp(2, 0), "blob11")...] + { + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); + + iter->SeekToFirst(); + for (int i = 0; i < num_blobs; i++) { + for (size_t j = write_timestamps.size(); j > 0; --j) { + check_iter_entry(iter.get(), keys[i], write_timestamps[j - 1], + blobs[i] + std::to_string(j - 1)); + iter->Next(); + } + } + } + + // Backward iterating multiple versions of the same key, get in this order: + // [("key4", Timestamp(1, 0), "blob00"), + // ("key4", Timestamp(2, 0), "blob01"), + // ("key3", Timestamp(1, 0), "blob10")...] + { + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); + + iter->SeekToLast(); + for (int i = num_blobs; i > 0; i--) { + for (size_t j = 0; j < write_timestamps.size(); j++) { + check_iter_entry(iter.get(), keys[i - 1], write_timestamps[j], + blobs[i - 1] + std::to_string(j)); + iter->Prev(); + } + } + } + + int upper_bound_idx = num_blobs - 2; + int lower_bound_idx = 1; + Slice upper_bound_slice(keys[upper_bound_idx]); + Slice lower_bound_slice(keys[lower_bound_idx]); + read_options.iterate_upper_bound = &upper_bound_slice; + read_options.iterate_lower_bound = &lower_bound_slice; + + // Forward iteration with upper and lower bound. + { + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); + + iter->SeekToFirst(); + for (int i = lower_bound_idx; i < upper_bound_idx; i++) { + for (size_t j = write_timestamps.size(); j > 0; --j) { + check_iter_entry(iter.get(), keys[i], write_timestamps[j - 1], + blobs[i] + std::to_string(j - 1)); + iter->Next(); + } + } + } + + // Backward iteration with upper and lower bound. + { + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); + + iter->SeekToLast(); + for (int i = upper_bound_idx; i > lower_bound_idx; i--) { + for (size_t j = 0; j < write_timestamps.size(); j++) { + check_iter_entry(iter.get(), keys[i - 1], write_timestamps[j], + blobs[i - 1] + std::to_string(j)); + iter->Prev(); + } + } + } +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_iter.cc b/db/db_iter.cc index dff251a40..5ba0b3a0a 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -1018,6 +1018,9 @@ bool DBIter::FindValueForCurrentKey() { break; case kTypeBlobIndex: + if (timestamp_lb_ != nullptr) { + saved_key_.SetInternalKey(saved_ikey_); + } if (!SetBlobValueIfNeeded(saved_key_.GetUserKey(), pinned_value_)) { return false; } diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index 8fe0874f2..b04b29b8c 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -2901,9 +2901,7 @@ void StressTest::MaybeUseOlderTimestampForRangeScan(ThreadState* thread, read_opts.timestamp = &ts_slice; // TODO (yanqin): support Merge with iter_start_ts - // TODO (yuzhangyu): support BlobDB with iter_start_ts - if (!thread->rand.OneInOpt(3) || FLAGS_use_merge || FLAGS_use_full_merge_v1 || - FLAGS_enable_blob_files) { + if (!thread->rand.OneInOpt(3) || FLAGS_use_merge || FLAGS_use_full_merge_v1) { return; }