From 06b04127a836ef67b1c6ec3d906668ed4fb199ec Mon Sep 17 00:00:00 2001 From: Levi Tamasi Date: Tue, 9 Aug 2022 11:39:57 -0700 Subject: [PATCH] Reset blob value as soon as it's not needed in DBIter (#10490) Summary: We have recently added caching support to BlobDB, and separately, implemented an optimization where reading blobs from the cache results in the cache handle being transferred to the target `PinnableSlice` (as opposed to the contents getting copied). With these changes, it makes sense to reset the `PinnableSlice` storing the blob value in `DBIter` as soon as we move to a different iterator position to prevent us from holding on to the cache handle any longer than necessary. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10490 Test Plan: `make check` Reviewed By: akankshamahajan15 Differential Revision: D38473630 Pulled By: ltamasi fbshipit-source-id: 84c045ffac76436c6152fd0f5775b007f4051386 --- HISTORY.md | 1 + db/blob/db_blob_basic_test.cc | 142 ++++++++++++++++++++++++++++++++++ db/db_iter.cc | 26 +++++-- db/db_iter.h | 5 ++ 4 files changed, 168 insertions(+), 6 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 478c0fdf5..bfe72830d 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -34,6 +34,7 @@ ### Performance Improvements * Instead of constructing `FragmentedRangeTombstoneList` during every read operation, it is now constructed once and stored in immutable memtables. This improves speed of querying range tombstones from immutable memtables. * Improve read performance by avoiding dynamic memory allocation. +* When using iterators with the integrated BlobDB implementation, blob cache handles are now released immediately when the iterator's position changes. 
## 7.5.0 (07/15/2022) ### New Features diff --git a/db/blob/db_blob_basic_test.cc b/db/blob/db_blob_basic_test.cc index 95ed18253..e3fa4a669 100644 --- a/db/blob/db_blob_basic_test.cc +++ b/db/blob/db_blob_basic_test.cc @@ -227,6 +227,148 @@ TEST_F(DBBlobBasicTest, IterateBlobsFromCache) { } } +TEST_F(DBBlobBasicTest, IterateBlobsFromCachePinning) { + constexpr size_t min_blob_size = 6; + + Options options = GetDefaultOptions(); + + LRUCacheOptions cache_options; + cache_options.capacity = 2048; + cache_options.num_shard_bits = 0; + cache_options.metadata_charge_policy = kDontChargeCacheMetadata; + + options.blob_cache = NewLRUCache(cache_options); + options.enable_blob_files = true; + options.min_blob_size = min_blob_size; + + Reopen(options); + + // Put then iterate over three key-values. The second value is below the size + // limit and is thus stored inline; the other two are stored separately as + // blobs. We expect to have something pinned in the cache iff we are + // positioned on a blob. 
+ + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "long_value"; + static_assert(sizeof(first_value) - 1 >= min_blob_size, + "first_value too short to be stored as blob"); + + ASSERT_OK(Put(first_key, first_value)); + + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "short"; + static_assert(sizeof(second_value) - 1 < min_blob_size, + "second_value too long to be inlined"); + + ASSERT_OK(Put(second_key, second_value)); + + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "other_long_value"; + static_assert(sizeof(third_value) - 1 >= min_blob_size, + "third_value too short to be stored as blob"); + + ASSERT_OK(Put(third_key, third_value)); + + ASSERT_OK(Flush()); + + { + ReadOptions read_options; + read_options.fill_cache = true; + + std::unique_ptr<Iterator> iter(db_->NewIterator(read_options)); + + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), first_key); + ASSERT_EQ(iter->value(), first_value); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), second_key); + ASSERT_EQ(iter->value(), second_value); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), third_key); + ASSERT_EQ(iter->value(), third_value); + + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + } + + { + ReadOptions read_options; + read_options.fill_cache = false; + read_options.read_tier = kBlockCacheTier; + + std::unique_ptr<Iterator> iter(db_->NewIterator(read_options)); + + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), first_key); + ASSERT_EQ(iter->value(), first_value); + ASSERT_GT(options.blob_cache->GetPinnedUsage(), 0); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), second_key); + ASSERT_EQ(iter->value(), second_value); + 
ASSERT_EQ(options.blob_cache->GetPinnedUsage(), 0); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), third_key); + ASSERT_EQ(iter->value(), third_value); + ASSERT_GT(options.blob_cache->GetPinnedUsage(), 0); + + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(options.blob_cache->GetPinnedUsage(), 0); + } + + { + ReadOptions read_options; + read_options.fill_cache = false; + read_options.read_tier = kBlockCacheTier; + + std::unique_ptr<Iterator> iter(db_->NewIterator(read_options)); + + iter->SeekToLast(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), third_key); + ASSERT_EQ(iter->value(), third_value); + ASSERT_GT(options.blob_cache->GetPinnedUsage(), 0); + + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), second_key); + ASSERT_EQ(iter->value(), second_value); + ASSERT_EQ(options.blob_cache->GetPinnedUsage(), 0); + + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), first_key); + ASSERT_EQ(iter->value(), first_value); + ASSERT_GT(options.blob_cache->GetPinnedUsage(), 0); + + iter->Prev(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(options.blob_cache->GetPinnedUsage(), 0); + } +} + TEST_F(DBBlobBasicTest, MultiGetBlobs) { constexpr size_t min_blob_size = 6; diff --git a/db/db_iter.cc b/db/db_iter.cc index 4f5c5b08b..c9fdfab45 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -134,6 +134,7 @@ void DBIter::Next() { PERF_CPU_TIMER_GUARD(iter_next_cpu_nanos, clock_); // Release temporarily pinned blocks from last operation ReleaseTempPinnedData(); + ResetBlobValue(); ResetWideColumnValue(); local_stats_.skip_count_ += num_internal_keys_skipped_; local_stats_.skip_count_--; @@ -177,6 +178,7 @@ void DBIter::Next() { bool DBIter::SetBlobValueIfNeeded(const Slice& user_key, const Slice& blob_index) { assert(!is_blob_); + 
assert(blob_value_.empty()); assert(!is_wide_); assert(value_of_default_column_.empty()); @@ -216,6 +218,7 @@ bool DBIter::SetBlobValueIfNeeded(const Slice& user_key, bool DBIter::SetWideColumnValueIfNeeded(const Slice& wide_columns_slice) { assert(!is_blob_); + assert(blob_value_.empty()); assert(!is_wide_); assert(value_of_default_column_.empty()); @@ -280,7 +283,8 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key, // to one. bool reseek_done = false; - is_blob_ = false; + assert(!is_blob_); + assert(blob_value_.empty()); assert(!is_wide_); assert(value_of_default_column_.empty()); @@ -611,7 +615,7 @@ bool DBIter::MergeValuesNewToOld() { return false; } - is_blob_ = false; + ResetBlobValue(); assert(!is_wide_); assert(value_of_default_column_.empty()); @@ -660,6 +664,7 @@ void DBIter::Prev() { PERF_CPU_TIMER_GUARD(iter_prev_cpu_nanos, clock_); ReleaseTempPinnedData(); + ResetBlobValue(); ResetWideColumnValue(); ResetInternalKeysSkippedCounter(); bool ok = true; @@ -990,7 +995,9 @@ bool DBIter::FindValueForCurrentKey() { Status s; s.PermitUncheckedError(); - is_blob_ = false; + + assert(!is_blob_); + assert(blob_value_.empty()); assert(!is_wide_); assert(value_of_default_column_.empty()); @@ -1033,7 +1040,7 @@ bool DBIter::FindValueForCurrentKey() { return false; } - is_blob_ = false; + ResetBlobValue(); assert(!is_wide_); assert(value_of_default_column_.empty()); @@ -1111,7 +1118,9 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { // In case read_callback presents, the value we seek to may not be visible. // Find the next value that's visible. 
ParsedInternalKey ikey; - is_blob_ = false; + + assert(!is_blob_); + assert(blob_value_.empty()); assert(!is_wide_); assert(value_of_default_column_.empty()); @@ -1250,7 +1259,7 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { return false; } - is_blob_ = false; + ResetBlobValue(); assert(!is_wide_); assert(value_of_default_column_.empty()); @@ -1478,6 +1487,7 @@ void DBIter::Seek(const Slice& target) { status_ = Status::OK(); ReleaseTempPinnedData(); + ResetBlobValue(); ResetWideColumnValue(); ResetInternalKeysSkippedCounter(); @@ -1555,6 +1565,7 @@ void DBIter::SeekForPrev(const Slice& target) { status_ = Status::OK(); ReleaseTempPinnedData(); + ResetBlobValue(); ResetWideColumnValue(); ResetInternalKeysSkippedCounter(); @@ -1613,6 +1624,7 @@ void DBIter::SeekToFirst() { status_ = Status::OK(); direction_ = kForward; ReleaseTempPinnedData(); + ResetBlobValue(); ResetWideColumnValue(); ResetInternalKeysSkippedCounter(); ClearSavedValue(); @@ -1661,6 +1673,7 @@ void DBIter::SeekToLast() { *iterate_upper_bound_, /*a_has_ts=*/false, k, /*b_has_ts=*/false)) { ReleaseTempPinnedData(); + ResetBlobValue(); ResetWideColumnValue(); PrevInternal(nullptr); @@ -1681,6 +1694,7 @@ void DBIter::SeekToLast() { status_ = Status::OK(); direction_ = kReverse; ReleaseTempPinnedData(); + ResetBlobValue(); ResetWideColumnValue(); ResetInternalKeysSkippedCounter(); ClearSavedValue(); diff --git a/db/db_iter.h b/db/db_iter.h index 5e0ff2031..998486fd4 100644 --- a/db/db_iter.h +++ b/db/db_iter.h @@ -303,6 +303,11 @@ class DBIter final : public Iterator { // index when using the integrated BlobDB implementation. bool SetBlobValueIfNeeded(const Slice& user_key, const Slice& blob_index); + void ResetBlobValue() { + is_blob_ = false; + blob_value_.Reset(); + } + bool SetWideColumnValueIfNeeded(const Slice& wide_columns_slice); void ResetWideColumnValue() {