diff --git a/HISTORY.md b/HISTORY.md index 337981cfc..8ae52b56e 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -3,6 +3,7 @@ ### Bug Fixes * Fixed a major bug in which batched MultiGet could return old values for keys deleted by DeleteRange when memtable Bloom filter is enabled (memtable_prefix_bloom_size_ratio > 0). (The fix includes a substantial MultiGet performance improvement in the unusual case of both memtable_whole_key_filtering and prefix_extractor.) * Fixed more cases of EventListener::OnTableFileCreated called with OK status, file_size==0, and no SST file kept. Now the status is Aborted. +* Fixed a read-after-free bug in `DB::GetMergeOperands()`. ### Performance Improvements * Mitigated the overhead of building the file location hash table used by the online LSM tree consistency checks, which can improve performance for certain workloads (see #9351). diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 03fa01da5..2a822880d 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -1880,11 +1880,12 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key, return s; } } + PinnedIteratorsManager pinned_iters_mgr; if (!done) { PERF_TIMER_GUARD(get_from_output_files_time); sv->current->Get( read_options, lkey, get_impl_options.value, timestamp, &s, - &merge_context, &max_covering_tombstone_seq, + &merge_context, &max_covering_tombstone_seq, &pinned_iters_mgr, get_impl_options.get_value ? get_impl_options.value_found : nullptr, nullptr, nullptr, get_impl_options.get_value ? get_impl_options.callback : nullptr, @@ -2076,11 +2077,13 @@ std::vector DBImpl::MultiGet( if (!done) { PinnableSlice pinnable_val; PERF_TIMER_GUARD(get_from_output_files_time); - super_version->current->Get( - read_options, lkey, &pinnable_val, timestamp, &s, &merge_context, - &max_covering_tombstone_seq, /*value_found=*/nullptr, - /*key_exists=*/nullptr, - /*seq=*/nullptr, read_callback); + PinnedIteratorsManager pinned_iters_mgr; + super_version->current->Get(read_options, lkey, &pinnable_val, timestamp, + &s, &merge_context, + &max_covering_tombstone_seq, + &pinned_iters_mgr, /*value_found=*/nullptr, + /*key_exists=*/nullptr, + /*seq=*/nullptr, read_callback); value->assign(pinnable_val.data(), pinnable_val.size()); RecordTick(stats_, MEMTABLE_MISS); } @@ -4573,10 +4576,12 @@ Status DBImpl::GetLatestSequenceForKey( // SST files if cache_only=true? if (!cache_only) { // Check tables + PinnedIteratorsManager pinned_iters_mgr; sv->current->Get(read_options, lkey, /*value=*/nullptr, timestamp, &s, &merge_context, &max_covering_tombstone_seq, - nullptr /* value_found */, found_record_for_key, seq, - nullptr /*read_callback*/, is_blob_index); + &pinned_iters_mgr, nullptr /* value_found */, + found_record_for_key, seq, nullptr /*read_callback*/, + is_blob_index); if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) { // unexpected error reading SST files diff --git a/db/db_impl/db_impl_readonly.cc b/db/db_impl/db_impl_readonly.cc index d4a92db08..96d409e8a 100644 --- a/db/db_impl/db_impl_readonly.cc +++ b/db/db_impl/db_impl_readonly.cc @@ -67,9 +67,10 @@ Status DBImplReadOnly::Get(const ReadOptions& read_options, RecordTick(stats_, MEMTABLE_HIT); } else { PERF_TIMER_GUARD(get_from_output_files_time); + PinnedIteratorsManager pinned_iters_mgr; super_version->current->Get(read_options, lkey, pinnable_val, /*timestamp=*/nullptr, &s, &merge_context, - &max_covering_tombstone_seq); + &max_covering_tombstone_seq, &pinned_iters_mgr); RecordTick(stats_, MEMTABLE_MISS); } RecordTick(stats_, NUMBER_KEYS_READ); diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc index 39250ecfa..35d51721f 100644 --- a/db/db_impl/db_impl_secondary.cc +++ b/db/db_impl/db_impl_secondary.cc @@ -385,9 +385,10 @@ Status DBImplSecondary::GetImpl(const ReadOptions& read_options, } if (!done) { PERF_TIMER_GUARD(get_from_output_files_time); + PinnedIteratorsManager pinned_iters_mgr; super_version->current->Get(read_options, lkey, pinnable_val, /*timestamp=*/nullptr, &s, &merge_context, - &max_covering_tombstone_seq); + &max_covering_tombstone_seq, &pinned_iters_mgr); RecordTick(stats_, MEMTABLE_MISS); } { diff --git a/db/db_merge_operand_test.cc b/db/db_merge_operand_test.cc index ecbbae780..f00c6872f 100644 --- a/db/db_merge_operand_test.cc +++ b/db/db_merge_operand_test.cc @@ -47,6 +47,45 @@ class DBMergeOperandTest : public DBTestBase { : DBTestBase("db_merge_operand_test", /*env_do_fsync=*/true) {} }; +TEST_F(DBMergeOperandTest, MergeOperandReadAfterFreeBug) { + // There was a bug of reading merge operands after they are mistakely freed + // in DB::GetMergeOperands, which is surfaced by cache full. + // See PR#9507 for more. + Options options; + options.create_if_missing = true; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + options.env = env_; + BlockBasedTableOptions table_options; + + // Small cache to simulate cache full + table_options.block_cache = NewLRUCache(1); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + Reopen(options); + int num_records = 4; + int number_of_operands = 0; + std::vector values(num_records); + GetMergeOperandsOptions merge_operands_info; + merge_operands_info.expected_max_number_of_operands = num_records; + + ASSERT_OK(Merge("k1", "v1")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k1", "v2")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k1", "v3")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k1", "v4")); + + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k1", values.data(), &merge_operands_info, + &number_of_operands)); + ASSERT_EQ(number_of_operands, 4); + ASSERT_EQ(values[0].ToString(), "v1"); + ASSERT_EQ(values[1].ToString(), "v2"); + ASSERT_EQ(values[2].ToString(), "v3"); + ASSERT_EQ(values[3].ToString(), "v4"); +} + TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) { Options options; options.create_if_missing = true; diff --git a/db/version_set.cc b/db/version_set.cc index 09429ae0b..1bef2a46c 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1965,7 +1965,8 @@ void Version::MultiGetBlob( void Version::Get(const ReadOptions& read_options, const LookupKey& k, PinnableSlice* value, std::string* timestamp, Status* status, MergeContext* merge_context, - SequenceNumber* max_covering_tombstone_seq, bool* value_found, + SequenceNumber* max_covering_tombstone_seq, + PinnedIteratorsManager* pinned_iters_mgr, bool* value_found, bool* key_exists, SequenceNumber* seq, ReadCallback* callback, bool* is_blob, bool do_merge) { Slice ikey = k.internal_key(); @@ -1978,7 +1979,6 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, *key_exists = true; } - PinnedIteratorsManager pinned_iters_mgr; uint64_t tracing_get_id = BlockCacheTraceHelper::kReservedGetId; if (vset_ && vset_->block_cache_tracer_ && vset_->block_cache_tracer_->is_tracing_enabled()) { @@ -1992,17 +1992,18 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, bool* const is_blob_to_use = is_blob ? is_blob : &is_blob_index; BlobFetcher blob_fetcher(this, read_options); + assert(pinned_iters_mgr); GetContext get_context( user_comparator(), merge_operator_, info_log_, db_statistics_, status->ok() ? GetContext::kNotFound : GetContext::kMerge, user_key, do_merge ? value : nullptr, do_merge ? timestamp : nullptr, value_found, merge_context, do_merge, max_covering_tombstone_seq, clock_, seq, - merge_operator_ ? &pinned_iters_mgr : nullptr, callback, is_blob_to_use, + merge_operator_ ? pinned_iters_mgr : nullptr, callback, is_blob_to_use, tracing_get_id, &blob_fetcher); // Pin blocks that we read to hold merge operands if (merge_operator_) { - pinned_iters_mgr.StartPinning(); + pinned_iters_mgr->StartPinning(); } FilePicker fp(user_key, ikey, &storage_info_.level_files_brief_, diff --git a/db/version_set.h b/db/version_set.h index e9d789954..38e6616cf 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -734,9 +734,11 @@ class Version { // If the key has any merge operands then store them in // merge_context.operands_list and don't merge the operands // REQUIRES: lock is not held + // REQUIRES: pinned_iters_mgr != nullptr void Get(const ReadOptions&, const LookupKey& key, PinnableSlice* value, std::string* timestamp, Status* status, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, + PinnedIteratorsManager* pinned_iters_mgr, bool* value_found = nullptr, bool* key_exists = nullptr, SequenceNumber* seq = nullptr, ReadCallback* callback = nullptr, bool* is_blob = nullptr, bool do_merge = true);