From 443d8ef094f9962944af823054865387fd8e8db5 Mon Sep 17 00:00:00 2001 From: Hui Xiao Date: Tue, 15 Feb 2022 12:24:05 -0800 Subject: [PATCH] Fix PinSelf() read-after-free in DB::GetMergeOperands() (#9507) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: **Context:** Running the new test `DBMergeOperandTest.MergeOperandReadAfterFreeBug` prior to this fix surfaces the read-after-free bug of PinSef() as below: ``` READ of size 8 at 0x60400002529d thread T0 https://github.com/facebook/rocksdb/issues/5 0x7f199a in rocksdb::PinnableSlice::PinSelf(rocksdb::Slice const&) include/rocksdb/slice.h:171 https://github.com/facebook/rocksdb/issues/6 0x7f199a in rocksdb::DBImpl::GetImpl(rocksdb::ReadOptions const&, rocksdb::Slice const&, rocksdb::DBImpl::GetImplOptions&) db/db_impl/db_impl.cc:1919 https://github.com/facebook/rocksdb/issues/7 0x540d63 in rocksdb::DBImpl::GetMergeOperands(rocksdb::ReadOptions const&, rocksdb::ColumnFamilyHandle*, rocksdb::Slice const&, rocksdb::PinnableSlice*, rocksdb::GetMergeOperandsOptions*, int*) db/db_impl/db_impl.h:203 freed by thread T0 here: https://github.com/facebook/rocksdb/issues/3 0x1191399 in rocksdb::cache_entry_roles_detail::RegisteredDeleter::Delete(rocksdb::Slice const&, void*) cache/cache_entry_roles.h:99 https://github.com/facebook/rocksdb/issues/4 0x719348 in rocksdb::LRUHandle::Free() cache/lru_cache.h:205 https://github.com/facebook/rocksdb/issues/5 0x71047f in rocksdb::LRUCacheShard::Release(rocksdb::Cache::Handle*, bool) cache/lru_cache.cc:547 https://github.com/facebook/rocksdb/issues/6 0xa78f0a in rocksdb::Cleanable::DoCleanup() include/rocksdb/cleanable.h:60 https://github.com/facebook/rocksdb/issues/7 0xa78f0a in rocksdb::Cleanable::Reset() include/rocksdb/cleanable.h:38 https://github.com/facebook/rocksdb/issues/8 0xa78f0a in rocksdb::PinnedIteratorsManager::ReleasePinnedData() db/pinned_iterators_manager.h:71 https://github.com/facebook/rocksdb/issues/9 0xd0c21b in rocksdb::PinnedIteratorsManager::~PinnedIteratorsManager() db/pinned_iterators_manager.h:24 https://github.com/facebook/rocksdb/issues/10 0xd0c21b in rocksdb::Version::Get(rocksdb::ReadOptions const&, rocksdb::LookupKey const&, rocksdb::PinnableSlice*, std::__cxx11::basic_string, std::allocator >*, rocksdb::Status*, rocksdb::MergeContext*, unsigned long*, bool*, bool*, unsigned long*, rocksdb::ReadCallback*, bool*, bool) db/pinned_iterators_manager.h:22 https://github.com/facebook/rocksdb/issues/11 0x7f0fdf in rocksdb::DBImpl::GetImpl(rocksdb::ReadOptions const&, rocksdb::Slice const&, rocksdb::DBImpl::GetImplOptions&) db/db_impl/db_impl.cc:1886 https://github.com/facebook/rocksdb/issues/12 0x540d63 in rocksdb::DBImpl::GetMergeOperands(rocksdb::ReadOptions const&, rocksdb::ColumnFamilyHandle*, rocksdb::Slice const&, rocksdb::PinnableSlice*, rocksdb::GetMergeOperandsOptions*, int*) db/db_impl/db_impl.h:203 previously allocated by thread T0 here: https://github.com/facebook/rocksdb/issues/1 0x1239896 in rocksdb::AllocateBlock(unsigned long, **rocksdb::MemoryAllocator*)** memory/memory_allocator.h:35 https://github.com/facebook/rocksdb/issues/2 0x1239896 in rocksdb::BlockFetcher::CopyBufferToHeapBuf() table/block_fetcher.cc:171 https://github.com/facebook/rocksdb/issues/3 0x1239896 in rocksdb::BlockFetcher::GetBlockContents() table/block_fetcher.cc:206 https://github.com/facebook/rocksdb/issues/4 0x122eae5 in rocksdb::BlockFetcher::ReadBlockContents() table/block_fetcher.cc:325 https://github.com/facebook/rocksdb/issues/5 0x11b1f45 in rocksdb::Status rocksdb::BlockBasedTable::MaybeReadBlockAndLoadToCache(rocksdb::FilePrefetchBuffer*, rocksdb::ReadOptions const&, rocksdb::BlockHandle const&, rocksdb::UncompressionDict const&, bool, rocksdb::CachableEntry*, rocksdb::BlockType, rocksdb::GetContext*, rocksdb::BlockCacheLookupContext*, rocksdb::BlockContents*) const table/block_based/block_based_table_reader.cc:1503 ``` Here is the analysis: - We have [PinnedIteratorsManager](https://github.com/facebook/rocksdb/blob/6.28.fb/db/version_set.cc#L1980) with `Cleanable` capability in our `Version::Get()` path. It's responsible for managing the life-time of pinned iterator and invoking registered cleanup functions during its own destruction. - For example in case above, the merge operands's clean-up gets associated with this manger in [GetContext::push_operand](https://github.com/facebook/rocksdb/blob/6.28.fb/table/get_context.cc#L405). During PinnedIteratorsManager's [destruction](https://github.com/facebook/rocksdb/blob/6.28.fb/db/pinned_iterators_manager.h#L67), the release function associated with those merge operand data is invoked. **And that's what we see in "freed by thread T955 here" in ASAN.** - Bug 🐛: `PinnedIteratorsManager` is local to `Version::Get()` while the data of merge operands need to outlive `Version::Get` and stay till they get [PinSelf()](https://github.com/facebook/rocksdb/blob/6.28.fb/db/db_impl/db_impl.cc#L1905), **which is the read-after-free in ASAN.** - This bug is likely to be an overlook of `PinnedIteratorsManager` when developing the API `DB::GetMergeOperands` cuz the current logic works fine with the existing case of getting the *merged value* where the operands do not need to live that long. - This bug was not surfaced much (even in its unit test) due to the release function associated with the merge operands (which are actually blocks put in cache as you can see in `BlockBasedTable::MaybeReadBlockAndLoadToCache` **in "previously allocated by" in ASAN report**) is a cache entry deleter. The deleter will call `Cache::Release()` which, for LRU cache, won't immediately deallocate the block based on LRU policy [unless the cache is full or being instructed to force erase](https://github.com/facebook/rocksdb/blob/6.28.fb/cache/lru_cache.cc#L521-L531) - `DBMergeOperandTest.MergeOperandReadAfterFreeBug` makes the cache extremely small to force cache full. **Summary:** - Fix the bug by align `PinnedIteratorsManager`'s lifetime with the merge operands Pull Request resolved: https://github.com/facebook/rocksdb/pull/9507 Test Plan: - New test `DBMergeOperandTest.MergeOperandReadAfterFreeBug` - db bench on read path - Setup (LSM tree with several levels, cache the whole db to avoid read IO, warm cache with readseq to avoid read IO): `TEST_TMPDIR=/dev/shm/rocksdb ./db_bench -benchmarks="fillrandom,readseq -num=1000000 -cache_size=100000000 -write_buffer_size=10000 -statistics=1 -max_bytes_for_level_base=10000 -level0_file_num_compaction_trigger=1``TEST_TMPDIR=/dev/shm/rocksdb ./db_bench -benchmarks="readrandom" -num=1000000 -cache_size=100000000 ` - Actual command run (run 20-run for 20 times and then average the 20-run's average micros/op) - `for j in {1..20}; do (for i in {1..20}; do rm -rf /dev/shm/rocksdb/ && TEST_TMPDIR=/dev/shm/rocksdb ./db_bench -benchmarks="fillrandom,readseq,readrandom" -num=1000000 -cache_size=100000000 -write_buffer_size=10000 -statistics=1 -max_bytes_for_level_base=10000 -level0_file_num_compaction_trigger=1 | egrep 'readrandom'; done > rr_output_pre.txt && (awk '{sum+=$3; sum_sqrt+=$3^2}END{print sum/20, sqrt(sum_sqrt/20-(sum/20)^2)}' rr_output_pre.txt) >> rr_output_pre_2.txt); done` - **Result: Pre-change: 3.79193 micros/op; Post-change: 3.79528 micros/op (+0.09%)** (pre-change)sorted avg micros/op of each 20-run | std of micros/op of each 20-run | (post-change) sorted avg micros/op of each 20-run | std of micros/op of each 20-run -- | -- | -- | -- 3.58355 | 0.265209 | 3.48715 | 0.382076 3.58845 | 0.519927 | 3.5832 | 0.382726 3.66415 | 0.452097 | 3.677 | 0.563831 3.68495 | 0.430897 | 3.68405 | 0.495355 3.70295 | 0.482893 | 3.68465 | 0.431438 3.719 | 0.463806 | 3.71945 | 0.457157 3.7393 | 0.453423 | 3.72795 | 0.538604 3.7806 | 0.527613 | 3.75075 | 0.444509 3.7817 | 0.426704 | 3.7683 | 0.468065 3.809 | 0.381033 | 3.8086 | 0.557378 3.80985 | 0.466011 | 3.81805 | 0.524833 3.8165 | 0.500351 | 3.83405 | 0.529339 3.8479 | 0.430326 | 3.86285 | 0.44831 3.85125 | 0.434108 | 3.8717 | 0.544098 3.8556 | 0.524602 | 3.895 | 0.411679 3.8656 | 0.476383 | 3.90965 | 0.566636 3.8911 | 0.488477 | 3.92735 | 0.608038 3.898 | 0.493978 | 3.9439 | 0.524511 3.97235 | 0.515008 | 3.9623 | 0.477416 3.9768 | 0.519993 | 3.98965 | 0.521481 - CI Reviewed By: ajkr Differential Revision: D34030519 Pulled By: hx235 fbshipit-source-id: a99ac585c11704c5ed93af033cb29ba0a7b16ae8 --- HISTORY.md | 1 + db/db_impl/db_impl.cc | 21 +++++++++++------- db/db_impl/db_impl_readonly.cc | 3 ++- db/db_impl/db_impl_secondary.cc | 3 ++- db/db_merge_operand_test.cc | 39 +++++++++++++++++++++++++++++++++ db/version_set.cc | 9 ++++---- db/version_set.h | 2 ++ 7 files changed, 64 insertions(+), 14 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 337981cfc..8ae52b56e 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -3,6 +3,7 @@ ### Bug Fixes * Fixed a major bug in which batched MultiGet could return old values for keys deleted by DeleteRange when memtable Bloom filter is enabled (memtable_prefix_bloom_size_ratio > 0). (The fix includes a substantial MultiGet performance improvement in the unusual case of both memtable_whole_key_filtering and prefix_extractor.) * Fixed more cases of EventListener::OnTableFileCreated called with OK status, file_size==0, and no SST file kept. Now the status is Aborted. +* Fixed a read-after-free bug in `DB::GetMergeOperands()`. ### Performance Improvements * Mitigated the overhead of building the file location hash table used by the online LSM tree consistency checks, which can improve performance for certain workloads (see #9351). diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 03fa01da5..2a822880d 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -1880,11 +1880,12 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key, return s; } } + PinnedIteratorsManager pinned_iters_mgr; if (!done) { PERF_TIMER_GUARD(get_from_output_files_time); sv->current->Get( read_options, lkey, get_impl_options.value, timestamp, &s, - &merge_context, &max_covering_tombstone_seq, + &merge_context, &max_covering_tombstone_seq, &pinned_iters_mgr, get_impl_options.get_value ? get_impl_options.value_found : nullptr, nullptr, nullptr, get_impl_options.get_value ? get_impl_options.callback : nullptr, @@ -2076,11 +2077,13 @@ std::vector DBImpl::MultiGet( if (!done) { PinnableSlice pinnable_val; PERF_TIMER_GUARD(get_from_output_files_time); - super_version->current->Get( - read_options, lkey, &pinnable_val, timestamp, &s, &merge_context, - &max_covering_tombstone_seq, /*value_found=*/nullptr, - /*key_exists=*/nullptr, - /*seq=*/nullptr, read_callback); + PinnedIteratorsManager pinned_iters_mgr; + super_version->current->Get(read_options, lkey, &pinnable_val, timestamp, + &s, &merge_context, + &max_covering_tombstone_seq, + &pinned_iters_mgr, /*value_found=*/nullptr, + /*key_exists=*/nullptr, + /*seq=*/nullptr, read_callback); value->assign(pinnable_val.data(), pinnable_val.size()); RecordTick(stats_, MEMTABLE_MISS); } @@ -4573,10 +4576,12 @@ Status DBImpl::GetLatestSequenceForKey( // SST files if cache_only=true? if (!cache_only) { // Check tables + PinnedIteratorsManager pinned_iters_mgr; sv->current->Get(read_options, lkey, /*value=*/nullptr, timestamp, &s, &merge_context, &max_covering_tombstone_seq, - nullptr /* value_found */, found_record_for_key, seq, - nullptr /*read_callback*/, is_blob_index); + &pinned_iters_mgr, nullptr /* value_found */, + found_record_for_key, seq, nullptr /*read_callback*/, + is_blob_index); if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) { // unexpected error reading SST files diff --git a/db/db_impl/db_impl_readonly.cc b/db/db_impl/db_impl_readonly.cc index d4a92db08..96d409e8a 100644 --- a/db/db_impl/db_impl_readonly.cc +++ b/db/db_impl/db_impl_readonly.cc @@ -67,9 +67,10 @@ Status DBImplReadOnly::Get(const ReadOptions& read_options, RecordTick(stats_, MEMTABLE_HIT); } else { PERF_TIMER_GUARD(get_from_output_files_time); + PinnedIteratorsManager pinned_iters_mgr; super_version->current->Get(read_options, lkey, pinnable_val, /*timestamp=*/nullptr, &s, &merge_context, - &max_covering_tombstone_seq); + &max_covering_tombstone_seq, &pinned_iters_mgr); RecordTick(stats_, MEMTABLE_MISS); } RecordTick(stats_, NUMBER_KEYS_READ); diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc index 39250ecfa..35d51721f 100644 --- a/db/db_impl/db_impl_secondary.cc +++ b/db/db_impl/db_impl_secondary.cc @@ -385,9 +385,10 @@ Status DBImplSecondary::GetImpl(const ReadOptions& read_options, } if (!done) { PERF_TIMER_GUARD(get_from_output_files_time); + PinnedIteratorsManager pinned_iters_mgr; super_version->current->Get(read_options, lkey, pinnable_val, /*timestamp=*/nullptr, &s, &merge_context, - &max_covering_tombstone_seq); + &max_covering_tombstone_seq, &pinned_iters_mgr); RecordTick(stats_, MEMTABLE_MISS); } { diff --git a/db/db_merge_operand_test.cc b/db/db_merge_operand_test.cc index ecbbae780..f00c6872f 100644 --- a/db/db_merge_operand_test.cc +++ b/db/db_merge_operand_test.cc @@ -47,6 +47,45 @@ class DBMergeOperandTest : public DBTestBase { : DBTestBase("db_merge_operand_test", /*env_do_fsync=*/true) {} }; +TEST_F(DBMergeOperandTest, MergeOperandReadAfterFreeBug) { + // There was a bug of reading merge operands after they are mistakely freed + // in DB::GetMergeOperands, which is surfaced by cache full. + // See PR#9507 for more. + Options options; + options.create_if_missing = true; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + options.env = env_; + BlockBasedTableOptions table_options; + + // Small cache to simulate cache full + table_options.block_cache = NewLRUCache(1); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + Reopen(options); + int num_records = 4; + int number_of_operands = 0; + std::vector values(num_records); + GetMergeOperandsOptions merge_operands_info; + merge_operands_info.expected_max_number_of_operands = num_records; + + ASSERT_OK(Merge("k1", "v1")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k1", "v2")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k1", "v3")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k1", "v4")); + + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k1", values.data(), &merge_operands_info, + &number_of_operands)); + ASSERT_EQ(number_of_operands, 4); + ASSERT_EQ(values[0].ToString(), "v1"); + ASSERT_EQ(values[1].ToString(), "v2"); + ASSERT_EQ(values[2].ToString(), "v3"); + ASSERT_EQ(values[3].ToString(), "v4"); +} + TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) { Options options; options.create_if_missing = true; diff --git a/db/version_set.cc b/db/version_set.cc index 09429ae0b..1bef2a46c 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1965,7 +1965,8 @@ void Version::MultiGetBlob( void Version::Get(const ReadOptions& read_options, const LookupKey& k, PinnableSlice* value, std::string* timestamp, Status* status, MergeContext* merge_context, - SequenceNumber* max_covering_tombstone_seq, bool* value_found, + SequenceNumber* max_covering_tombstone_seq, + PinnedIteratorsManager* pinned_iters_mgr, bool* value_found, bool* key_exists, SequenceNumber* seq, ReadCallback* callback, bool* is_blob, bool do_merge) { Slice ikey = k.internal_key(); @@ -1978,7 +1979,6 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, *key_exists = true; } - PinnedIteratorsManager pinned_iters_mgr; uint64_t tracing_get_id = BlockCacheTraceHelper::kReservedGetId; if (vset_ && vset_->block_cache_tracer_ && vset_->block_cache_tracer_->is_tracing_enabled()) { @@ -1992,17 +1992,18 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, bool* const is_blob_to_use = is_blob ? is_blob : &is_blob_index; BlobFetcher blob_fetcher(this, read_options); + assert(pinned_iters_mgr); GetContext get_context( user_comparator(), merge_operator_, info_log_, db_statistics_, status->ok() ? GetContext::kNotFound : GetContext::kMerge, user_key, do_merge ? value : nullptr, do_merge ? timestamp : nullptr, value_found, merge_context, do_merge, max_covering_tombstone_seq, clock_, seq, - merge_operator_ ? &pinned_iters_mgr : nullptr, callback, is_blob_to_use, + merge_operator_ ? pinned_iters_mgr : nullptr, callback, is_blob_to_use, tracing_get_id, &blob_fetcher); // Pin blocks that we read to hold merge operands if (merge_operator_) { - pinned_iters_mgr.StartPinning(); + pinned_iters_mgr->StartPinning(); } FilePicker fp(user_key, ikey, &storage_info_.level_files_brief_, diff --git a/db/version_set.h b/db/version_set.h index e9d789954..38e6616cf 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -734,9 +734,11 @@ class Version { // If the key has any merge operands then store them in // merge_context.operands_list and don't merge the operands // REQUIRES: lock is not held + // REQUIRES: pinned_iters_mgr != nullptr void Get(const ReadOptions&, const LookupKey& key, PinnableSlice* value, std::string* timestamp, Status* status, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, + PinnedIteratorsManager* pinned_iters_mgr, bool* value_found = nullptr, bool* key_exists = nullptr, SequenceNumber* seq = nullptr, ReadCallback* callback = nullptr, bool* is_blob = nullptr, bool do_merge = true);