From f10f1359381d932a21fd3d9b6c755329138bb73e Mon Sep 17 00:00:00 2001
From: sdong
Date: Mon, 27 Jan 2020 15:41:57 -0800
Subject: [PATCH] Fix regression bug of hash index with iterator total order
 seek (#6328)

Summary:
https://github.com/facebook/rocksdb/pull/6028 introduced a bug for the hash index in SST files: if a table reader is created while total order seek is in use, prefix_extractor may be passed into the table reader as null. When a later prefix seek reuses the same table reader, the hash index is consulted, the prefix extractor is null, and the program crashes.

Fix the issue by amending http://github.com/facebook/rocksdb/pull/6028 so that prefix_extractor is preserved and ReadOptions.total_order_seek is checked instead.

Also, a null pointer check is added so that a bug like this won't cause a segfault in the future.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/6328

Test Plan: Add a unit test that would fail without the fix. The stress test that reproduced the crash passes with it.

Differential Revision: D19586751

fbshipit-source-id: 8de77690167ddf5a77a01e167cf89430b1bfba42
---
 HISTORY.md                                    |  1 +
 db/db_test.cc                                 | 51 +++++++++++++++++++
 db/version_set.cc                             |  9 ++--
 table/block_based/block_based_table_reader.cc | 31 +++++++----
 table/block_based/block_based_table_reader.h  |  1 +
 5 files changed, 81 insertions(+), 12 deletions(-)
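For context, a minimal sketch of the failing configuration against the public API (not part of the patch; the DB path and key/value literals are illustrative). It mirrors the bug's preconditions: a kHashSearch index, a prefix extractor, and max_open_files != -1. Without the table-cache eviction the unit test below forces via TEST_table_cache(), the crash may not reproduce deterministically, but the read sequence that triggered it is the same: a total order seek first, then a prefix seek against the same table reader.

#include <cassert>
#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/table.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.max_open_files = 100;  // != -1: table readers can be evicted
  options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(1));

  rocksdb::BlockBasedTableOptions table_options;
  table_options.index_type = rocksdb::BlockBasedTableOptions::kHashSearch;
  options.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(table_options));

  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/hash_index_repro", &db);
  assert(s.ok());
  assert(db->Put(rocksdb::WriteOptions(), "k1", "v1").ok());
  assert(db->Flush(rocksdb::FlushOptions()).ok());

  rocksdb::ReadOptions read_options;
  // Step 1: total order seek. Before this fix, a table reader opened here
  // could be created with a null prefix extractor.
  read_options.total_order_seek = true;
  {
    std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(read_options));
    it->Seek("k1");
  }
  // Step 2: switch back to prefix seek on the same cached table reader.
  // Before this fix, the hash index dereferenced the null extractor here.
  read_options.total_order_seek = false;
  {
    std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(read_options));
    it->Seek("k1");
  }
  delete db;
  return 0;
}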
diff --git a/HISTORY.md b/HISTORY.md
index 50916c39a..6ef09b2c3 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -18,6 +18,7 @@
 * Fixed an issue where the thread pools were not resized upon setting `max_background_jobs` dynamically through the `SetDBOptions` interface.
 * Fix a bug that can cause write threads to hang when a slowdown/stall happens and there is a mix of writers with WriteOptions::no_slowdown set/unset.
 * Fixed an issue where an incorrect "number of input records" value was used to compute the "records dropped" statistics for compactions.
+* Fix a regression bug that causes a segfault when the hash index is used, max_open_files != -1, and total order seek is used and then switched back.
 
 ### New Features
 * It is now possible to enable periodic compactions for the base DB when using BlobDB.
diff --git a/db/db_test.cc b/db/db_test.cc
index 28d886208..017df2ef5 100644
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -3174,6 +3174,57 @@ TEST_F(DBTest, BlockBasedTablePrefixIndexTest) {
   ASSERT_EQ("v2", Get("k2"));
 }
 
+TEST_F(DBTest, BlockBasedTablePrefixIndexTotalOrderSeek) {
+  // create a DB with block prefix index
+  BlockBasedTableOptions table_options;
+  Options options = CurrentOptions();
+  options.max_open_files = 10;
+  table_options.index_type = BlockBasedTableOptions::kHashSearch;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+
+  // RocksDB sanitizes max_open_files to at least 20. Modify it back.
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+        int* max_open_files = static_cast<int*>(arg);
+        *max_open_files = 11;
+      });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  Reopen(options);
+  ASSERT_OK(Put("k1", "v1"));
+  Flush();
+
+  CompactRangeOptions cro;
+  cro.change_level = true;
+  cro.target_level = 1;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  // Force all table readers to be evicted.
+  dbfull()->TEST_table_cache()->SetCapacity(0);
+  // Make the table cache keep one entry.
+  dbfull()->TEST_table_cache()->SetCapacity(1);
+
+  ReadOptions read_options;
+  read_options.total_order_seek = true;
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+    iter->Seek("k1");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("k1", iter->key().ToString());
+  }
+
+  // After total order seek, the prefix index should still be used.
+  read_options.total_order_seek = false;
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+    iter->Seek("k1");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("k1", iter->key().ToString());
+  }
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
 TEST_F(DBTest, ChecksumTest) {
   BlockBasedTableOptions table_options;
   Options options = CurrentOptions();
diff --git a/db/version_set.cc b/db/version_set.cc
index 9972dc8a2..217b7bad3 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -867,8 +867,7 @@ class LevelIterator final : public InternalIterator {
         icomparator_(icomparator),
         user_comparator_(icomparator.user_comparator()),
         flevel_(flevel),
-        prefix_extractor_(read_options.total_order_seek ? nullptr
-                                                        : prefix_extractor),
+        prefix_extractor_(prefix_extractor),
         file_read_hist_(file_read_hist),
         should_sample_(should_sample),
         caller_(caller),
@@ -1003,6 +1002,9 @@ class LevelIterator final : public InternalIterator {
   const UserComparatorWrapper user_comparator_;
   const LevelFilesBrief* flevel_;
   mutable FileDescriptor current_value_;
+  // `prefix_extractor_` may be non-null even for total order seek. Checking
+  // this variable is not the right way to identify whether the prefix
+  // iterator is used.
   const SliceTransform* prefix_extractor_;
 
   HistogramImpl* file_read_hist_;
@@ -1045,7 +1047,8 @@ void LevelIterator::Seek(const Slice& target) {
     file_iter_.Seek(target);
   }
   if (SkipEmptyFileForward() && prefix_extractor_ != nullptr &&
-      file_iter_.iter() != nullptr && file_iter_.Valid()) {
+      !read_options_.total_order_seek && file_iter_.iter() != nullptr &&
+      file_iter_.Valid()) {
     // We've skipped the file we initially positioned to. In the prefix
     // seek case, it is likely that the file is skipped because of
     // prefix bloom or hash, where more keys are skipped. We then check
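The invariant behind the version_set.cc change, stated on its own: prefix_extractor_ now outlives any single read, so whether to prefix-seek must be derived from the per-read options, not from the cached pointer. A minimal sketch of that predicate in isolation (the helper name UsePrefixSeek is illustrative, not RocksDB code):

#include "rocksdb/options.h"
#include "rocksdb/slice_transform.h"

// Illustrative helper, not part of the patch: prefix seek applies only when
// the table has an extractor AND this read did not request total order seek.
// After this patch, either condition alone is insufficient.
bool UsePrefixSeek(const rocksdb::ReadOptions& read_options,
                   const rocksdb::SliceTransform* prefix_extractor) {
  return prefix_extractor != nullptr && !read_options.total_order_seek;
}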
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index 7adc2277c..772a88a36 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -718,6 +718,7 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon {
     }
 
     BlockPrefixIndex* prefix_index = nullptr;
+    assert(rep->internal_prefix_transform.get() != nullptr);
     s = BlockPrefixIndex::Create(rep->internal_prefix_transform.get(),
                                  prefixes_contents.data,
                                  prefixes_meta_contents.data, &prefix_index);
@@ -1187,8 +1188,10 @@ Status BlockBasedTable::Open(
   rep->hash_index_allow_collision = table_options.hash_index_allow_collision;
   // We need to wrap data with internal_prefix_transform to make sure it can
   // handle prefix correctly.
-  rep->internal_prefix_transform.reset(
-      new InternalKeySliceTransform(prefix_extractor));
+  if (prefix_extractor != nullptr) {
+    rep->internal_prefix_transform.reset(
+        new InternalKeySliceTransform(prefix_extractor));
+  }
   SetupCacheKeyPrefix(rep);
   std::unique_ptr<BlockBasedTable> new_table(
       new BlockBasedTable(rep, block_cache_tracer));
@@ -4049,7 +4052,13 @@ Status BlockBasedTable::CreateIndexReader(
       std::unique_ptr<Block> metaindex_guard;
       std::unique_ptr<InternalIterator> metaindex_iter_guard;
       auto meta_index_iter = preloaded_meta_index_iter;
-      if (meta_index_iter == nullptr) {
+      bool should_fallback = false;
+      if (rep_->internal_prefix_transform.get() == nullptr) {
+        ROCKS_LOG_WARN(rep_->ioptions.info_log,
+                       "No prefix extractor passed in. Fall back to binary"
+                       " search index.");
+        should_fallback = true;
+      } else if (meta_index_iter == nullptr) {
         auto s = ReadMetaIndexBlock(prefetch_buffer, &metaindex_guard,
                                     &metaindex_iter_guard);
         if (!s.ok()) {
@@ -4058,16 +4067,20 @@ Status BlockBasedTable::CreateIndexReader(
           ROCKS_LOG_WARN(rep_->ioptions.info_log,
                          "Unable to read the metaindex block."
                          " Fall back to binary search index.");
-          return BinarySearchIndexReader::Create(this, prefetch_buffer,
-                                                 use_cache, prefetch, pin,
-                                                 lookup_context, index_reader);
+          should_fallback = true;
         }
         meta_index_iter = metaindex_iter_guard.get();
       }
 
-      return HashIndexReader::Create(this, prefetch_buffer, meta_index_iter,
-                                     use_cache, prefetch, pin, lookup_context,
-                                     index_reader);
+      if (should_fallback) {
+        return BinarySearchIndexReader::Create(this, prefetch_buffer,
+                                               use_cache, prefetch, pin,
+                                               lookup_context, index_reader);
+      } else {
+        return HashIndexReader::Create(this, prefetch_buffer, meta_index_iter,
+                                       use_cache, prefetch, pin,
+                                       lookup_context, index_reader);
+      }
     }
     default: {
       std::string error_message =
diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h
index f54c45fff..a77a63df5 100644
--- a/table/block_based/block_based_table_reader.h
+++ b/table/block_based/block_based_table_reader.h
@@ -548,6 +548,7 @@ struct BlockBasedTable::Rep {
   // module should not be relying on db module. However to make things easier
   // and compatible with existing code, we introduce a wrapper that allows
   // block to extract prefix without knowing if a key is internal or not.
+  // null if no prefix_extractor is passed in when opening the table reader.
   std::unique_ptr<InternalKeySliceTransform> internal_prefix_transform;
   std::shared_ptr<const SliceTransform> table_prefix_extractor;
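Condensed, the CreateIndexReader change reduces to a two-input decision that prefers degrading over crashing. A sketch under simplified names (IndexChoice and ChooseIndex are illustrative, not the actual RocksDB signatures):

// Illustrative only: the fallback decision added to CreateIndexReader,
// stripped of I/O. A hash index needs both a prefix transform captured at
// table-open time and readable prefix metadata; if either is missing, the
// reader now selects binary search instead of dereferencing null.
enum class IndexChoice { kHash, kBinarySearch };

IndexChoice ChooseIndex(bool has_prefix_transform, bool metaindex_readable) {
  if (!has_prefix_transform || !metaindex_readable) {
    return IndexChoice::kBinarySearch;
  }
  return IndexChoice::kHash;
}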