diff --git a/HISTORY.md b/HISTORY.md
index 50916c39a..6ef09b2c3 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -18,6 +18,7 @@
 * Fixed an issue where the thread pools were not resized upon setting `max_background_jobs` dynamically through the `SetDBOptions` interface.
 * Fix a bug that can cause write threads to hang when a slowdown/stall happens and there is a mix of writers with WriteOptions::no_slowdown set/unset.
 * Fixed an issue where an incorrect "number of input records" value was used to compute the "records dropped" statistics for compactions.
+* Fix a regression bug that causes a segfault when the hash index is used, max_open_files != -1, and total order seek is used and then switched back.
 
 ### New Features
 * It is now possible to enable periodic compactions for the base DB when using BlobDB.
diff --git a/db/db_test.cc b/db/db_test.cc
index 28d886208..017df2ef5 100644
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -3174,6 +3174,57 @@ TEST_F(DBTest, BlockBasedTablePrefixIndexTest) {
   ASSERT_EQ("v2", Get("k2"));
 }
 
+TEST_F(DBTest, BlockBasedTablePrefixIndexTotalOrderSeek) {
+  // Create a DB with a block prefix index.
+  BlockBasedTableOptions table_options;
+  Options options = CurrentOptions();
+  options.max_open_files = 10;
+  table_options.index_type = BlockBasedTableOptions::kHashSearch;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+
+  // RocksDB sanitizes max open files to at least 20. Modify it back.
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+        int* max_open_files = static_cast<int*>(arg);
+        *max_open_files = 11;
+      });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  Reopen(options);
+  ASSERT_OK(Put("k1", "v1"));
+  Flush();
+
+  CompactRangeOptions cro;
+  cro.change_level = true;
+  cro.target_level = 1;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  // Force all tables to be evicted from the table cache.
+  dbfull()->TEST_table_cache()->SetCapacity(0);
+  // Then make the table cache keep one entry.
+  dbfull()->TEST_table_cache()->SetCapacity(1);
+
+  ReadOptions read_options;
+  read_options.total_order_seek = true;
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+    iter->Seek("k1");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("k1", iter->key().ToString());
+  }
+
+  // After a total order seek, the prefix index should still be used.
+  read_options.total_order_seek = false;
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+    iter->Seek("k1");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("k1", iter->key().ToString());
+  }
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
 TEST_F(DBTest, ChecksumTest) {
   BlockBasedTableOptions table_options;
   Options options = CurrentOptions();
diff --git a/db/version_set.cc b/db/version_set.cc
index 9972dc8a2..217b7bad3 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -867,8 +867,7 @@ class LevelIterator final : public InternalIterator {
         icomparator_(icomparator),
         user_comparator_(icomparator.user_comparator()),
         flevel_(flevel),
-        prefix_extractor_(read_options.total_order_seek ? nullptr
-                                                        : prefix_extractor),
+        prefix_extractor_(prefix_extractor),
         file_read_hist_(file_read_hist),
         should_sample_(should_sample),
         caller_(caller),
@@ -1003,6 +1002,9 @@ class LevelIterator final : public InternalIterator {
   const UserComparatorWrapper user_comparator_;
   const LevelFilesBrief* flevel_;
   mutable FileDescriptor current_value_;
+  // `prefix_extractor_` may be non-null even for total order seek. Checking
+  // this variable is not the right way to identify whether the prefix
+  // iterator is used.
   const SliceTransform* prefix_extractor_;
 
   HistogramImpl* file_read_hist_;
@@ -1045,7 +1047,8 @@ void LevelIterator::Seek(const Slice& target) {
     file_iter_.Seek(target);
   }
   if (SkipEmptyFileForward() && prefix_extractor_ != nullptr &&
-      file_iter_.iter() != nullptr && file_iter_.Valid()) {
+      !read_options_.total_order_seek && file_iter_.iter() != nullptr &&
+      file_iter_.Valid()) {
     // We've skipped the file we initially positioned to. In the prefix
     // seek case, it is likely that the file is skipped because of
     // prefix bloom or hash, where more keys are skipped. We then check
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index 7adc2277c..772a88a36 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -718,6 +718,7 @@ class HashIndexReader : public BlockBasedTable::IndexReaderCommon {
     }
 
     BlockPrefixIndex* prefix_index = nullptr;
+    assert(rep->internal_prefix_transform.get() != nullptr);
     s = BlockPrefixIndex::Create(rep->internal_prefix_transform.get(),
                                  prefixes_contents.data,
                                  prefixes_meta_contents.data, &prefix_index);
@@ -1187,8 +1188,10 @@ Status BlockBasedTable::Open(
   rep->hash_index_allow_collision = table_options.hash_index_allow_collision;
   // We need to wrap data with internal_prefix_transform to make sure it can
   // handle prefix correctly.
-  rep->internal_prefix_transform.reset(
-      new InternalKeySliceTransform(prefix_extractor));
+  if (prefix_extractor != nullptr) {
+    rep->internal_prefix_transform.reset(
+        new InternalKeySliceTransform(prefix_extractor));
+  }
   SetupCacheKeyPrefix(rep);
   std::unique_ptr<BlockBasedTable> new_table(
       new BlockBasedTable(rep, block_cache_tracer));
@@ -4049,7 +4052,13 @@ Status BlockBasedTable::CreateIndexReader(
       std::unique_ptr<Block> metaindex_guard;
      std::unique_ptr<InternalIterator> metaindex_iter_guard;
       auto meta_index_iter = preloaded_meta_index_iter;
-      if (meta_index_iter == nullptr) {
+      bool should_fallback = false;
+      if (rep_->internal_prefix_transform.get() == nullptr) {
+        ROCKS_LOG_WARN(rep_->ioptions.info_log,
+                       "No prefix extractor passed in. Fall back to binary"
+                       " search index.");
+        should_fallback = true;
+      } else if (meta_index_iter == nullptr) {
         auto s = ReadMetaIndexBlock(prefetch_buffer, &metaindex_guard,
                                     &metaindex_iter_guard);
         if (!s.ok()) {
@@ -4058,16 +4067,20 @@ Status BlockBasedTable::CreateIndexReader(
           ROCKS_LOG_WARN(rep_->ioptions.info_log,
                          "Unable to read the metaindex block."
                          " Fall back to binary search index.");
-          return BinarySearchIndexReader::Create(this, prefetch_buffer,
-                                                 use_cache, prefetch, pin,
-                                                 lookup_context, index_reader);
+          should_fallback = true;
         }
         meta_index_iter = metaindex_iter_guard.get();
       }
 
-      return HashIndexReader::Create(this, prefetch_buffer, meta_index_iter,
-                                     use_cache, prefetch, pin, lookup_context,
-                                     index_reader);
+      if (should_fallback) {
+        return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache,
+                                               prefetch, pin, lookup_context,
+                                               index_reader);
+      } else {
+        return HashIndexReader::Create(this, prefetch_buffer, meta_index_iter,
+                                       use_cache, prefetch, pin, lookup_context,
+                                       index_reader);
+      }
     }
     default: {
       std::string error_message =
diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h
index f54c45fff..a77a63df5 100644
--- a/table/block_based/block_based_table_reader.h
+++ b/table/block_based/block_based_table_reader.h
@@ -548,6 +548,7 @@ struct BlockBasedTable::Rep {
   // module should not be relying on db module. However to make things easier
   // and compatible with existing code, we introduce a wrapper that allows
   // block to extract prefix without knowing if a key is internal or not.
+  // Null if no prefix_extractor is passed in when opening the table reader.
   std::unique_ptr<InternalKeySliceTransform> internal_prefix_transform;
   std::shared_ptr<const SliceTransform> table_prefix_extractor;