From 8f2bee67478d21bb468577b14d6075fb8b6e2380 Mon Sep 17 00:00:00 2001
From: sdong
Date: Tue, 28 Jan 2020 14:42:21 -0800
Subject: [PATCH] Add ReadOptions.auto_prefix_mode (#6314)

Summary:
Add a new option, ReadOptions.auto_prefix_mode. When it is set to true, the
iterator returns the same result as a total order seek, but may choose to do
prefix seek internally, based on the iterator upper bound.

Also fix two pre-existing bugs in the handling of prefix extractor changes:
(1) a reverse iterator should not rely on the upper bound to determine the
prefix, so the prefix check is now skipped in that case; (2) the block-based
filter was not handled properly.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/6314

Test Plan: (1) add a unit test; (2) add the check to the stress test and run
it to see whether it passes at least one run.

Differential Revision: D19458717

fbshipit-source-id: 51c1bcc5cdd826c2469af201979a39600e779bce
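For illustration, a minimal sketch of how a caller might use the new option
(standard RocksDB iterator API; the key layout, bound value, and function
name are illustrative, not part of this patch):

    #include <memory>
    #include <string>

    #include "rocksdb/db.h"
    #include "rocksdb/options.h"
    #include "rocksdb/slice.h"

    // Sketch: iterate with auto_prefix_mode over a DB configured with a
    // prefix extractor (e.g. NewFixedPrefixTransform(1)). Results are
    // identical to total_order_seek = true, but RocksDB may use prefix seek
    // internally when the seek key and upper bound allow it.
    void AutoPrefixModeExample(rocksdb::DB* db) {
      rocksdb::ReadOptions ro;
      ro.total_order_seek = false;
      ro.auto_prefix_mode = true;  // the option added by this patch
      std::string ub_str = "b9";   // illustrative bound within prefix "b"
      rocksdb::Slice ub(ub_str);
      ro.iterate_upper_bound = &ub;
      std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ro));
      for (it->Seek("b1"); it->Valid(); it->Next()) {
        // Sees exactly the keys a total-order iterator would in ["b1", "b9").
      }
    }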
---
 HISTORY.md                                    |   1 +
 db/db_iter.cc                                 |  19 ++--
 db/db_iter.h                                  |   9 +-
 db/db_test2.cc                                | 106 ++++++++++++++++++
 db/memtable.cc                                |   4 +-
 db/version_set.cc                             |   4 +-
 db_stress_tool/db_stress_test_base.cc         |  22 ++--
 include/rocksdb/options.h                     |   5 +
 options/options.cc                            |   2 +
 table/block_based/block_based_table_reader.cc |   7 +-
 table/block_based/block_based_table_reader.h  |  39 ++++---
 table/block_based/filter_block.h              |   6 +-
 table/plain/plain_table_reader.cc             |   4 +-
 13 files changed, 189 insertions(+), 39 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index 384135e62..723c48773 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -25,6 +25,7 @@
 * It is now possible to enable periodic compactions for the base DB when using BlobDB.
 * BlobDB now garbage collects non-TTL blobs when `enable_garbage_collection` is set to `true` in `BlobDBOptions`. Garbage collection is performed during compaction: any valid blobs located in the oldest N files (where N is the number of non-TTL blob files multiplied by the value of `BlobDBOptions::garbage_collection_cutoff`) encountered during compaction get relocated to new blob files, and old blob files are dropped once they are no longer needed. Note: we recommend enabling periodic compactions for the base DB when using this feature to deal with the case when some old blob files are kept alive by SSTs that otherwise do not get picked for compaction.
 * `db_bench` now supports the `garbage_collection_cutoff` option for BlobDB.
+* Introduce ReadOptions.auto_prefix_mode. When set to true, the iterator will return the same result as total order seek, but may choose to use prefix seek internally, based on the seek key and the iterator upper bound.
 
 ## 6.6.2 (01/13/2020)
 ### Bug Fixes
diff --git a/db/db_iter.cc b/db/db_iter.cc
index b2675c520..0e181d9bf 100644
--- a/db/db_iter.cc
+++ b/db/db_iter.cc
@@ -73,7 +73,9 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options,
               ? read_options.prefix_same_as_start
               : false),
       pin_thru_lifetime_(read_options.pin_data),
-      total_order_seek_(read_options.total_order_seek),
+      expect_total_order_inner_iter_(prefix_extractor_ == nullptr ||
+                                     read_options.total_order_seek ||
+                                     read_options.auto_prefix_mode),
       allow_blob_(allow_blob),
       is_blob_(false),
       arena_mode_(arena_mode),
@@ -567,7 +569,7 @@ bool DBIter::ReverseToForward() {
   // When moving backwards, iter_ is positioned on _previous_ key, which may
   // not exist or may have different prefix than the current key().
   // If that's the case, seek iter_ to current key.
-  if ((prefix_extractor_ != nullptr && !total_order_seek_) || !iter_.Valid()) {
+  if (!expect_total_order_inner_iter() || !iter_.Valid()) {
     IterKey last_key;
     last_key.SetInternalKey(ParsedInternalKey(
         saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek));
@@ -603,15 +605,14 @@ bool DBIter::ReverseToBackward() {
   // key, which may not exist or may have prefix different from current.
   // If that's the case, seek to saved_key_.
   if (current_entry_is_merged_ &&
-      ((prefix_extractor_ != nullptr && !total_order_seek_) ||
-       !iter_.Valid())) {
+      (!expect_total_order_inner_iter() || !iter_.Valid())) {
     IterKey last_key;
     // Using kMaxSequenceNumber and kValueTypeForSeek
     // (not kValueTypeForSeekForPrev) to seek to a key strictly smaller
     // than saved_key_.
     last_key.SetInternalKey(ParsedInternalKey(
         saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek));
-    if (prefix_extractor_ != nullptr && !total_order_seek_) {
+    if (!expect_total_order_inner_iter()) {
       iter_.SeekForPrev(last_key.GetInternalKey());
     } else {
       // Some iterators may not support SeekForPrev(), so we avoid using it
@@ -978,8 +979,8 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() {
   // Make sure we leave iter_ in a good state. If it's valid and we don't care
   // about prefixes, that's already good enough. Otherwise it needs to be
   // seeked to the current key.
-  if ((prefix_extractor_ != nullptr && !total_order_seek_) || !iter_.Valid()) {
-    if (prefix_extractor_ != nullptr && !total_order_seek_) {
+  if (!expect_total_order_inner_iter() || !iter_.Valid()) {
+    if (!expect_total_order_inner_iter()) {
       iter_.SeekForPrev(last_key);
     } else {
       iter_.Seek(last_key);
@@ -1224,7 +1225,7 @@ void DBIter::SeekToFirst() {
   PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_);
   // Don't use iter_::Seek() if we set a prefix extractor
   // because prefix seek will be used.
-  if (prefix_extractor_ != nullptr && !total_order_seek_) {
+  if (!expect_total_order_inner_iter()) {
     max_skip_ = std::numeric_limits<uint64_t>::max();
   }
   status_ = Status::OK();
@@ -1277,7 +1278,7 @@ void DBIter::SeekToLast() {
   PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_);
   // Don't use iter_::Seek() if we set a prefix extractor
   // because prefix seek will be used.
-  if (prefix_extractor_ != nullptr && !total_order_seek_) {
+  if (!expect_total_order_inner_iter()) {
     max_skip_ = std::numeric_limits<uint64_t>::max();
   }
   status_ = Status::OK();
diff --git a/db/db_iter.h b/db/db_iter.h
index e6e072c50..39e52fe94 100644
--- a/db/db_iter.h
+++ b/db/db_iter.h
@@ -257,6 +257,11 @@ class DBIter final : public Iterator {
     num_internal_keys_skipped_ = 0;
   }
 
+  bool expect_total_order_inner_iter() {
+    assert(expect_total_order_inner_iter_ || prefix_extractor_ != nullptr);
+    return expect_total_order_inner_iter_;
+  }
+
   const SliceTransform* prefix_extractor_;
   Env* const env_;
   Logger* logger_;
@@ -302,7 +307,9 @@ class DBIter final : public Iterator {
   // Means that we will pin all data blocks we read as long the Iterator
   // is not deleted, will be true if ReadOptions::pin_data is true
   const bool pin_thru_lifetime_;
-  const bool total_order_seek_;
+  // Expect the inner iterator to maintain a total order.
+  // prefix_extractor_ must be non-NULL if the value is false.
+  const bool expect_total_order_inner_iter_;
   bool allow_blob_;
   bool is_blob_;
   bool arena_mode_;
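As an aside, the refactor above collapses the repeated
`prefix_extractor_ != nullptr && !total_order_seek_` checks into a single
flag. A condensed restatement of how that flag is computed (an illustrative
standalone helper, not code from the patch):

    #include "rocksdb/slice_transform.h"

    // Mirrors the initializer added to DBIter's constructor above: the inner
    // iterator is expected to be total-order when no prefix extractor is
    // configured, when the caller asked for total_order_seek, or when
    // auto_prefix_mode is on (which must return total-order-consistent
    // results).
    static bool ExpectTotalOrderInnerIter(
        const rocksdb::SliceTransform* prefix_extractor,
        bool total_order_seek, bool auto_prefix_mode) {
      return prefix_extractor == nullptr || total_order_seek ||
             auto_prefix_mode;
    }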
diff --git a/db/db_test2.cc b/db/db_test2.cc
index 7ff8cd796..d9b2250d4 100644
--- a/db/db_test2.cc
+++ b/db/db_test2.cc
@@ -4404,6 +4404,112 @@ TEST_F(DBTest2, BlockBasedTablePrefixGetIndexNotFound) {
   ASSERT_EQ("ok", Get("b1"));
 }
 
+TEST_F(DBTest2, AutoPrefixMode1) {
+  // create a DB with block prefix index
+  BlockBasedTableOptions table_options;
+  Options options = CurrentOptions();
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+  options.statistics = CreateDBStatistics();
+
+  Reopen(options);
+
+  Random rnd(301);
+  std::string large_value = RandomString(&rnd, 500);
+
+  ASSERT_OK(Put("a1", large_value));
+  ASSERT_OK(Put("x1", large_value));
+  ASSERT_OK(Put("y1", large_value));
+  Flush();
+
+  ReadOptions ro;
+  ro.total_order_seek = false;
+  ro.auto_prefix_mode = true;
+  {
+    std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+    iterator->Seek("b1");
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_EQ("x1", iterator->key().ToString());
+    ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+  }
+
+  std::string ub_str = "b9";
+  Slice ub(ub_str);
+  ro.iterate_upper_bound = &ub;
+
+  {
+    std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+    iterator->Seek("b1");
+    ASSERT_FALSE(iterator->Valid());
+    ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+  }
+
+  ub_str = "z";
+  ub = Slice(ub_str);
+  {
+    std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+    iterator->Seek("b1");
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_EQ("x1", iterator->key().ToString());
+    ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+  }
+
+  ub_str = "c";
+  ub = Slice(ub_str);
+  {
+    std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+    iterator->Seek("b1");
+    ASSERT_FALSE(iterator->Valid());
+    ASSERT_EQ(2, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+  }
+
+  // The same queries without recreating iterator
+  {
+    ub_str = "b9";
+    ub = Slice(ub_str);
+    ro.iterate_upper_bound = &ub;
+
+    std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+    iterator->Seek("b1");
+    ASSERT_FALSE(iterator->Valid());
+    ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+
+    ub_str = "z";
+    ub = Slice(ub_str);
+
+    iterator->Seek("b1");
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_EQ("x1", iterator->key().ToString());
+    ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+
+    ub_str = "c";
+    ub = Slice(ub_str);
+
+    iterator->Seek("b1");
+    ASSERT_FALSE(iterator->Valid());
+    ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+
+    ub_str = "b9";
+    ub = Slice(ub_str);
+    ro.iterate_upper_bound = &ub;
+    iterator->SeekForPrev("b1");
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_EQ("a1", iterator->key().ToString());
+    ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+
+    ub_str = "zz";
+    ub = Slice(ub_str);
+    ro.iterate_upper_bound = &ub;
+    iterator->SeekToLast();
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_EQ("y1", iterator->key().ToString());
+
+    iterator->SeekToFirst();
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_EQ("a1", iterator->key().ToString());
+  }
+}
+
 }  // namespace rocksdb
 
 #ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
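The ticker counts in this test trace out the rule for when auto prefix mode
may consult the prefix bloom filter: every key in [seek key, upper bound)
must be guaranteed to share the seek key's prefix. A hypothetical standalone
restatement of that compatibility check (the real check lives in the filter
path; this helper and its exact shape are assumptions):

    #include "rocksdb/comparator.h"
    #include "rocksdb/slice.h"
    #include "rocksdb/slice_transform.h"

    // Prefix seek is compatible with the bound if the bound shares the seek
    // key's prefix (seek "b1", bound "b9"), or if the bound is the
    // same-length immediate successor of that prefix (seek "b1", bound "c"
    // for a 1-byte prefix). Bound "z" satisfies neither, so the iterator
    // stays in total order mode, matching the counts above.
    static bool PrefixSeekCompatible(
        const rocksdb::Slice& seek_user_key,
        const rocksdb::Slice& upper_bound,
        const rocksdb::SliceTransform* extractor,
        const rocksdb::Comparator* comparator) {
      if (!extractor->InDomain(seek_user_key) ||
          !extractor->InDomain(upper_bound)) {
        return false;  // cannot reason about prefixes; use total order seek
      }
      rocksdb::Slice prefix = extractor->Transform(seek_user_key);
      rocksdb::Slice ub_prefix = extractor->Transform(upper_bound);
      return comparator->Compare(prefix, ub_prefix) == 0 ||
             comparator->IsSameLengthImmediateSuccessor(prefix, upper_bound);
    }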
diff --git a/db/memtable.cc b/db/memtable.cc
index e3c531c31..c19656ccd 100644
--- a/db/memtable.cc
+++ b/db/memtable.cc
@@ -288,7 +288,9 @@ class MemTableIterator : public InternalIterator {
         !mem.GetImmutableMemTableOptions()->inplace_update_support) {
       if (use_range_del_table) {
         iter_ = mem.range_del_table_->GetIterator(arena);
-      } else if (prefix_extractor_ != nullptr && !read_options.total_order_seek) {
+      } else if (prefix_extractor_ != nullptr &&
+                 !read_options.total_order_seek &&
+                 !read_options.auto_prefix_mode) {
+        // Auto prefix mode is not implemented in memtable yet.
         bloom_ = mem.bloom_filter_.get();
         iter_ = mem.table_->GetDynamicPrefixIterator(arena);
       } else {
diff --git a/db/version_set.cc b/db/version_set.cc
index 217b7bad3..5ac0bb0d5 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -1047,8 +1047,8 @@ void LevelIterator::Seek(const Slice& target) {
     file_iter_.Seek(target);
   }
   if (SkipEmptyFileForward() && prefix_extractor_ != nullptr &&
-      !read_options_.total_order_seek && file_iter_.iter() != nullptr &&
-      file_iter_.Valid()) {
+      !read_options_.total_order_seek && !read_options_.auto_prefix_mode &&
+      file_iter_.iter() != nullptr && file_iter_.Valid()) {
     // We've skipped the file we initially positioned to. In the prefix
     // seek case, it is likely that the file is skipped because of
    // prefix bloom or hash, where more keys are skipped. We then check
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index dd9f00c87..6ed5f1536 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -803,9 +803,17 @@ Status StressTest::TestIterate(ThreadState* thread,
   ReadOptions readoptionscopy = read_opts;
   readoptionscopy.snapshot = snapshot;
 
+  bool expect_total_order = false;
   if (thread->rand.OneIn(16)) {
     // When prefix extractor is used, it's useful to cover total order seek.
     readoptionscopy.total_order_seek = true;
+    expect_total_order = true;
+  } else if (thread->rand.OneIn(4)) {
+    readoptionscopy.total_order_seek = false;
+    readoptionscopy.auto_prefix_mode = true;
+    expect_total_order = true;
+  } else if (options_.prefix_extractor.get() == nullptr) {
+    expect_total_order = true;
   }
 
   std::string upper_bound_str;
@@ -879,6 +887,8 @@ Status StressTest::TestIterate(ThreadState* thread,
   // Record some options to op_logs;
   op_logs += "total_order_seek: ";
   op_logs += (readoptionscopy.total_order_seek ? "1 " : "0 ");
+  op_logs += "auto_prefix_mode: ";
+  op_logs += (readoptionscopy.auto_prefix_mode ? "1 " : "0 ");
   if (readoptionscopy.iterate_upper_bound != nullptr) {
     op_logs += "ub: " + upper_bound.ToString(true) + " ";
   }
@@ -899,9 +909,7 @@ Status StressTest::TestIterate(ThreadState* thread,
   std::unique_ptr<Iterator> cmp_iter(db_->NewIterator(cmp_ro, cmp_cfh));
   bool diverged = false;
 
-  bool support_seek_first_or_last =
-      (options_.prefix_extractor.get() != nullptr) ||
-      readoptionscopy.total_order_seek;
+  bool support_seek_first_or_last = expect_total_order;
 
   LastIterateOp last_op;
   if (support_seek_first_or_last && thread->rand.OneIn(100)) {
@@ -929,8 +937,7 @@ Status StressTest::TestIterate(ThreadState* thread,
                     last_op, key, op_logs, &diverged);
 
   bool no_reverse =
-      (FLAGS_memtablerep == "prefix_hash" && !read_opts.total_order_seek &&
-       options_.prefix_extractor.get() != nullptr);
+      (FLAGS_memtablerep == "prefix_hash" && !expect_total_order);
   for (uint64_t i = 0; i < FLAGS_num_iterations && iter->Valid(); i++) {
     if (no_reverse || thread->rand.OneIn(2)) {
       iter->Next();
@@ -1040,8 +1047,9 @@ void StressTest::VerifyIterator(ThreadState* thread,
     return;
   }
 
-  const SliceTransform* pe =
-      ro.total_order_seek ? nullptr : options_.prefix_extractor.get();
+  const SliceTransform* pe = (ro.total_order_seek || ro.auto_prefix_mode)
+                                 ? nullptr
+                                 : options_.prefix_extractor.get();
   const Comparator* cmp = options_.comparator;
 
   if (iter->Valid() && !cmp_iter->Valid()) {
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 86e8f8280..13ebb382b 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1251,6 +1251,11 @@ struct ReadOptions {
   // changing implementation of prefix extractor.
   bool total_order_seek;
 
+  // When true, iterators behave the same as with total_order_seek = true,
+  // but RocksDB can selectively enable prefix seek mode when it won't
+  // generate a different result, based on the seek key and iterator upper bound.
+  bool auto_prefix_mode;
+
   // Enforce that the iterator only iterates over the same prefix as the seek.
   // This option is effective only for prefix seeks, i.e. prefix_extractor is
   // non-null for the column family and total_order_seek is false. Unlike
diff --git a/options/options.cc b/options/options.cc
index db1cd7130..ffa0afe89 100644
--- a/options/options.cc
+++ b/options/options.cc
@@ -592,6 +592,7 @@ ReadOptions::ReadOptions()
       tailing(false),
       managed(false),
       total_order_seek(false),
+      auto_prefix_mode(false),
       prefix_same_as_start(false),
       pin_data(false),
       background_purge_on_iterator_cleanup(false),
@@ -611,6 +612,7 @@ ReadOptions::ReadOptions(bool cksum, bool cache)
      tailing(false),
       managed(false),
       total_order_seek(false),
+      auto_prefix_mode(false),
       prefix_same_as_start(false),
       pin_data(false),
       background_purge_on_iterator_cleanup(false),
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index 772a88a36..336669f1b 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -2788,7 +2788,7 @@ void BlockBasedTableIterator<TBlockIter, TValue>::SeekImpl(
     const Slice* target) {
   is_out_of_bound_ = false;
   is_at_first_key_from_index_ = false;
-  if (target && !CheckPrefixMayMatch(*target)) {
+  if (target && !CheckPrefixMayMatch(*target, IterDirection::kForward)) {
     ResetDataIter();
     return;
   }
@@ -2881,7 +2881,9 @@ void BlockBasedTableIterator<TBlockIter, TValue>::SeekForPrev(
     const Slice& target) {
   is_out_of_bound_ = false;
   is_at_first_key_from_index_ = false;
-  if (!CheckPrefixMayMatch(target)) {
+  // For now, disable the prefix check entirely for backward seeks in auto
+  // prefix mode; the upper bound cannot validate prefix seek going backward.
+  if (!CheckPrefixMayMatch(target, IterDirection::kBackward)) {
     ResetDataIter();
     return;
   }
@@ -3202,6 +3204,7 @@ InternalIterator* BlockBasedTable::NewIterator(
     size_t compaction_readahead_size) {
   BlockCacheLookupContext lookup_context{caller};
   bool need_upper_bound_check =
+      read_options.auto_prefix_mode ||
       PrefixExtractorChanged(rep_->table_properties.get(), prefix_extractor);
   if (arena == nullptr) {
     return new BlockBasedTableIterator<DataBlockIter>(
diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h
index a77a63df5..998a5b579 100644
--- a/table/block_based/block_based_table_reader.h
+++ b/table/block_based/block_based_table_reader.h
@@ -725,20 +725,6 @@ class BlockBasedTableIterator : public InternalIteratorBase<TValue> {
            block_iter_points_to_real_block_;
   }
 
-  bool CheckPrefixMayMatch(const Slice& ikey) {
-    if (check_filter_ &&
-        !table_->PrefixMayMatch(ikey, read_options_, prefix_extractor_,
-                                need_upper_bound_check_, &lookup_context_)) {
-      // TODO remember the iterator is invalidated because of prefix
-      // match. This can avoid the upper level file iterator to falsely
-      // believe the position is the end of the SST file and move to
-      // the first key of the next file.
-      ResetDataIter();
-      return false;
-    }
-    return true;
-  }
-
   void ResetDataIter() {
     if (block_iter_points_to_real_block_) {
       if (pinned_iters_mgr_ != nullptr && pinned_iters_mgr_->PinningEnabled()) {
@@ -758,6 +744,11 @@ class BlockBasedTableIterator : public InternalIteratorBase<TValue> {
   }
 
  private:
+  enum class IterDirection {
+    kForward,
+    kBackward,
+  };
+
   const BlockBasedTable* table_;
   const ReadOptions read_options_;
   const InternalKeyComparator& icomp_;
@@ -807,6 +798,26 @@ class BlockBasedTableIterator : public InternalIteratorBase<TValue> {
   // Note MyRocks may update iterate bounds between seek. To workaround it,
   // we need to check and update data_block_within_upper_bound_ accordingly.
   void CheckDataBlockWithinUpperBound();
+
+  bool CheckPrefixMayMatch(const Slice& ikey, IterDirection direction) {
+    if (need_upper_bound_check_ && direction == IterDirection::kBackward) {
+      // An upper bound check isn't sufficient for the backward direction
+      // to guarantee the same result as total order, so disable the
+      // prefix check.
+      return true;
+    }
+    if (check_filter_ &&
+        !table_->PrefixMayMatch(ikey, read_options_, prefix_extractor_,
+                                need_upper_bound_check_, &lookup_context_)) {
+      // TODO remember that the iterator is invalidated because of a prefix
+      // mismatch. This can prevent the upper-level file iterator from
+      // falsely believing the position is the end of the SST file and
+      // moving to the first key of the next file.
+      ResetDataIter();
+      return false;
+    }
+    return true;
+  }
diff --git a/table/block_based/filter_block.h b/table/block_based/filter_block.h
index 90766b114..befc5e30c 100644
--- a/table/block_based/filter_block.h
+++ b/table/block_based/filter_block.h
@@ -160,9 +160,11 @@ class FilterBlockReader {
                               const SliceTransform* prefix_extractor,
                               const Comparator* /*comparator*/,
                               const Slice* const const_ikey_ptr,
-                              bool* filter_checked,
-                              bool /*need_upper_bound_check*/,
+                              bool* filter_checked, bool need_upper_bound_check,
                               BlockCacheLookupContext* lookup_context) {
+    if (need_upper_bound_check) {
+      return true;
+    }
     *filter_checked = true;
     Slice prefix = prefix_extractor->Transform(user_key);
     return PrefixMayMatch(prefix, prefix_extractor, kNotValid, false,
diff --git a/table/plain/plain_table_reader.cc b/table/plain/plain_table_reader.cc
index 15cd32d0b..2b5ed9b9b 100644
--- a/table/plain/plain_table_reader.cc
+++ b/table/plain/plain_table_reader.cc
@@ -205,7 +205,9 @@ InternalIterator* PlainTableReader::NewIterator(
   // Not necessarily used here, but make sure this has been initialized
   assert(table_properties_);
 
-  bool use_prefix_seek = !IsTotalOrderMode() && !options.total_order_seek;
+  // Auto prefix mode is not implemented in PlainTable.
+  bool use_prefix_seek = !IsTotalOrderMode() && !options.total_order_seek &&
+                         !options.auto_prefix_mode;
   if (arena == nullptr) {
     return new PlainTableIterator(this, use_prefix_seek);
   } else {
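Finally, a sketch of why backward seeks skip the prefix check (bug fix (1)
from the summary). It assumes a DB populated as in the AutoPrefixMode1 test
above, with keys {"a1", "x1", "y1"} and a one-byte prefix extractor; the
function name is illustrative:

    #include <cassert>
    #include <memory>

    #include "rocksdb/db.h"

    // SeekForPrev("b1") must land on "a1", the largest key <= "b1". "a1"
    // carries prefix "a", not the target's prefix "b", so a bloom lookup on
    // "b" says nothing about the correct answer, and the upper bound only
    // constrains keys above the target. Hence CheckPrefixMayMatch() declines
    // to prune for IterDirection::kBackward when need_upper_bound_check_ is
    // set.
    void BackwardSeekIgnoresPrefixFilter(rocksdb::DB* db) {
      rocksdb::ReadOptions ro;
      ro.auto_prefix_mode = true;
      std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ro));
      it->SeekForPrev("b1");
      assert(it->Valid());
      assert(it->key().ToString() == "a1");  // the total-order-correct result
    }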