Add ReadOptions.auto_prefix_mode (#6314)

Summary: Add a new option ReadOptions.auto_prefix_mode. When set to true, iterator should return the same result as total order seek, but may choose to do prefix seek internally, based on iterator upper bounds. Also fix two previous bugs when handling prefix extrator changes: (1) reverse iterator should not rely on upper bound to determine prefix. Fix it with skipping prefix check. (2) block-based filter is not handled properly. Pull Request resolved: https://github.com/facebook/rocksdb/pull/6314 Test Plan: (1) add a unit test; (2) add the check to stress test and run see whether it can pass at least one run. Differential Revision: D19458717 fbshipit-source-id: 51c1bcc5cdd826c2469af201979a39600e779bce
6 years ago · 8f2bee6747
parent 431fb6c0ba
commit 8f2bee6747
13 changed files with 189 additions and 39 deletions
--- a/HISTORY.md
+++ b/HISTORY.md
@ -25,6 +25,7 @@
 * It is now possible to enable periodic compactions for the base DB when using BlobDB.
 * BlobDB now garbage collects non-TTL blobs when `enable_garbage_collection` is set to `true` in `BlobDBOptions`. Garbage collection is performed during compaction: any valid blobs located in the oldest N files (where N is the number of non-TTL blob files multiplied by the value of `BlobDBOptions::garbage_collection_cutoff`) encountered during compaction get relocated to new blob files, and old blob files are dropped once they are no longer needed. Note: we recommend enabling periodic compactions for the base DB when using this feature to deal with the case when some old blob files are kept alive by SSTs that otherwise do not get picked for compaction.
 * `db_bench` now supports the `garbage_collection_cutoff` option for BlobDB.
+* Introduce ReadOptions.auto_prefix_mode. When set to true, iterator will return the same result as total order seek, but may choose to use prefix seek internally based on seek key and iterator upper bound.

 ## 6.6.2 (01/13/2020)
 ### Bug Fixes
--- a/db/db_iter.cc
+++ b/db/db_iter.cc
@ -73,7 +73,9 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options,
                                ? read_options.prefix_same_as_start
                                : false),
      pin_thru_lifetime_(read_options.pin_data),
-      total_order_seek_(read_options.total_order_seek),
+      expect_total_order_inner_iter_(prefix_extractor_ == nullptr ||
+                                     read_options.total_order_seek ||
+                                     read_options.auto_prefix_mode),
      allow_blob_(allow_blob),
      is_blob_(false),
      arena_mode_(arena_mode),
@ -567,7 +569,7 @@ bool DBIter::ReverseToForward() {
  // When moving backwards, iter_ is positioned on _previous_ key, which may
  // not exist or may have different prefix than the current key().
  // If that's the case, seek iter_ to current key.
-  if ((prefix_extractor_ != nullptr && !total_order_seek_) || !iter_.Valid()) {
+  if (!expect_total_order_inner_iter() || !iter_.Valid()) {
    IterKey last_key;
    last_key.SetInternalKey(ParsedInternalKey(
        saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek));
@ -603,15 +605,14 @@ bool DBIter::ReverseToBackward() {
  // key, which may not exist or may have prefix different from current.
  // If that's the case, seek to saved_key_.
  if (current_entry_is_merged_ &&
-      ((prefix_extractor_ != nullptr && !total_order_seek_) ||
-       !iter_.Valid())) {
+      (!expect_total_order_inner_iter() || !iter_.Valid())) {
    IterKey last_key;
    // Using kMaxSequenceNumber and kValueTypeForSeek
    // (not kValueTypeForSeekForPrev) to seek to a key strictly smaller
    // than saved_key_.
    last_key.SetInternalKey(ParsedInternalKey(
        saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek));
-    if (prefix_extractor_ != nullptr && !total_order_seek_) {
+    if (!expect_total_order_inner_iter()) {
      iter_.SeekForPrev(last_key.GetInternalKey());
    } else {
      // Some iterators may not support SeekForPrev(), so we avoid using it
@ -978,8 +979,8 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() {
  // Make sure we leave iter_ in a good state. If it's valid and we don't care
  // about prefixes, that's already good enough. Otherwise it needs to be
  // seeked to the current key.
-  if ((prefix_extractor_ != nullptr && !total_order_seek_) || !iter_.Valid()) {
-    if (prefix_extractor_ != nullptr && !total_order_seek_) {
+  if (!expect_total_order_inner_iter() || !iter_.Valid()) {
+    if (!expect_total_order_inner_iter()) {
      iter_.SeekForPrev(last_key);
    } else {
      iter_.Seek(last_key);
@ -1224,7 +1225,7 @@ void DBIter::SeekToFirst() {
  PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_);
  // Don't use iter_::Seek() if we set a prefix extractor
  // because prefix seek will be used.
-  if (prefix_extractor_ != nullptr && !total_order_seek_) {
+  if (!expect_total_order_inner_iter()) {
    max_skip_ = std::numeric_limits<uint64_t>::max();
  }
  status_ = Status::OK();
@ -1277,7 +1278,7 @@ void DBIter::SeekToLast() {
  PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_);
  // Don't use iter_::Seek() if we set a prefix extractor
  // because prefix seek will be used.
-  if (prefix_extractor_ != nullptr && !total_order_seek_) {
+  if (!expect_total_order_inner_iter()) {
    max_skip_ = std::numeric_limits<uint64_t>::max();
  }
  status_ = Status::OK();
--- a/db/db_iter.h
+++ b/db/db_iter.h
@ -257,6 +257,11 @@ class DBIter final : public Iterator {
    num_internal_keys_skipped_ = 0;
  }

+  bool expect_total_order_inner_iter() {
+    assert(expect_total_order_inner_iter_ || prefix_extractor_ != nullptr);
+    return expect_total_order_inner_iter_;
+  }
+
  const SliceTransform* prefix_extractor_;
  Env* const env_;
  Logger* logger_;
@ -302,7 +307,9 @@ class DBIter final : public Iterator {
  // Means that we will pin all data blocks we read as long the Iterator
  // is not deleted, will be true if ReadOptions::pin_data is true
  const bool pin_thru_lifetime_;
-  const bool total_order_seek_;
+  // Expect the inner iterator to maintain a total order.
+  // prefix_extractor_ must be non-NULL if the value is false.
+  const bool expect_total_order_inner_iter_;
  bool allow_blob_;
  bool is_blob_;
  bool arena_mode_;
--- a/db/db_test2.cc
+++ b/db/db_test2.cc
@ -4404,6 +4404,112 @@ TEST_F(DBTest2, BlockBasedTablePrefixGetIndexNotFound) {
  ASSERT_EQ("ok", Get("b1"));
 }

+TEST_F(DBTest2, AutoPrefixMode1) {
+  // create a DB with block prefix index
+  BlockBasedTableOptions table_options;
+  Options options = CurrentOptions();
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+  options.statistics = CreateDBStatistics();
+
+  Reopen(options);
+
+  Random rnd(301);
+  std::string large_value = RandomString(&rnd, 500);
+
+  ASSERT_OK(Put("a1", large_value));
+  ASSERT_OK(Put("x1", large_value));
+  ASSERT_OK(Put("y1", large_value));
+  Flush();
+
+  ReadOptions ro;
+  ro.total_order_seek = false;
+  ro.auto_prefix_mode = true;
+  {
+    std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+    iterator->Seek("b1");
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_EQ("x1", iterator->key().ToString());
+    ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+  }
+
+  std::string ub_str = "b9";
+  Slice ub(ub_str);
+  ro.iterate_upper_bound = &ub;
+
+  {
+    std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+    iterator->Seek("b1");
+    ASSERT_FALSE(iterator->Valid());
+    ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+  }
+
+  ub_str = "z";
+  ub = Slice(ub_str);
+  {
+    std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+    iterator->Seek("b1");
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_EQ("x1", iterator->key().ToString());
+    ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+  }
+
+  ub_str = "c";
+  ub = Slice(ub_str);
+  {
+    std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+    iterator->Seek("b1");
+    ASSERT_FALSE(iterator->Valid());
+    ASSERT_EQ(2, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+  }
+
+  // The same queries without recreating iterator
+  {
+    ub_str = "b9";
+    ub = Slice(ub_str);
+    ro.iterate_upper_bound = &ub;
+
+    std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+    iterator->Seek("b1");
+    ASSERT_FALSE(iterator->Valid());
+    ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+
+    ub_str = "z";
+    ub = Slice(ub_str);
+
+    iterator->Seek("b1");
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_EQ("x1", iterator->key().ToString());
+    ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+
+    ub_str = "c";
+    ub = Slice(ub_str);
+
+    iterator->Seek("b1");
+    ASSERT_FALSE(iterator->Valid());
+    ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+
+    ub_str = "b9";
+    ub = Slice(ub_str);
+    ro.iterate_upper_bound = &ub;
+    iterator->SeekForPrev("b1");
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_EQ("a1", iterator->key().ToString());
+    ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+
+    ub_str = "zz";
+    ub = Slice(ub_str);
+    ro.iterate_upper_bound = &ub;
+    iterator->SeekToLast();
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_EQ("y1", iterator->key().ToString());
+
+    iterator->SeekToFirst();
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_EQ("a1", iterator->key().ToString());
+  }
+}
 }  // namespace rocksdb

 #ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
--- a/db/memtable.cc
+++ b/db/memtable.cc
@ -288,7 +288,9 @@ class MemTableIterator : public InternalIterator {
            !mem.GetImmutableMemTableOptions()->inplace_update_support) {
    if (use_range_del_table) {
      iter_ = mem.range_del_table_->GetIterator(arena);
-    } else if (prefix_extractor_ != nullptr && !read_options.total_order_seek) {
+    } else if (prefix_extractor_ != nullptr && !read_options.total_order_seek &&
+               !read_options.auto_prefix_mode) {
+      // Auto prefix mode is not implemented in memtable yet.
      bloom_ = mem.bloom_filter_.get();
      iter_ = mem.table_->GetDynamicPrefixIterator(arena);
    } else {
--- a/db/version_set.cc
+++ b/db/version_set.cc
@ -1047,8 +1047,8 @@ void LevelIterator::Seek(const Slice& target) {
    file_iter_.Seek(target);
  }
  if (SkipEmptyFileForward() && prefix_extractor_ != nullptr &&
-      !read_options_.total_order_seek && file_iter_.iter() != nullptr &&
-      file_iter_.Valid()) {
+      !read_options_.total_order_seek && !read_options_.auto_prefix_mode &&
+      file_iter_.iter() != nullptr && file_iter_.Valid()) {
    // We've skipped the file we initially positioned to. In the prefix
    // seek case, it is likely that the file is skipped because of
    // prefix bloom or hash, where more keys are skipped. We then check
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@ -803,9 +803,17 @@ Status StressTest::TestIterate(ThreadState* thread,
  ReadOptions readoptionscopy = read_opts;
  readoptionscopy.snapshot = snapshot;

+  bool expect_total_order = false;
  if (thread->rand.OneIn(16)) {
    // When prefix extractor is used, it's useful to cover total order seek.
    readoptionscopy.total_order_seek = true;
+    expect_total_order = true;
+  } else if (thread->rand.OneIn(4)) {
+    readoptionscopy.total_order_seek = false;
+    readoptionscopy.auto_prefix_mode = true;
+    expect_total_order = true;
+  } else if (options_.prefix_extractor.get() == nullptr) {
+    expect_total_order = true;
  }

  std::string upper_bound_str;
@ -879,6 +887,8 @@ Status StressTest::TestIterate(ThreadState* thread,
    // Record some options to op_logs;
    op_logs += "total_order_seek: ";
    op_logs += (readoptionscopy.total_order_seek ? "1 " : "0 ");
+    op_logs += "auto_prefix_mode: ";
+    op_logs += (readoptionscopy.auto_prefix_mode ? "1 " : "0 ");
    if (readoptionscopy.iterate_upper_bound != nullptr) {
      op_logs += "ub: " + upper_bound.ToString(true) + " ";
    }
@ -899,9 +909,7 @@ Status StressTest::TestIterate(ThreadState* thread,
    std::unique_ptr<Iterator> cmp_iter(db_->NewIterator(cmp_ro, cmp_cfh));
    bool diverged = false;

-    bool support_seek_first_or_last =
-        (options_.prefix_extractor.get() != nullptr) ||
-        readoptionscopy.total_order_seek;
+    bool support_seek_first_or_last = expect_total_order;

    LastIterateOp last_op;
    if (support_seek_first_or_last && thread->rand.OneIn(100)) {
@ -929,8 +937,7 @@ Status StressTest::TestIterate(ThreadState* thread,
                   last_op, key, op_logs, &diverged);

    bool no_reverse =
-        (FLAGS_memtablerep == "prefix_hash" && !read_opts.total_order_seek &&
-         options_.prefix_extractor.get() != nullptr);
+        (FLAGS_memtablerep == "prefix_hash" && !expect_total_order);
    for (uint64_t i = 0; i < FLAGS_num_iterations && iter->Valid(); i++) {
      if (no_reverse || thread->rand.OneIn(2)) {
        iter->Next();
@ -1040,8 +1047,9 @@ void StressTest::VerifyIterator(ThreadState* thread,
    return;
  }

-  const SliceTransform* pe =
-      ro.total_order_seek ? nullptr : options_.prefix_extractor.get();
+  const SliceTransform* pe = (ro.total_order_seek || ro.auto_prefix_mode)
+                                 ? nullptr
+                                 : options_.prefix_extractor.get();
  const Comparator* cmp = options_.comparator;

  if (iter->Valid() && !cmp_iter->Valid()) {
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@ -1251,6 +1251,11 @@ struct ReadOptions {
  // changing implementation of prefix extractor.
  bool total_order_seek;

+  // When true, by default use total_order_seek = true, and RocksDB can
+  // selectively enable prefix seek mode if won't generate a different result
+  // from total_order_seek, based on seek key, and iterator upper bound.
+  bool auto_prefix_mode;
+
  // Enforce that the iterator only iterates over the same prefix as the seek.
  // This option is effective only for prefix seeks, i.e. prefix_extractor is
  // non-null for the column family and total_order_seek is false.  Unlike
--- a/options/options.cc
+++ b/options/options.cc
@ -592,6 +592,7 @@ ReadOptions::ReadOptions()
      tailing(false),
      managed(false),
      total_order_seek(false),
+      auto_prefix_mode(false),
      prefix_same_as_start(false),
      pin_data(false),
      background_purge_on_iterator_cleanup(false),
@ -611,6 +612,7 @@ ReadOptions::ReadOptions(bool cksum, bool cache)
      tailing(false),
      managed(false),
      total_order_seek(false),
+      auto_prefix_mode(false),
      prefix_same_as_start(false),
      pin_data(false),
      background_purge_on_iterator_cleanup(false),
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@ -2788,7 +2788,7 @@ void BlockBasedTableIterator<TBlockIter, TValue>::SeekImpl(
    const Slice* target) {
  is_out_of_bound_ = false;
  is_at_first_key_from_index_ = false;
-  if (target && !CheckPrefixMayMatch(*target)) {
+  if (target && !CheckPrefixMayMatch(*target, IterDirection::kForward)) {
    ResetDataIter();
    return;
  }
@ -2881,7 +2881,9 @@ void BlockBasedTableIterator<TBlockIter, TValue>::SeekForPrev(
    const Slice& target) {
  is_out_of_bound_ = false;
  is_at_first_key_from_index_ = false;
-  if (!CheckPrefixMayMatch(target)) {
+  // For now totally disable prefix seek in auto prefix mode because we don't
+  // have logic
+  if (!CheckPrefixMayMatch(target, IterDirection::kBackward)) {
    ResetDataIter();
    return;
  }
@ -3202,6 +3204,7 @@ InternalIterator* BlockBasedTable::NewIterator(
    size_t compaction_readahead_size) {
  BlockCacheLookupContext lookup_context{caller};
  bool need_upper_bound_check =
+      read_options.auto_prefix_mode ||
      PrefixExtractorChanged(rep_->table_properties.get(), prefix_extractor);
  if (arena == nullptr) {
    return new BlockBasedTableIterator<DataBlockIter>(
--- a/table/block_based/block_based_table_reader.h
+++ b/table/block_based/block_based_table_reader.h
@ -725,20 +725,6 @@ class BlockBasedTableIterator : public InternalIteratorBase<TValue> {
           block_iter_points_to_real_block_;
  }

-  bool CheckPrefixMayMatch(const Slice& ikey) {
-    if (check_filter_ &&
-        !table_->PrefixMayMatch(ikey, read_options_, prefix_extractor_,
-                                need_upper_bound_check_, &lookup_context_)) {
-      // TODO remember the iterator is invalidated because of prefix
-      // match. This can avoid the upper level file iterator to falsely
-      // believe the position is the end of the SST file and move to
-      // the first key of the next file.
-      ResetDataIter();
-      return false;
-    }
-    return true;
-  }
-
  void ResetDataIter() {
    if (block_iter_points_to_real_block_) {
      if (pinned_iters_mgr_ != nullptr && pinned_iters_mgr_->PinningEnabled()) {
@ -758,6 +744,11 @@ class BlockBasedTableIterator : public InternalIteratorBase<TValue> {
  }

 private:
+  enum class IterDirection {
+    kForward,
+    kBackward,
+  };
+
  const BlockBasedTable* table_;
  const ReadOptions read_options_;
  const InternalKeyComparator& icomp_;
@ -807,6 +798,26 @@ class BlockBasedTableIterator : public InternalIteratorBase<TValue> {
  // Note MyRocks may update iterate bounds between seek. To workaround it,
  // we need to check and update data_block_within_upper_bound_ accordingly.
  void CheckDataBlockWithinUpperBound();
+
+  bool CheckPrefixMayMatch(const Slice& ikey, IterDirection direction) {
+    if (need_upper_bound_check_ && direction == IterDirection::kBackward) {
+      // Upper bound check isn't sufficnet for backward direction to
+      // guarantee the same result as total order, so disable prefix
+      // check.
+      return true;
+    }
+    if (check_filter_ &&
+        !table_->PrefixMayMatch(ikey, read_options_, prefix_extractor_,
+                                need_upper_bound_check_, &lookup_context_)) {
+      // TODO remember the iterator is invalidated because of prefix
+      // match. This can avoid the upper level file iterator to falsely
+      // believe the position is the end of the SST file and move to
+      // the first key of the next file.
+      ResetDataIter();
+      return false;
+    }
+    return true;
+  }
 };

 }  // namespace rocksdb
--- a/table/block_based/filter_block.h
+++ b/table/block_based/filter_block.h
@ -160,9 +160,11 @@ class FilterBlockReader {
                             const SliceTransform* prefix_extractor,
                             const Comparator* /*comparator*/,
                             const Slice* const const_ikey_ptr,
-                             bool* filter_checked,
-                             bool /*need_upper_bound_check*/,
+                             bool* filter_checked, bool need_upper_bound_check,
                             BlockCacheLookupContext* lookup_context) {
+    if (need_upper_bound_check) {
+      return true;
+    }
    *filter_checked = true;
    Slice prefix = prefix_extractor->Transform(user_key);
    return PrefixMayMatch(prefix, prefix_extractor, kNotValid, false,
--- a/table/plain/plain_table_reader.cc
+++ b/table/plain/plain_table_reader.cc
@ -205,7 +205,9 @@ InternalIterator* PlainTableReader::NewIterator(
  // Not necessarily used here, but make sure this has been initialized
  assert(table_properties_);

-  bool use_prefix_seek = !IsTotalOrderMode() && !options.total_order_seek;
+  // Auto prefix mode is not implemented in PlainTable.
+  bool use_prefix_seek = !IsTotalOrderMode() && !options.total_order_seek &&
+                         !options.auto_prefix_mode;
  if (arena == nullptr) {
    return new PlainTableIterator(this, use_prefix_seek);
  } else {