From ad135f3ffdf9a932000d289101867a43f101d5e4 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@fb.com>
Date: Mon, 13 Jun 2022 11:08:50 -0700
Subject: [PATCH] Document design/specification bugs with auto_prefix_mode
 (#10144)

Summary:
auto_prefix_mode is designed to use prefix filtering in a
particular "safe" set of cases where the upper bound and the seek key
have different prefixes: where the upper bound is the "same length
immediate successor". These conditions are not sufficient to guarantee
the same iteration results as total_order_seek if the DB contains
"short" keys, less than the "full" (maximum) prefix length.

We are not simply disabling the optimization in these successor cases
because it is likely that users are essentially getting what they want
out of existing usage. Especially if users are constructing successor
bounds with the intention of doing a prefix-bounded seek, the existing
behavior is more expected than the total_order_seek behavior.
Consequently, for now we reconcile the bad specification of behavior by
documenting the existing mismatch with total_order_seek.

A closely related issue affects hypothetical comparators like
ReverseBytewiseComparator: if they "correctly" implement
IsSameLengthImmediateSuccessor, auto_prefix_mode could omit more
entries (other than "short" keys noted above). Luckily, the built-in
ReverseBytewiseComparator has an "incorrect" implementation of
IsSameLengthImmediateSuccessor that effectively prevents prefix
optimization and, thus, the bug. This is now documented as a new
constraint on IsSameLengthImmediateSuccessor, and the implementation
tweaked to be simply "safe" rather than "incorrect".

This change also includes unit test updates to demonstrate the above
issues. (Test was cleaned up for readability and simplicity.)

Intended follow-up:
* Tweak documented axioms for prefix_extractor (more details then)
* Consider some sort of fix for this case. I don't know what that would
look like without breaking the performance of existing code. Perhaps
if all keys in an SST file have prefixes that are "full length," we can track
that fact and use it to allow optimization with the "same length
immediate successor", but that would only apply to new files.
* Consider a better system of specifying prefix bounds

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10144

Test Plan: test updates included

Reviewed By: siying

Differential Revision: D37052710

Pulled By: pdillinger

fbshipit-source-id: 5f63b7d65f3f214e4b143e0f9aa1749527c587db
---
 HISTORY.md                        |   2 +
 db/comparator_db_test.cc          |  46 +++++---
 db/db_test2.cc                    | 189 +++++++++++++++++++++++++-----
 include/rocksdb/comparator.h      |   4 +
 include/rocksdb/options.h         |  11 ++
 include/rocksdb/slice_transform.h |   5 +-
 util/comparator.cc                |  10 ++
 7 files changed, 222 insertions(+), 45 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index 070a67b4d..736882d6c 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -22,6 +22,8 @@
   * `rocksdb_comparator_with_ts_create` to create timestamp aware comparator
   * Put, Get, Delete, SingleDelete, MultiGet APIs has corresponding timestamp aware APIs with suffix `with_ts`
   * And Add C API's for Transaction, SstFileWriter, Compaction as mentioned [here](https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-(Experimental))
+* The contract for implementations of Comparator::IsSameLengthImmediateSuccessor has been updated to work around a design bug in `auto_prefix_mode`.
+* The API documentation for `auto_prefix_mode` now notes some corner cases in which it returns different results than `total_order_seek`, due to design bugs that are not easily fixed. Users using built-in comparators and keys at least the size of a fixed prefix length are not affected.
 
 ### New Features
 * Add FileSystem::ReadAsync API in io_tracing
diff --git a/db/comparator_db_test.cc b/db/comparator_db_test.cc
index c1473a0fe..34a4776f0 100644
--- a/db/comparator_db_test.cc
+++ b/db/comparator_db_test.cc
@@ -449,67 +449,85 @@ TEST_P(ComparatorDBTest, TwoStrComparator) {
   }
 }
 
+namespace {
+void VerifyNotSuccessor(const Slice& s, const Slice& t) {
+  auto bc = BytewiseComparator();
+  auto rbc = ReverseBytewiseComparator();
+  ASSERT_FALSE(bc->IsSameLengthImmediateSuccessor(s, t));
+  ASSERT_FALSE(rbc->IsSameLengthImmediateSuccessor(s, t));
+  ASSERT_FALSE(bc->IsSameLengthImmediateSuccessor(t, s));
+  ASSERT_FALSE(rbc->IsSameLengthImmediateSuccessor(t, s));
+}
+
+void VerifySuccessor(const Slice& s, const Slice& t) {
+  auto bc = BytewiseComparator();
+  auto rbc = ReverseBytewiseComparator();
+  ASSERT_TRUE(bc->IsSameLengthImmediateSuccessor(s, t));
+  ASSERT_FALSE(rbc->IsSameLengthImmediateSuccessor(s, t));
+  ASSERT_FALSE(bc->IsSameLengthImmediateSuccessor(t, s));
+  // Should be true but that increases exposure to a design bug in
+  // auto_prefix_mode, so currently set to FALSE
+  ASSERT_FALSE(rbc->IsSameLengthImmediateSuccessor(t, s));
+}
+
+}  // namespace
+
 TEST_P(ComparatorDBTest, IsSameLengthImmediateSuccessor) {
   {
     // different length
     Slice s("abcxy");
     Slice t("abcxyz");
-    ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
+    VerifyNotSuccessor(s, t);
   }
   {
     Slice s("abcxyz");
     Slice t("abcxy");
-    ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
+    VerifyNotSuccessor(s, t);
   }
   {
     // not last byte different
     Slice s("abc1xyz");
     Slice t("abc2xyz");
-    ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
+    VerifyNotSuccessor(s, t);
   }
   {
     // same string
     Slice s("abcxyz");
     Slice t("abcxyz");
-    ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
+    VerifyNotSuccessor(s, t);
   }
   {
     Slice s("abcxy");
     Slice t("abcxz");
-    ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
-  }
-  {
-    Slice s("abcxz");
-    Slice t("abcxy");
-    ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
+    VerifySuccessor(s, t);
   }
   {
     const char s_array[] = "\x50\x8a\xac";
     const char t_array[] = "\x50\x8a\xad";
     Slice s(s_array);
     Slice t(t_array);
-    ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
+    VerifySuccessor(s, t);
   }
   {
     const char s_array[] = "\x50\x8a\xff";
     const char t_array[] = "\x50\x8b\x00";
     Slice s(s_array, 3);
     Slice t(t_array, 3);
-    ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
+    VerifySuccessor(s, t);
   }
   {
     const char s_array[] = "\x50\x8a\xff\xff";
     const char t_array[] = "\x50\x8b\x00\x00";
     Slice s(s_array, 4);
     Slice t(t_array, 4);
-    ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
+    VerifySuccessor(s, t);
   }
   {
     const char s_array[] = "\x50\x8a\xff\xff";
     const char t_array[] = "\x50\x8b\x00\x01";
     Slice s(s_array, 4);
     Slice t(t_array, 4);
-    ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
+    VerifyNotSuccessor(s, t);
   }
 }
 
diff --git a/db/db_test2.cc b/db/db_test2.cc
index 493e01f35..22d2fbcf9 100644
--- a/db/db_test2.cc
+++ b/db/db_test2.cc
@@ -6376,86 +6376,85 @@ TEST_F(DBTest2, AutoPrefixMode1) {
     ReadOptions ro;
     ro.total_order_seek = false;
     ro.auto_prefix_mode = true;
+
+    const auto stat = BLOOM_FILTER_PREFIX_CHECKED;
     {
       std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
       iterator->Seek("b1");
       ASSERT_TRUE(iterator->Valid());
       ASSERT_EQ("x1", iterator->key().ToString());
-      ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+      EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
       ASSERT_OK(iterator->status());
     }
 
-    std::string ub_str = "b9";
-    Slice ub(ub_str);
+    Slice ub;
     ro.iterate_upper_bound = &ub;
 
+    ub = "b9";
     {
       std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
       iterator->Seek("b1");
       ASSERT_FALSE(iterator->Valid());
-      ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+      EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
       ASSERT_OK(iterator->status());
     }
 
-    ub_str = "z";
-    ub = Slice(ub_str);
+    ub = "z";
     {
       std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
       iterator->Seek("b1");
       ASSERT_TRUE(iterator->Valid());
       ASSERT_EQ("x1", iterator->key().ToString());
-      ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+      EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
       ASSERT_OK(iterator->status());
     }
 
-    ub_str = "c";
-    ub = Slice(ub_str);
+    ub = "c";
     {
       std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
       iterator->Seek("b1");
       ASSERT_FALSE(iterator->Valid());
-      ASSERT_EQ(2, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+      EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
       ASSERT_OK(iterator->status());
     }
 
-    // The same queries without recreating iterator
+    ub = "c1";
     {
-      ub_str = "b9";
-      ub = Slice(ub_str);
-      ro.iterate_upper_bound = &ub;
-
       std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
       iterator->Seek("b1");
       ASSERT_FALSE(iterator->Valid());
-      ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+      EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
       ASSERT_OK(iterator->status());
+    }
 
-      ub_str = "z";
-      ub = Slice(ub_str);
+    // The same queries without recreating iterator
+    {
+      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+
+      ub = "b9";
+      iterator->Seek("b1");
+      ASSERT_FALSE(iterator->Valid());
+      EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
+      ASSERT_OK(iterator->status());
 
+      ub = "z";
       iterator->Seek("b1");
       ASSERT_TRUE(iterator->Valid());
       ASSERT_EQ("x1", iterator->key().ToString());
-      ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
-
-      ub_str = "c";
-      ub = Slice(ub_str);
+      EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
 
+      ub = "c";
       iterator->Seek("b1");
       ASSERT_FALSE(iterator->Valid());
-      ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+      EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
 
-      ub_str = "b9";
-      ub = Slice(ub_str);
-      ro.iterate_upper_bound = &ub;
+      ub = "b9";
       iterator->SeekForPrev("b1");
       ASSERT_TRUE(iterator->Valid());
       ASSERT_EQ("a1", iterator->key().ToString());
-      ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+      EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
 
-      ub_str = "zz";
-      ub = Slice(ub_str);
-      ro.iterate_upper_bound = &ub;
+      ub = "zz";
       iterator->SeekToLast();
       ASSERT_TRUE(iterator->Valid());
       ASSERT_EQ("y1", iterator->key().ToString());
@@ -6464,6 +6463,136 @@ TEST_F(DBTest2, AutoPrefixMode1) {
       ASSERT_TRUE(iterator->Valid());
       ASSERT_EQ("a1", iterator->key().ToString());
     }
+
+    // Similar, now with reverse comparator
+    // Technically, we are violating axiom 2 of prefix_extractors, but
+    // it should be revised because of major use-cases using
+    // ReverseBytewiseComparator with capped/fixed prefix Seek. (FIXME)
+    options.comparator = ReverseBytewiseComparator();
+    options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+
+    DestroyAndReopen(options);
+
+    ASSERT_OK(Put("a1", large_value));
+    ASSERT_OK(Put("x1", large_value));
+    ASSERT_OK(Put("y1", large_value));
+    ASSERT_OK(Flush());
+
+    {
+      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+
+      ub = "b1";
+      iterator->Seek("b9");
+      ASSERT_FALSE(iterator->Valid());
+      EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
+      ASSERT_OK(iterator->status());
+
+      ub = "b1";
+      iterator->Seek("z");
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("y1", iterator->key().ToString());
+      EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+
+      ub = "b1";
+      iterator->Seek("c");
+      ASSERT_FALSE(iterator->Valid());
+      EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+
+      ub = "b";
+      iterator->Seek("c9");
+      ASSERT_FALSE(iterator->Valid());
+      // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor
+      // is "correctly" implemented.
+      EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+
+      ub = "a";
+      iterator->Seek("b9");
+      // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor
+      // is "correctly" implemented.
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("a1", iterator->key().ToString());
+      EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+
+      ub = "b";
+      iterator->Seek("a");
+      ASSERT_FALSE(iterator->Valid());
+      // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor
+      // matches BytewiseComparator::IsSameLengthImmediateSuccessor. Upper
+      // comparing before seek key prevents a real bug from surfacing.
+      EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+
+      ub = "b1";
+      iterator->SeekForPrev("b9");
+      ASSERT_TRUE(iterator->Valid());
+      // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor
+      // is "correctly" implemented.
+      ASSERT_EQ("x1", iterator->key().ToString());
+      EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+
+      ub = "a";
+      iterator->SeekToLast();
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("a1", iterator->key().ToString());
+
+      iterator->SeekToFirst();
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("y1", iterator->key().ToString());
+    }
+
+    // Now something a bit different, related to "short" keys that
+    // auto_prefix_mode can omit. See "BUG" section of auto_prefix_mode.
+    options.comparator = BytewiseComparator();
+    for (const auto config : {"fixed:2", "capped:2"}) {
+      ASSERT_OK(SliceTransform::CreateFromString(ConfigOptions(), config,
+                                                 &options.prefix_extractor));
+
+      // FIXME: kHashSearch, etc. requires all keys be InDomain
+      if (StartsWith(config, "fixed") &&
+          (table_options.index_type == BlockBasedTableOptions::kHashSearch ||
+           StartsWith(options.memtable_factory->Name(), "Hash"))) {
+        continue;
+      }
+      DestroyAndReopen(options);
+
+      const char* a_end_stuff = "a\xffXYZ";
+      const char* b_begin_stuff = "b\x00XYZ";
+      ASSERT_OK(Put("a", large_value));
+      ASSERT_OK(Put("b", large_value));
+      ASSERT_OK(Put(Slice(b_begin_stuff, 3), large_value));
+      ASSERT_OK(Put("c", large_value));
+      ASSERT_OK(Flush());
+
+      // control showing valid optimization with auto_prefix mode
+      ub = Slice(a_end_stuff, 4);
+      ro.iterate_upper_bound = &ub;
+
+      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+      iterator->Seek(Slice(a_end_stuff, 2));
+      ASSERT_FALSE(iterator->Valid());
+      EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
+      ASSERT_OK(iterator->status());
+
+      // test, cannot be validly optimized with auto_prefix_mode
+      ub = Slice(b_begin_stuff, 2);
+      ro.iterate_upper_bound = &ub;
+
+      iterator->Seek(Slice(a_end_stuff, 2));
+      // !!! BUG !!! See "BUG" section of auto_prefix_mode.
+      ASSERT_FALSE(iterator->Valid());
+      EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
+      ASSERT_OK(iterator->status());
+
+      // To prove that is the wrong result, now use total order seek
+      ReadOptions tos_ro = ro;
+      tos_ro.total_order_seek = true;
+      tos_ro.auto_prefix_mode = false;
+      iterator.reset(db_->NewIterator(tos_ro));
+      iterator->Seek(Slice(a_end_stuff, 2));
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("b", iterator->key().ToString());
+      EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+      ASSERT_OK(iterator->status());
+    }
   } while (ChangeOptions(kSkipPlainTable));
 }
 
diff --git a/include/rocksdb/comparator.h b/include/rocksdb/comparator.h
index 4b1b61eb4..1215c819a 100644
--- a/include/rocksdb/comparator.h
+++ b/include/rocksdb/comparator.h
@@ -96,6 +96,10 @@ class Comparator : public Customizable {
   virtual const Comparator* GetRootComparator() const { return this; }
 
   // given two keys, determine if t is the successor of s
+  // BUG: only return true if no other keys starting with `t` are ordered
+  // before `t`. Otherwise, the auto_prefix_mode can omit entries within
+  // iterator bounds that have same prefix as upper bound but different
+  // prefix from seek key.
   virtual bool IsSameLengthImmediateSuccessor(const Slice& /*s*/,
                                               const Slice& /*t*/) const {
     return false;
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 8f54d063f..cc175dccc 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1530,6 +1530,17 @@ struct ReadOptions {
   // from total_order_seek, based on seek key, and iterator upper bound.
   // Not supported in ROCKSDB_LITE mode, in the way that even with value true
   // prefix mode is not used.
+  // BUG: Using Comparator::IsSameLengthImmediateSuccessor and
+  // SliceTransform::FullLengthEnabled to enable prefix mode in cases where
+  // prefix of upper bound differs from prefix of seek key has a flaw.
+  // If present in the DB, "short keys" (shorter than "full length" prefix)
+  // can be omitted from auto_prefix_mode iteration when they would be present
+  // in total_order_seek iteration, regardless of whether the short keys are
+  // "in domain" of the prefix extractor. This is not an issue if no short
+  // keys are added to DB or are not expected to be returned by such
+  // iterators. (We are also assuming the new condition on
+  // IsSameLengthImmediateSuccessor is satisfied; see its BUG section).
+  // A bug example is in DBTest2::AutoPrefixMode1, search for "BUG".
   // Default: false
   bool auto_prefix_mode;
 
diff --git a/include/rocksdb/slice_transform.h b/include/rocksdb/slice_transform.h
index 27bc5280b..8909b9c53 100644
--- a/include/rocksdb/slice_transform.h
+++ b/include/rocksdb/slice_transform.h
@@ -85,7 +85,10 @@ class SliceTransform : public Customizable {
   // Otherwise (including FullLengthEnabled returns false, or prefix length is
   // less than maximum), Seek with auto_prefix_mode is only optimized if the
   // iterate_upper_bound and seek key have the same prefix.
-  // NOTE/TODO: We hope to revise this requirement in the future.
+  // BUG: Despite all these conditions and even with the extra condition on
+  // IsSameLengthImmediateSuccessor (see it's "BUG" section), it is not
+  // sufficient to ensure auto_prefix_mode returns all entries that
+  // total_order_seek would return. See auto_prefix_mode "BUG" section.
   virtual bool FullLengthEnabled(size_t* /*len*/) const { return false; }
 
   // Transform(s)=Transform(`prefix`) for any s with `prefix` as a prefix.
diff --git a/util/comparator.cc b/util/comparator.cc
index d04031e39..72584de43 100644
--- a/util/comparator.cc
+++ b/util/comparator.cc
@@ -209,6 +209,16 @@ class ReverseBytewiseComparatorImpl : public BytewiseComparatorImpl {
     // Don't do anything for simplicity.
   }
 
+  bool IsSameLengthImmediateSuccessor(const Slice& s,
+                                      const Slice& t) const override {
+    // Always returning false to prevent surfacing design flaws in
+    // auto_prefix_mode
+    (void)s, (void)t;
+    return false;
+    // "Correct" implementation:
+    // return BytewiseComparatorImpl::IsSameLengthImmediateSuccessor(t, s);
+  }
+
   bool CanKeysWithDifferentByteContentsBeEqual() const override {
     return false;
   }