diff --git a/HISTORY.md b/HISTORY.md index 070a67b4d..736882d6c 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -22,6 +22,8 @@ * `rocksdb_comparator_with_ts_create` to create timestamp aware comparator * Put, Get, Delete, SingleDelete, MultiGet APIs has corresponding timestamp aware APIs with suffix `with_ts` * And Add C API's for Transaction, SstFileWriter, Compaction as mentioned [here](https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-(Experimental)) +* The contract for implementations of Comparator::IsSameLengthImmediateSuccessor has been updated to work around a design bug in `auto_prefix_mode`. +* The API documentation for `auto_prefix_mode` now notes some corner cases in which it returns different results than `total_order_seek`, due to design bugs that are not easily fixed. Users using built-in comparators and keys at least the size of a fixed prefix length are not affected. ### New Features * Add FileSystem::ReadAsync API in io_tracing diff --git a/db/comparator_db_test.cc b/db/comparator_db_test.cc index c1473a0fe..34a4776f0 100644 --- a/db/comparator_db_test.cc +++ b/db/comparator_db_test.cc @@ -449,67 +449,85 @@ TEST_P(ComparatorDBTest, TwoStrComparator) { } } +namespace { +void VerifyNotSuccessor(const Slice& s, const Slice& t) { + auto bc = BytewiseComparator(); + auto rbc = ReverseBytewiseComparator(); + ASSERT_FALSE(bc->IsSameLengthImmediateSuccessor(s, t)); + ASSERT_FALSE(rbc->IsSameLengthImmediateSuccessor(s, t)); + ASSERT_FALSE(bc->IsSameLengthImmediateSuccessor(t, s)); + ASSERT_FALSE(rbc->IsSameLengthImmediateSuccessor(t, s)); +} + +void VerifySuccessor(const Slice& s, const Slice& t) { + auto bc = BytewiseComparator(); + auto rbc = ReverseBytewiseComparator(); + ASSERT_TRUE(bc->IsSameLengthImmediateSuccessor(s, t)); + ASSERT_FALSE(rbc->IsSameLengthImmediateSuccessor(s, t)); + ASSERT_FALSE(bc->IsSameLengthImmediateSuccessor(t, s)); + // Should be true but that increases exposure to a design bug in + // auto_prefix_mode, so currently set to FALSE + ASSERT_FALSE(rbc->IsSameLengthImmediateSuccessor(t, s)); +} + +} // namespace + TEST_P(ComparatorDBTest, IsSameLengthImmediateSuccessor) { { // different length Slice s("abcxy"); Slice t("abcxyz"); - ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + VerifyNotSuccessor(s, t); } { Slice s("abcxyz"); Slice t("abcxy"); - ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + VerifyNotSuccessor(s, t); } { // not last byte different Slice s("abc1xyz"); Slice t("abc2xyz"); - ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + VerifyNotSuccessor(s, t); } { // same string Slice s("abcxyz"); Slice t("abcxyz"); - ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + VerifyNotSuccessor(s, t); } { Slice s("abcxy"); Slice t("abcxz"); - ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); - } - { - Slice s("abcxz"); - Slice t("abcxy"); - ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + VerifySuccessor(s, t); } { const char s_array[] = "\x50\x8a\xac"; const char t_array[] = "\x50\x8a\xad"; Slice s(s_array); Slice t(t_array); - ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + VerifySuccessor(s, t); } { const char s_array[] = "\x50\x8a\xff"; const char t_array[] = "\x50\x8b\x00"; Slice s(s_array, 3); Slice t(t_array, 3); - ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + VerifySuccessor(s, t); } { const char s_array[] = "\x50\x8a\xff\xff"; const char t_array[] = "\x50\x8b\x00\x00"; Slice s(s_array, 4); Slice t(t_array, 4); - ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + VerifySuccessor(s, t); } { const char s_array[] = "\x50\x8a\xff\xff"; const char t_array[] = "\x50\x8b\x00\x01"; Slice s(s_array, 4); Slice t(t_array, 4); - ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + VerifyNotSuccessor(s, t); } } diff --git a/db/db_test2.cc b/db/db_test2.cc index 493e01f35..22d2fbcf9 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -6376,86 +6376,85 @@ TEST_F(DBTest2, AutoPrefixMode1) { ReadOptions ro; ro.total_order_seek = false; ro.auto_prefix_mode = true; + + const auto stat = BLOOM_FILTER_PREFIX_CHECKED; { std::unique_ptr iterator(db_->NewIterator(ro)); iterator->Seek("b1"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("x1", iterator->key().ToString()); - ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat)); ASSERT_OK(iterator->status()); } - std::string ub_str = "b9"; - Slice ub(ub_str); + Slice ub; ro.iterate_upper_bound = &ub; + ub = "b9"; { std::unique_ptr iterator(db_->NewIterator(ro)); iterator->Seek("b1"); ASSERT_FALSE(iterator->Valid()); - ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat)); ASSERT_OK(iterator->status()); } - ub_str = "z"; - ub = Slice(ub_str); + ub = "z"; { std::unique_ptr iterator(db_->NewIterator(ro)); iterator->Seek("b1"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("x1", iterator->key().ToString()); - ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat)); ASSERT_OK(iterator->status()); } - ub_str = "c"; - ub = Slice(ub_str); + ub = "c"; { std::unique_ptr iterator(db_->NewIterator(ro)); iterator->Seek("b1"); ASSERT_FALSE(iterator->Valid()); - ASSERT_EQ(2, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat)); ASSERT_OK(iterator->status()); } - // The same queries without recreating iterator + ub = "c1"; { - ub_str = "b9"; - ub = Slice(ub_str); - ro.iterate_upper_bound = &ub; - std::unique_ptr iterator(db_->NewIterator(ro)); iterator->Seek("b1"); ASSERT_FALSE(iterator->Valid()); - ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat)); ASSERT_OK(iterator->status()); + } - ub_str = "z"; - ub = Slice(ub_str); + // The same queries without recreating iterator + { + std::unique_ptr iterator(db_->NewIterator(ro)); + + ub = "b9"; + iterator->Seek("b1"); + ASSERT_FALSE(iterator->Valid()); + EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat)); + ASSERT_OK(iterator->status()); + ub = "z"; iterator->Seek("b1"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("x1", iterator->key().ToString()); - ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); - - ub_str = "c"; - ub = Slice(ub_str); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat)); + ub = "c"; iterator->Seek("b1"); ASSERT_FALSE(iterator->Valid()); - ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat)); - ub_str = "b9"; - ub = Slice(ub_str); - ro.iterate_upper_bound = &ub; + ub = "b9"; iterator->SeekForPrev("b1"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("a1", iterator->key().ToString()); - ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat)); - ub_str = "zz"; - ub = Slice(ub_str); - ro.iterate_upper_bound = &ub; + ub = "zz"; iterator->SeekToLast(); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("y1", iterator->key().ToString()); @@ -6464,6 +6463,136 @@ TEST_F(DBTest2, AutoPrefixMode1) { ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("a1", iterator->key().ToString()); } + + // Similar, now with reverse comparator + // Technically, we are violating axiom 2 of prefix_extractors, but + // it should be revised because of major use-cases using + // ReverseBytewiseComparator with capped/fixed prefix Seek. (FIXME) + options.comparator = ReverseBytewiseComparator(); + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + + DestroyAndReopen(options); + + ASSERT_OK(Put("a1", large_value)); + ASSERT_OK(Put("x1", large_value)); + ASSERT_OK(Put("y1", large_value)); + ASSERT_OK(Flush()); + + { + std::unique_ptr iterator(db_->NewIterator(ro)); + + ub = "b1"; + iterator->Seek("b9"); + ASSERT_FALSE(iterator->Valid()); + EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat)); + ASSERT_OK(iterator->status()); + + ub = "b1"; + iterator->Seek("z"); + ASSERT_TRUE(iterator->Valid()); + ASSERT_EQ("y1", iterator->key().ToString()); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat)); + + ub = "b1"; + iterator->Seek("c"); + ASSERT_FALSE(iterator->Valid()); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat)); + + ub = "b"; + iterator->Seek("c9"); + ASSERT_FALSE(iterator->Valid()); + // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor + // is "correctly" implemented. + EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat)); + + ub = "a"; + iterator->Seek("b9"); + // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor + // is "correctly" implemented. + ASSERT_TRUE(iterator->Valid()); + ASSERT_EQ("a1", iterator->key().ToString()); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat)); + + ub = "b"; + iterator->Seek("a"); + ASSERT_FALSE(iterator->Valid()); + // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor + // matches BytewiseComparator::IsSameLengthImmediateSuccessor. Upper + // comparing before seek key prevents a real bug from surfacing. + EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat)); + + ub = "b1"; + iterator->SeekForPrev("b9"); + ASSERT_TRUE(iterator->Valid()); + // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor + // is "correctly" implemented. + ASSERT_EQ("x1", iterator->key().ToString()); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat)); + + ub = "a"; + iterator->SeekToLast(); + ASSERT_TRUE(iterator->Valid()); + ASSERT_EQ("a1", iterator->key().ToString()); + + iterator->SeekToFirst(); + ASSERT_TRUE(iterator->Valid()); + ASSERT_EQ("y1", iterator->key().ToString()); + } + + // Now something a bit different, related to "short" keys that + // auto_prefix_mode can omit. See "BUG" section of auto_prefix_mode. + options.comparator = BytewiseComparator(); + for (const auto config : {"fixed:2", "capped:2"}) { + ASSERT_OK(SliceTransform::CreateFromString(ConfigOptions(), config, + &options.prefix_extractor)); + + // FIXME: kHashSearch, etc. requires all keys be InDomain + if (StartsWith(config, "fixed") && + (table_options.index_type == BlockBasedTableOptions::kHashSearch || + StartsWith(options.memtable_factory->Name(), "Hash"))) { + continue; + } + DestroyAndReopen(options); + + const char* a_end_stuff = "a\xffXYZ"; + const char* b_begin_stuff = "b\x00XYZ"; + ASSERT_OK(Put("a", large_value)); + ASSERT_OK(Put("b", large_value)); + ASSERT_OK(Put(Slice(b_begin_stuff, 3), large_value)); + ASSERT_OK(Put("c", large_value)); + ASSERT_OK(Flush()); + + // control showing valid optimization with auto_prefix mode + ub = Slice(a_end_stuff, 4); + ro.iterate_upper_bound = &ub; + + std::unique_ptr iterator(db_->NewIterator(ro)); + iterator->Seek(Slice(a_end_stuff, 2)); + ASSERT_FALSE(iterator->Valid()); + EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat)); + ASSERT_OK(iterator->status()); + + // test, cannot be validly optimized with auto_prefix_mode + ub = Slice(b_begin_stuff, 2); + ro.iterate_upper_bound = &ub; + + iterator->Seek(Slice(a_end_stuff, 2)); + // !!! BUG !!! See "BUG" section of auto_prefix_mode. + ASSERT_FALSE(iterator->Valid()); + EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat)); + ASSERT_OK(iterator->status()); + + // To prove that is the wrong result, now use total order seek + ReadOptions tos_ro = ro; + tos_ro.total_order_seek = true; + tos_ro.auto_prefix_mode = false; + iterator.reset(db_->NewIterator(tos_ro)); + iterator->Seek(Slice(a_end_stuff, 2)); + ASSERT_TRUE(iterator->Valid()); + ASSERT_EQ("b", iterator->key().ToString()); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat)); + ASSERT_OK(iterator->status()); + } } while (ChangeOptions(kSkipPlainTable)); } diff --git a/include/rocksdb/comparator.h b/include/rocksdb/comparator.h index 4b1b61eb4..1215c819a 100644 --- a/include/rocksdb/comparator.h +++ b/include/rocksdb/comparator.h @@ -96,6 +96,10 @@ class Comparator : public Customizable { virtual const Comparator* GetRootComparator() const { return this; } // given two keys, determine if t is the successor of s + // BUG: only return true if no other keys starting with `t` are ordered + // before `t`. Otherwise, the auto_prefix_mode can omit entries within + // iterator bounds that have same prefix as upper bound but different + // prefix from seek key. virtual bool IsSameLengthImmediateSuccessor(const Slice& /*s*/, const Slice& /*t*/) const { return false; diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 8f54d063f..cc175dccc 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1530,6 +1530,17 @@ struct ReadOptions { // from total_order_seek, based on seek key, and iterator upper bound. // Not supported in ROCKSDB_LITE mode, in the way that even with value true // prefix mode is not used. + // BUG: Using Comparator::IsSameLengthImmediateSuccessor and + // SliceTransform::FullLengthEnabled to enable prefix mode in cases where + // prefix of upper bound differs from prefix of seek key has a flaw. + // If present in the DB, "short keys" (shorter than "full length" prefix) + // can be omitted from auto_prefix_mode iteration when they would be present + // in total_order_seek iteration, regardless of whether the short keys are + // "in domain" of the prefix extractor. This is not an issue if no short + // keys are added to DB or are not expected to be returned by such + // iterators. (We are also assuming the new condition on + // IsSameLengthImmediateSuccessor is satisfied; see its BUG section). + // A bug example is in DBTest2::AutoPrefixMode1, search for "BUG". // Default: false bool auto_prefix_mode; diff --git a/include/rocksdb/slice_transform.h b/include/rocksdb/slice_transform.h index 27bc5280b..8909b9c53 100644 --- a/include/rocksdb/slice_transform.h +++ b/include/rocksdb/slice_transform.h @@ -85,7 +85,10 @@ class SliceTransform : public Customizable { // Otherwise (including FullLengthEnabled returns false, or prefix length is // less than maximum), Seek with auto_prefix_mode is only optimized if the // iterate_upper_bound and seek key have the same prefix. - // NOTE/TODO: We hope to revise this requirement in the future. + // BUG: Despite all these conditions and even with the extra condition on + // IsSameLengthImmediateSuccessor (see it's "BUG" section), it is not + // sufficient to ensure auto_prefix_mode returns all entries that + // total_order_seek would return. See auto_prefix_mode "BUG" section. virtual bool FullLengthEnabled(size_t* /*len*/) const { return false; } // Transform(s)=Transform(`prefix`) for any s with `prefix` as a prefix. diff --git a/util/comparator.cc b/util/comparator.cc index d04031e39..72584de43 100644 --- a/util/comparator.cc +++ b/util/comparator.cc @@ -209,6 +209,16 @@ class ReverseBytewiseComparatorImpl : public BytewiseComparatorImpl { // Don't do anything for simplicity. } + bool IsSameLengthImmediateSuccessor(const Slice& s, + const Slice& t) const override { + // Always returning false to prevent surfacing design flaws in + // auto_prefix_mode + (void)s, (void)t; + return false; + // "Correct" implementation: + // return BytewiseComparatorImpl::IsSameLengthImmediateSuccessor(t, s); + } + bool CanKeysWithDifferentByteContentsBeEqual() const override { return false; }