From ad135f3ffdf9a932000d289101867a43f101d5e4 Mon Sep 17 00:00:00 2001 From: Peter Dillinger Date: Mon, 13 Jun 2022 11:08:50 -0700 Subject: [PATCH] Document design/specification bugs with auto_prefix_mode (#10144) Summary: auto_prefix_mode is designed to use prefix filtering in a particular "safe" set of cases where the upper bound and the seek key have different prefixes: where the upper bound is the "same length immediate successor". These conditions are not sufficient to guarantee the same iteration results as total_order_seek if the DB contains "short" keys, less than the "full" (maximum) prefix length. We are not simply disabling the optimization in these successor cases because it is likely that users are essentially getting what they want out of existing usage. Especially if users are constructing successor bounds with the intention of doing a prefix-bounded seek, the existing behavior is more expected than the total_order_seek behavior. Consequently, for now we reconcile the bad specification of behavior by documenting the existing mismatch with total_order_seek. A closely related issue affects hypothetical comparators like ReverseBytewiseComparator: if they "correctly" implement IsSameLengthImmediateSuccessor, auto_prefix_mode could omit more entries (other than "short" keys noted above). Luckily, the built-in ReverseBytewiseComparator has an "incorrect" implementation of IsSameLengthImmediateSuccessor that effectively prevents prefix optimization and, thus, the bug. This is now documented as a new constraint on IsSameLengthImmediateSuccessor, and the implementation tweaked to be simply "safe" rather than "incorrect". This change also includes unit test updates to demonstrate the above issues. (Test was cleaned up for readability and simplicity.) Intended follow-up: * Tweak documented axioms for prefix_extractor (more details then) * Consider some sort of fix for this case. I don't know what that would look like without breaking the performance of existing code. Perhaps if all keys in an SST file have prefixes that are "full length," we can track that fact and use it to allow optimization with the "same length immediate successor", but that would only apply to new files. * Consider a better system of specifying prefix bounds Pull Request resolved: https://github.com/facebook/rocksdb/pull/10144 Test Plan: test updates included Reviewed By: siying Differential Revision: D37052710 Pulled By: pdillinger fbshipit-source-id: 5f63b7d65f3f214e4b143e0f9aa1749527c587db --- HISTORY.md | 2 + db/comparator_db_test.cc | 46 +++++--- db/db_test2.cc | 189 +++++++++++++++++++++++++----- include/rocksdb/comparator.h | 4 + include/rocksdb/options.h | 11 ++ include/rocksdb/slice_transform.h | 5 +- util/comparator.cc | 10 ++ 7 files changed, 222 insertions(+), 45 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 070a67b4d..736882d6c 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -22,6 +22,8 @@ * `rocksdb_comparator_with_ts_create` to create timestamp aware comparator * Put, Get, Delete, SingleDelete, MultiGet APIs has corresponding timestamp aware APIs with suffix `with_ts` * And Add C API's for Transaction, SstFileWriter, Compaction as mentioned [here](https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-(Experimental)) +* The contract for implementations of Comparator::IsSameLengthImmediateSuccessor has been updated to work around a design bug in `auto_prefix_mode`. +* The API documentation for `auto_prefix_mode` now notes some corner cases in which it returns different results than `total_order_seek`, due to design bugs that are not easily fixed. Users using built-in comparators and keys at least the size of a fixed prefix length are not affected. ### New Features * Add FileSystem::ReadAsync API in io_tracing diff --git a/db/comparator_db_test.cc b/db/comparator_db_test.cc index c1473a0fe..34a4776f0 100644 --- a/db/comparator_db_test.cc +++ b/db/comparator_db_test.cc @@ -449,67 +449,85 @@ TEST_P(ComparatorDBTest, TwoStrComparator) { } } +namespace { +void VerifyNotSuccessor(const Slice& s, const Slice& t) { + auto bc = BytewiseComparator(); + auto rbc = ReverseBytewiseComparator(); + ASSERT_FALSE(bc->IsSameLengthImmediateSuccessor(s, t)); + ASSERT_FALSE(rbc->IsSameLengthImmediateSuccessor(s, t)); + ASSERT_FALSE(bc->IsSameLengthImmediateSuccessor(t, s)); + ASSERT_FALSE(rbc->IsSameLengthImmediateSuccessor(t, s)); +} + +void VerifySuccessor(const Slice& s, const Slice& t) { + auto bc = BytewiseComparator(); + auto rbc = ReverseBytewiseComparator(); + ASSERT_TRUE(bc->IsSameLengthImmediateSuccessor(s, t)); + ASSERT_FALSE(rbc->IsSameLengthImmediateSuccessor(s, t)); + ASSERT_FALSE(bc->IsSameLengthImmediateSuccessor(t, s)); + // Should be true but that increases exposure to a design bug in + // auto_prefix_mode, so currently set to FALSE + ASSERT_FALSE(rbc->IsSameLengthImmediateSuccessor(t, s)); +} + +} // namespace + TEST_P(ComparatorDBTest, IsSameLengthImmediateSuccessor) { { // different length Slice s("abcxy"); Slice t("abcxyz"); - ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + VerifyNotSuccessor(s, t); } { Slice s("abcxyz"); Slice t("abcxy"); - ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + VerifyNotSuccessor(s, t); } { // not last byte different Slice s("abc1xyz"); Slice t("abc2xyz"); - ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + VerifyNotSuccessor(s, t); } { // same string Slice s("abcxyz"); Slice t("abcxyz"); - ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + VerifyNotSuccessor(s, t); } { Slice s("abcxy"); Slice t("abcxz"); - ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); - } - { - Slice s("abcxz"); - Slice t("abcxy"); - ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + VerifySuccessor(s, t); } { const char s_array[] = "\x50\x8a\xac"; const char t_array[] = "\x50\x8a\xad"; Slice s(s_array); Slice t(t_array); - ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + VerifySuccessor(s, t); } { const char s_array[] = "\x50\x8a\xff"; const char t_array[] = "\x50\x8b\x00"; Slice s(s_array, 3); Slice t(t_array, 3); - ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + VerifySuccessor(s, t); } { const char s_array[] = "\x50\x8a\xff\xff"; const char t_array[] = "\x50\x8b\x00\x00"; Slice s(s_array, 4); Slice t(t_array, 4); - ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + VerifySuccessor(s, t); } { const char s_array[] = "\x50\x8a\xff\xff"; const char t_array[] = "\x50\x8b\x00\x01"; Slice s(s_array, 4); Slice t(t_array, 4); - ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + VerifyNotSuccessor(s, t); } } diff --git a/db/db_test2.cc b/db/db_test2.cc index 493e01f35..22d2fbcf9 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -6376,86 +6376,85 @@ TEST_F(DBTest2, AutoPrefixMode1) { ReadOptions ro; ro.total_order_seek = false; ro.auto_prefix_mode = true; + + const auto stat = BLOOM_FILTER_PREFIX_CHECKED; { std::unique_ptr iterator(db_->NewIterator(ro)); iterator->Seek("b1"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("x1", iterator->key().ToString()); - ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat)); ASSERT_OK(iterator->status()); } - std::string ub_str = "b9"; - Slice ub(ub_str); + Slice ub; ro.iterate_upper_bound = &ub; + ub = "b9"; { std::unique_ptr iterator(db_->NewIterator(ro)); iterator->Seek("b1"); ASSERT_FALSE(iterator->Valid()); - ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat)); ASSERT_OK(iterator->status()); } - ub_str = "z"; - ub = Slice(ub_str); + ub = "z"; { std::unique_ptr iterator(db_->NewIterator(ro)); iterator->Seek("b1"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("x1", iterator->key().ToString()); - ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat)); ASSERT_OK(iterator->status()); } - ub_str = "c"; - ub = Slice(ub_str); + ub = "c"; { std::unique_ptr iterator(db_->NewIterator(ro)); iterator->Seek("b1"); ASSERT_FALSE(iterator->Valid()); - ASSERT_EQ(2, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat)); ASSERT_OK(iterator->status()); } - // The same queries without recreating iterator + ub = "c1"; { - ub_str = "b9"; - ub = Slice(ub_str); - ro.iterate_upper_bound = &ub; - std::unique_ptr iterator(db_->NewIterator(ro)); iterator->Seek("b1"); ASSERT_FALSE(iterator->Valid()); - ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat)); ASSERT_OK(iterator->status()); + } - ub_str = "z"; - ub = Slice(ub_str); + // The same queries without recreating iterator + { + std::unique_ptr iterator(db_->NewIterator(ro)); + + ub = "b9"; + iterator->Seek("b1"); + ASSERT_FALSE(iterator->Valid()); + EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat)); + ASSERT_OK(iterator->status()); + ub = "z"; iterator->Seek("b1"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("x1", iterator->key().ToString()); - ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); - - ub_str = "c"; - ub = Slice(ub_str); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat)); + ub = "c"; iterator->Seek("b1"); ASSERT_FALSE(iterator->Valid()); - ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat)); - ub_str = "b9"; - ub = Slice(ub_str); - ro.iterate_upper_bound = &ub; + ub = "b9"; iterator->SeekForPrev("b1"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("a1", iterator->key().ToString()); - ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat)); - ub_str = "zz"; - ub = Slice(ub_str); - ro.iterate_upper_bound = &ub; + ub = "zz"; iterator->SeekToLast(); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("y1", iterator->key().ToString()); @@ -6464,6 +6463,136 @@ TEST_F(DBTest2, AutoPrefixMode1) { ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("a1", iterator->key().ToString()); } + + // Similar, now with reverse comparator + // Technically, we are violating axiom 2 of prefix_extractors, but + // it should be revised because of major use-cases using + // ReverseBytewiseComparator with capped/fixed prefix Seek. (FIXME) + options.comparator = ReverseBytewiseComparator(); + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + + DestroyAndReopen(options); + + ASSERT_OK(Put("a1", large_value)); + ASSERT_OK(Put("x1", large_value)); + ASSERT_OK(Put("y1", large_value)); + ASSERT_OK(Flush()); + + { + std::unique_ptr iterator(db_->NewIterator(ro)); + + ub = "b1"; + iterator->Seek("b9"); + ASSERT_FALSE(iterator->Valid()); + EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat)); + ASSERT_OK(iterator->status()); + + ub = "b1"; + iterator->Seek("z"); + ASSERT_TRUE(iterator->Valid()); + ASSERT_EQ("y1", iterator->key().ToString()); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat)); + + ub = "b1"; + iterator->Seek("c"); + ASSERT_FALSE(iterator->Valid()); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat)); + + ub = "b"; + iterator->Seek("c9"); + ASSERT_FALSE(iterator->Valid()); + // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor + // is "correctly" implemented. + EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat)); + + ub = "a"; + iterator->Seek("b9"); + // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor + // is "correctly" implemented. + ASSERT_TRUE(iterator->Valid()); + ASSERT_EQ("a1", iterator->key().ToString()); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat)); + + ub = "b"; + iterator->Seek("a"); + ASSERT_FALSE(iterator->Valid()); + // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor + // matches BytewiseComparator::IsSameLengthImmediateSuccessor. Upper + // comparing before seek key prevents a real bug from surfacing. + EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat)); + + ub = "b1"; + iterator->SeekForPrev("b9"); + ASSERT_TRUE(iterator->Valid()); + // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor + // is "correctly" implemented. + ASSERT_EQ("x1", iterator->key().ToString()); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat)); + + ub = "a"; + iterator->SeekToLast(); + ASSERT_TRUE(iterator->Valid()); + ASSERT_EQ("a1", iterator->key().ToString()); + + iterator->SeekToFirst(); + ASSERT_TRUE(iterator->Valid()); + ASSERT_EQ("y1", iterator->key().ToString()); + } + + // Now something a bit different, related to "short" keys that + // auto_prefix_mode can omit. See "BUG" section of auto_prefix_mode. + options.comparator = BytewiseComparator(); + for (const auto config : {"fixed:2", "capped:2"}) { + ASSERT_OK(SliceTransform::CreateFromString(ConfigOptions(), config, + &options.prefix_extractor)); + + // FIXME: kHashSearch, etc. requires all keys be InDomain + if (StartsWith(config, "fixed") && + (table_options.index_type == BlockBasedTableOptions::kHashSearch || + StartsWith(options.memtable_factory->Name(), "Hash"))) { + continue; + } + DestroyAndReopen(options); + + const char* a_end_stuff = "a\xffXYZ"; + const char* b_begin_stuff = "b\x00XYZ"; + ASSERT_OK(Put("a", large_value)); + ASSERT_OK(Put("b", large_value)); + ASSERT_OK(Put(Slice(b_begin_stuff, 3), large_value)); + ASSERT_OK(Put("c", large_value)); + ASSERT_OK(Flush()); + + // control showing valid optimization with auto_prefix mode + ub = Slice(a_end_stuff, 4); + ro.iterate_upper_bound = &ub; + + std::unique_ptr iterator(db_->NewIterator(ro)); + iterator->Seek(Slice(a_end_stuff, 2)); + ASSERT_FALSE(iterator->Valid()); + EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat)); + ASSERT_OK(iterator->status()); + + // test, cannot be validly optimized with auto_prefix_mode + ub = Slice(b_begin_stuff, 2); + ro.iterate_upper_bound = &ub; + + iterator->Seek(Slice(a_end_stuff, 2)); + // !!! BUG !!! See "BUG" section of auto_prefix_mode. + ASSERT_FALSE(iterator->Valid()); + EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat)); + ASSERT_OK(iterator->status()); + + // To prove that is the wrong result, now use total order seek + ReadOptions tos_ro = ro; + tos_ro.total_order_seek = true; + tos_ro.auto_prefix_mode = false; + iterator.reset(db_->NewIterator(tos_ro)); + iterator->Seek(Slice(a_end_stuff, 2)); + ASSERT_TRUE(iterator->Valid()); + ASSERT_EQ("b", iterator->key().ToString()); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat)); + ASSERT_OK(iterator->status()); + } } while (ChangeOptions(kSkipPlainTable)); } diff --git a/include/rocksdb/comparator.h b/include/rocksdb/comparator.h index 4b1b61eb4..1215c819a 100644 --- a/include/rocksdb/comparator.h +++ b/include/rocksdb/comparator.h @@ -96,6 +96,10 @@ class Comparator : public Customizable { virtual const Comparator* GetRootComparator() const { return this; } // given two keys, determine if t is the successor of s + // BUG: only return true if no other keys starting with `t` are ordered + // before `t`. Otherwise, the auto_prefix_mode can omit entries within + // iterator bounds that have same prefix as upper bound but different + // prefix from seek key. virtual bool IsSameLengthImmediateSuccessor(const Slice& /*s*/, const Slice& /*t*/) const { return false; diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 8f54d063f..cc175dccc 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1530,6 +1530,17 @@ struct ReadOptions { // from total_order_seek, based on seek key, and iterator upper bound. // Not supported in ROCKSDB_LITE mode, in the way that even with value true // prefix mode is not used. + // BUG: Using Comparator::IsSameLengthImmediateSuccessor and + // SliceTransform::FullLengthEnabled to enable prefix mode in cases where + // prefix of upper bound differs from prefix of seek key has a flaw. + // If present in the DB, "short keys" (shorter than "full length" prefix) + // can be omitted from auto_prefix_mode iteration when they would be present + // in total_order_seek iteration, regardless of whether the short keys are + // "in domain" of the prefix extractor. This is not an issue if no short + // keys are added to DB or are not expected to be returned by such + // iterators. (We are also assuming the new condition on + // IsSameLengthImmediateSuccessor is satisfied; see its BUG section). + // A bug example is in DBTest2::AutoPrefixMode1, search for "BUG". // Default: false bool auto_prefix_mode; diff --git a/include/rocksdb/slice_transform.h b/include/rocksdb/slice_transform.h index 27bc5280b..8909b9c53 100644 --- a/include/rocksdb/slice_transform.h +++ b/include/rocksdb/slice_transform.h @@ -85,7 +85,10 @@ class SliceTransform : public Customizable { // Otherwise (including FullLengthEnabled returns false, or prefix length is // less than maximum), Seek with auto_prefix_mode is only optimized if the // iterate_upper_bound and seek key have the same prefix. - // NOTE/TODO: We hope to revise this requirement in the future. + // BUG: Despite all these conditions and even with the extra condition on + // IsSameLengthImmediateSuccessor (see it's "BUG" section), it is not + // sufficient to ensure auto_prefix_mode returns all entries that + // total_order_seek would return. See auto_prefix_mode "BUG" section. virtual bool FullLengthEnabled(size_t* /*len*/) const { return false; } // Transform(s)=Transform(`prefix`) for any s with `prefix` as a prefix. diff --git a/util/comparator.cc b/util/comparator.cc index d04031e39..72584de43 100644 --- a/util/comparator.cc +++ b/util/comparator.cc @@ -209,6 +209,16 @@ class ReverseBytewiseComparatorImpl : public BytewiseComparatorImpl { // Don't do anything for simplicity. } + bool IsSameLengthImmediateSuccessor(const Slice& s, + const Slice& t) const override { + // Always returning false to prevent surfacing design flaws in + // auto_prefix_mode + (void)s, (void)t; + return false; + // "Correct" implementation: + // return BytewiseComparatorImpl::IsSameLengthImmediateSuccessor(t, s); + } + bool CanKeysWithDifferentByteContentsBeEqual() const override { return false; }