From 408205a36b8518eef7837e123b255c79cafed8b0 Mon Sep 17 00:00:00 2001 From: Zhongyi Xie Date: Tue, 26 Jun 2018 15:56:26 -0700 Subject: [PATCH] use user_key and iterate_upper_bound to determine compatibility of bloom filters (#3899) Summary: Previously in https://github.com/facebook/rocksdb/pull/3601 bloom filter will only be checked if `prefix_extractor` in the mutable_cf_options matches the one found in the SST file. This PR relaxes the requirement by checking if all keys in the range [user_key, iterate_upper_bound) all share the same prefix after transforming using the BF in the SST file. If so, the bloom filter is considered compatible and will continue to be looked at. Closes https://github.com/facebook/rocksdb/pull/3899 Differential Revision: D8157459 Pulled By: miasantreble fbshipit-source-id: 18d17cba56a1005162f8d5db7a27aba277089c41 --- db/comparator_db_test.cc | 64 +++++ db/db_bloom_filter_test.cc | 407 ++++++++++++++++++++++++++++++ db/db_test.cc | 174 ------------- db/prefix_test.cc | 4 + include/rocksdb/comparator.h | 6 + include/rocksdb/options.h | 2 +- include/rocksdb/slice_transform.h | 5 + options/options_helper.cc | 2 +- options/options_helper.h | 4 + table/block_based_table_reader.cc | 121 +++++---- table/block_based_table_reader.h | 24 +- table/filter_block.h | 11 + table/full_filter_block.cc | 57 +++++ table/full_filter_block.h | 11 +- table/table_test.cc | 3 + util/comparator.cc | 26 ++ util/slice.cc | 10 + 17 files changed, 700 insertions(+), 231 deletions(-) diff --git a/db/comparator_db_test.cc b/db/comparator_db_test.cc index dc362b9c2..226451464 100644 --- a/db/comparator_db_test.cc +++ b/db/comparator_db_test.cc @@ -449,6 +449,70 @@ TEST_P(ComparatorDBTest, TwoStrComparator) { } } +TEST_P(ComparatorDBTest, IsSameLengthImmediateSuccessor) { + { + // different length + Slice s("abcxy"); + Slice t("abcxyz"); + ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } + { + Slice s("abcxyz"); + Slice t("abcxy"); + ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } + { + // not last byte different + Slice s("abc1xyz"); + Slice t("abc2xyz"); + ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } + { + // same string + Slice s("abcxyz"); + Slice t("abcxyz"); + ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } + { + Slice s("abcxy"); + Slice t("abcxz"); + ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } + { + Slice s("abcxz"); + Slice t("abcxy"); + ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } + { + const char s_array[] = "\x50\x8a\xac"; + const char t_array[] = "\x50\x8a\xad"; + Slice s(s_array); + Slice t(t_array); + ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } + { + const char s_array[] = "\x50\x8a\xff"; + const char t_array[] = "\x50\x8b\x00"; + Slice s(s_array, 3); + Slice t(t_array, 3); + ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } + { + const char s_array[] = "\x50\x8a\xff\xff"; + const char t_array[] = "\x50\x8b\x00\x00"; + Slice s(s_array, 4); + Slice t(t_array, 4); + ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } + { + const char s_array[] = "\x50\x8a\xff\xff"; + const char t_array[] = "\x50\x8b\x00\x01"; + Slice s(s_array, 4); + Slice t(t_array, 4); + ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } +} + TEST_P(ComparatorDBTest, FindShortestSeparator) { std::string s1 = "abc1xyz"; std::string s2 = "abc3xy"; diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc index d4b034c53..4a97f93d7 100644 --- a/db/db_bloom_filter_test.cc +++ b/db/db_bloom_filter_test.cc @@ -1097,6 +1097,413 @@ TEST_F(DBBloomFilterTest, OptimizeFiltersForHits) { TestGetTickerCount(options, BLOCK_CACHE_ADD)); } +int CountIter(std::unique_ptr& iter, const Slice& key) { + int count = 0; + for (iter->Seek(key); iter->Valid() && iter->status() == Status::OK(); + iter->Next()) { + count++; + } + return count; +} + +// use iterate_upper_bound to hint compatiability of existing bloom filters. +// The BF is considered compatible if 1) upper bound and seek key transform +// into the same string, or 2) the transformed seek key is of the same length +// as the upper bound and two keys are adjacent according to the comparator. +TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { + int iteration = 0; + for (bool use_block_based_builder : {true, false}) { + Options options; + options.create_if_missing = true; + options.prefix_extractor.reset(NewCappedPrefixTransform(4)); + options.disable_auto_compactions = true; + options.statistics = CreateDBStatistics(); + // Enable prefix bloom for SST files + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset( + NewBloomFilterPolicy(10, use_block_based_builder)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + ASSERT_OK(Put("abcdxxx0", "val1")); + ASSERT_OK(Put("abcdxxx1", "val2")); + ASSERT_OK(Put("abcdxxx2", "val3")); + ASSERT_OK(Put("abcdxxx3", "val4")); + dbfull()->Flush(FlushOptions()); + { + // prefix_extractor has not changed, BF will always be read + Slice upper_bound("abce"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "abcd0000"), 4); + } + { + Slice upper_bound("abcdzzzz"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "abcd0000"), 4); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:5"}})); + ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(), + "rocksdb.FixedPrefix.5")); + { + // BF changed, [abcdxx00, abce) is a valid bound, will trigger BF read + Slice upper_bound("abce"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "abcdxx00"), 4); + // should check bloom filter since upper bound meets requirement + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 2 + iteration); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + { + // [abcdxx01, abcey) is not valid bound since upper bound is too long for + // the BF in SST (capped:4) + Slice upper_bound("abcey"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "abcdxx01"), 4); + // should skip bloom filter since upper bound is too long + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 2 + iteration); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + { + // [abcdxx02, abcdy) is a valid bound since the prefix is the same + Slice upper_bound("abcdy"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "abcdxx02"), 4); + // should check bloom filter since upper bound matches transformed seek + // key + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 2 + iteration * 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + { + // [aaaaaaaa, abce) is not a valid bound since 1) they don't share the + // same prefix, 2) the prefixes are not consecutive + Slice upper_bound("abce"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "aaaaaaaa"), 0); + // should skip bloom filter since mismatch is found + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 2 + iteration * 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:3"}})); + { + // [abc, abd) is not a valid bound since the upper bound is too short + // for BF (capped:4) + Slice upper_bound("abd"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "abc"), 4); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 2 + iteration * 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:4"}})); + { + // set back to capped:4 and verify BF is always read + Slice upper_bound("abd"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "abc"), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 3 + iteration * 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1); + } + iteration++; + } +} + +// Create multiple SST files each with a different prefix_extractor config, +// verify iterators can read all SST files using the latest config. +TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { + int iteration = 0; + for (bool use_block_based_builder : {true, false}) { + Options options; + options.create_if_missing = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.disable_auto_compactions = true; + options.statistics = CreateDBStatistics(); + // Enable prefix bloom for SST files + BlockBasedTableOptions table_options; + table_options.filter_policy.reset( + NewBloomFilterPolicy(10, use_block_based_builder)); + table_options.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + Slice upper_bound("foz90000"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + + // first SST with fixed:1 BF + ASSERT_OK(Put("foo2", "bar2")); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("foq1", "bar1")); + ASSERT_OK(Put("fpa", "0")); + dbfull()->Flush(FlushOptions()); + std::unique_ptr iter_old(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter_old, "foo"), 4); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 1); + + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}})); + ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(), + "rocksdb.CappedPrefix.3")); + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "foo"), 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 1 + iteration); + ASSERT_EQ(CountIter(iter, "gpk"), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 1 + iteration); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + + // second SST with capped:3 BF + ASSERT_OK(Put("foo3", "bar3")); + ASSERT_OK(Put("foo4", "bar4")); + ASSERT_OK(Put("foq5", "bar5")); + ASSERT_OK(Put("fpb", "1")); + dbfull()->Flush(FlushOptions()); + { + // BF is cappped:3 now + std::unique_ptr iter_tmp(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter_tmp, "foo"), 4); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 2 + iteration * 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + ASSERT_EQ(CountIter(iter_tmp, "gpk"), 0); + // both counters are incremented because BF is "not changed" for 1 of the + // 2 SST files, so filter is checked once and found no match. + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 3 + iteration * 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1); + } + + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:2"}})); + ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(), + "rocksdb.FixedPrefix.2")); + // third SST with fixed:2 BF + ASSERT_OK(Put("foo6", "bar6")); + ASSERT_OK(Put("foo7", "bar7")); + ASSERT_OK(Put("foq8", "bar8")); + ASSERT_OK(Put("fpc", "2")); + dbfull()->Flush(FlushOptions()); + { + // BF is fixed:2 now + std::unique_ptr iter_tmp(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter_tmp, "foo"), 9); + // the first and last BF are checked + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 4 + iteration * 3); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1); + ASSERT_EQ(CountIter(iter_tmp, "gpk"), 0); + // only last BF is checked and not found + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 5 + iteration * 3); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2); + } + + // iter_old can only see the first SST, so checked plus 1 + ASSERT_EQ(CountIter(iter_old, "foo"), 4); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 6 + iteration * 3); + // iter was created after the first setoptions call so only full filter + // will check the filter + ASSERT_EQ(CountIter(iter, "foo"), 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 6 + iteration * 4); + + { + // keys in all three SSTs are visible to iterator + // The range of [foo, foz90000] is compatible with (fixed:1) and (fixed:2) + // so +2 for checked counter + std::unique_ptr iter_all(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter_all, "foo"), 9); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 7 + iteration * 5); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2); + ASSERT_EQ(CountIter(iter_all, "gpk"), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 8 + iteration * 5); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3); + } + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}})); + ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(), + "rocksdb.CappedPrefix.3")); + { + std::unique_ptr iter_all(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter_all, "foo"), 6); + // all three SST are checked because the current options has the same as + // the remaining SST (capped:3) + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 9 + iteration * 7); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3); + ASSERT_EQ(CountIter(iter_all, "gpk"), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 10 + iteration * 7); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 4); + } + // TODO(Zhongyi): Maybe also need to add Get calls to test point look up? + iteration++; + } +} + +// Create a new column family in a running DB, change prefix_extractor +// dynamically, verify the iterator created on the new column family behaves +// as expected +TEST_F(DBBloomFilterTest, DynamicBloomFilterNewColumnFamily) { + int iteration = 0; + for (bool use_block_based_builder : {true, false}) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.disable_auto_compactions = true; + options.statistics = CreateDBStatistics(); + // Enable prefix bloom for SST files + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset( + NewBloomFilterPolicy(10, use_block_based_builder)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + CreateAndReopenWithCF({"pikachu" + std::to_string(iteration)}, options); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + // create a new CF and set prefix_extractor dynamically + options.prefix_extractor.reset(NewCappedPrefixTransform(3)); + CreateColumnFamilies({"ramen_dojo_" + std::to_string(iteration)}, options); + ASSERT_EQ(0, + strcmp(dbfull()->GetOptions(handles_[2]).prefix_extractor->Name(), + "rocksdb.CappedPrefix.3")); + ASSERT_OK(Put(2, "foo3", "bar3")); + ASSERT_OK(Put(2, "foo4", "bar4")); + ASSERT_OK(Put(2, "foo5", "bar5")); + ASSERT_OK(Put(2, "foq6", "bar6")); + ASSERT_OK(Put(2, "fpq7", "bar7")); + dbfull()->Flush(FlushOptions()); + { + std::unique_ptr iter( + db_->NewIterator(read_options, handles_[2])); + ASSERT_EQ(CountIter(iter, "foo"), 3); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + ASSERT_OK( + dbfull()->SetOptions(handles_[2], {{"prefix_extractor", "fixed:2"}})); + ASSERT_EQ(0, + strcmp(dbfull()->GetOptions(handles_[2]).prefix_extractor->Name(), + "rocksdb.FixedPrefix.2")); + { + std::unique_ptr iter( + db_->NewIterator(read_options, handles_[2])); + ASSERT_EQ(CountIter(iter, "foo"), 4); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + ASSERT_OK(dbfull()->DropColumnFamily(handles_[2])); + dbfull()->DestroyColumnFamilyHandle(handles_[2]); + handles_[2] = nullptr; + ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); + dbfull()->DestroyColumnFamilyHandle(handles_[1]); + handles_[1] = nullptr; + iteration++; + } +} + +// Verify it's possible to change prefix_extractor at runtime and iterators +// behaves as expected +TEST_F(DBBloomFilterTest, DynamicBloomFilterOptions) { + int iteration = 0; + for (bool use_block_based_builder : {true, false}) { + Options options; + options.create_if_missing = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.disable_auto_compactions = true; + options.statistics = CreateDBStatistics(); + // Enable prefix bloom for SST files + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset( + NewBloomFilterPolicy(10, use_block_based_builder)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + ASSERT_OK(Put("foo2", "bar2")); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("foo1", "bar1")); + ASSERT_OK(Put("fpa", "0")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("foo3", "bar3")); + ASSERT_OK(Put("foo4", "bar4")); + ASSERT_OK(Put("foo5", "bar5")); + ASSERT_OK(Put("fpb", "1")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("foo6", "bar6")); + ASSERT_OK(Put("foo7", "bar7")); + ASSERT_OK(Put("foo8", "bar8")); + ASSERT_OK(Put("fpc", "2")); + dbfull()->Flush(FlushOptions()); + + ReadOptions read_options; + read_options.prefix_same_as_start = true; + { + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "foo"), 12); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 3); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + std::unique_ptr iter_old(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter_old, "foo"), 12); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 6); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}})); + ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(), + "rocksdb.CappedPrefix.3")); + { + std::unique_ptr iter(db_->NewIterator(read_options)); + // "fp*" should be skipped + ASSERT_EQ(CountIter(iter, "foo"), 9); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 6); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + + // iterator created before should not be affected and see all keys + ASSERT_EQ(CountIter(iter_old, "foo"), 12); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 9); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + ASSERT_EQ(CountIter(iter_old, "abc"), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 12); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3); + iteration++; + } +} + #endif // ROCKSDB_LITE } // namespace rocksdb diff --git a/db/db_test.cc b/db/db_test.cc index afff08673..346010bcd 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -4607,180 +4607,6 @@ TEST_F(DBTest, FileCreationRandomFailure) { } #ifndef ROCKSDB_LITE -int CountIter(Iterator* iter, const Slice& key) { - int count = 0; - for (iter->Seek(key); iter->Valid() && iter->status() == Status::OK(); - iter->Next()) { - count++; - } - return count; -} - -// Create multiple SST files each with a different prefix_extractor config, -// verify iterators can read all SST files using the latest config. -TEST_F(DBTest, DynamicBloomFilterMultipleSST) { - Options options; - options.create_if_missing = true; - options.prefix_extractor.reset(NewFixedPrefixTransform(1)); - options.disable_auto_compactions = true; - // Enable prefix bloom for SST files - BlockBasedTableOptions table_options; - table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - DestroyAndReopen(options); - - ReadOptions read_options; - read_options.prefix_same_as_start = true; - - // first SST with fixed:1 BF - ASSERT_OK(Put("foo2", "bar2")); - ASSERT_OK(Put("foo", "bar")); - ASSERT_OK(Put("foq1", "bar1")); - ASSERT_OK(Put("fpa", "0")); - dbfull()->Flush(FlushOptions()); - Iterator* iter_old = db_->NewIterator(read_options); - ASSERT_EQ(CountIter(iter_old, "foo"), 4); - - ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}})); - ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(), - "rocksdb.CappedPrefix.3")); - Iterator* iter = db_->NewIterator(read_options); - ASSERT_EQ(CountIter(iter, "foo"), 2); - - // second SST with capped:3 BF - ASSERT_OK(Put("foo3", "bar3")); - ASSERT_OK(Put("foo4", "bar4")); - ASSERT_OK(Put("foq5", "bar5")); - ASSERT_OK(Put("fpb", "1")); - dbfull()->Flush(FlushOptions()); - // BF is cappped:3 now - Iterator* iter_tmp = db_->NewIterator(read_options); - ASSERT_EQ(CountIter(iter_tmp, "foo"), 4); - delete iter_tmp; - - ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:2"}})); - ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(), - "rocksdb.FixedPrefix.2")); - // third SST with fixed:2 BF - ASSERT_OK(Put("foo6", "bar6")); - ASSERT_OK(Put("foo7", "bar7")); - ASSERT_OK(Put("foq8", "bar8")); - ASSERT_OK(Put("fpc", "2")); - dbfull()->Flush(FlushOptions()); - // BF is fixed:2 now - iter_tmp = db_->NewIterator(read_options); - ASSERT_EQ(CountIter(iter_tmp, "foo"), 9); - delete iter_tmp; - - // TODO(Zhongyi): verify existing iterator cannot see newly inserted keys - ASSERT_EQ(CountIter(iter_old, "foo"), 4); - ASSERT_EQ(CountIter(iter, "foo"), 2); - delete iter; - delete iter_old; - - // keys in all three SSTs are visible to iterator - Iterator* iter_all = db_->NewIterator(read_options); - ASSERT_EQ(CountIter(iter_all, "foo"), 9); - delete iter_all; - ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}})); - ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(), - "rocksdb.CappedPrefix.3")); - iter_all = db_->NewIterator(read_options); - ASSERT_EQ(CountIter(iter_all, "foo"), 6); - delete iter_all; - // TODO(Zhongyi): add test for cases where certain SST are skipped - // Also verify BF related counters like BLOOM_FILTER_USEFUL -} - -// Create a new column family in a running DB, change prefix_extractor -// dynamically, verify the iterator created on the new column family behaves -// as expected -TEST_F(DBTest, DynamicBloomFilterNewColumnFamily) { - Options options = CurrentOptions(); - options.create_if_missing = true; - options.prefix_extractor.reset(NewFixedPrefixTransform(1)); - options.disable_auto_compactions = true; - // Enable prefix bloom for SST files - BlockBasedTableOptions table_options; - table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - CreateAndReopenWithCF({"pikachu"}, options); - ReadOptions read_options; - read_options.prefix_same_as_start = true; - // create a new CF and set prefix_extractor dynamically - options.prefix_extractor.reset(NewCappedPrefixTransform(3)); - CreateColumnFamilies({"ramen_dojo"}, options); - ASSERT_EQ(0, - strcmp(dbfull()->GetOptions(handles_[2]).prefix_extractor->Name(), - "rocksdb.CappedPrefix.3")); - ASSERT_OK(Put(2, "foo3", "bar3")); - ASSERT_OK(Put(2, "foo4", "bar4")); - ASSERT_OK(Put(2, "foo5", "bar5")); - ASSERT_OK(Put(2, "foq6", "bar6")); - ASSERT_OK(Put(2, "fpq7", "bar7")); - dbfull()->Flush(FlushOptions()); - Iterator* iter = db_->NewIterator(read_options, handles_[2]); - ASSERT_EQ(CountIter(iter, "foo"), 3); - delete iter; - ASSERT_OK( - dbfull()->SetOptions(handles_[2], {{"prefix_extractor", "fixed:2"}})); - ASSERT_EQ(0, - strcmp(dbfull()->GetOptions(handles_[2]).prefix_extractor->Name(), - "rocksdb.FixedPrefix.2")); - iter = db_->NewIterator(read_options, handles_[2]); - ASSERT_EQ(CountIter(iter, "foo"), 4); - delete iter; -} - -// Verify it's possible to change prefix_extractor at runtime and iterators -// behaves as expected -TEST_F(DBTest, DynamicBloomFilterOptions) { - Options options; - options.create_if_missing = true; - options.prefix_extractor.reset(NewFixedPrefixTransform(1)); - options.disable_auto_compactions = true; - // Enable prefix bloom for SST files - BlockBasedTableOptions table_options; - table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - DestroyAndReopen(options); - - ASSERT_OK(Put("foo2", "bar2")); - ASSERT_OK(Put("foo", "bar")); - ASSERT_OK(Put("foo1", "bar1")); - ASSERT_OK(Put("fpa", "0")); - dbfull()->Flush(FlushOptions()); - ASSERT_OK(Put("foo3", "bar3")); - ASSERT_OK(Put("foo4", "bar4")); - ASSERT_OK(Put("foo5", "bar5")); - ASSERT_OK(Put("fpb", "1")); - dbfull()->Flush(FlushOptions()); - ASSERT_OK(Put("foo6", "bar6")); - ASSERT_OK(Put("foo7", "bar7")); - ASSERT_OK(Put("foo8", "bar8")); - ASSERT_OK(Put("fpc", "2")); - dbfull()->Flush(FlushOptions()); - - ReadOptions read_options; - read_options.prefix_same_as_start = true; - Iterator* iter = db_->NewIterator(read_options); - ASSERT_EQ(CountIter(iter, "foo"), 12); - delete iter; - Iterator* iter_old = db_->NewIterator(read_options); - ASSERT_EQ(CountIter(iter_old, "foo"), 12); - - ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}})); - ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(), - "rocksdb.CappedPrefix.3")); - iter = db_->NewIterator(read_options); - // "fp*" should be skipped - ASSERT_EQ(CountIter(iter, "foo"), 9); - delete iter; - - // iterator created before should not be affected and see all keys - ASSERT_EQ(CountIter(iter_old, "foo"), 12); - delete iter_old; -} TEST_F(DBTest, DynamicMiscOptions) { // Test max_sequential_skip_in_iterations diff --git a/db/prefix_test.cc b/db/prefix_test.cc index 274ad3ad4..eeecdfa02 100644 --- a/db/prefix_test.cc +++ b/db/prefix_test.cc @@ -212,6 +212,10 @@ class SamePrefixTransform : public SliceTransform { virtual bool InRange(const Slice& dst) const override { return dst == prefix_; } + + virtual bool FullLengthEnabled(size_t* /*len*/) const override { + return false; + } }; } // namespace diff --git a/include/rocksdb/comparator.h b/include/rocksdb/comparator.h index 6479a28bc..6914f812b 100644 --- a/include/rocksdb/comparator.h +++ b/include/rocksdb/comparator.h @@ -68,6 +68,12 @@ class Comparator { // if it is a wrapped comparator, may return the root one. // return itself it is not wrapped. virtual const Comparator* GetRootComparator() const { return this; } + + // given two keys, determine if t is the successor of s + virtual bool IsSameLengthImmediateSuccessor(const Slice& /*s*/, + const Slice& /*t*/) const { + return false; + } }; // Return a builtin comparator that uses lexicographic byte-wise diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 74d7fef0a..d77136d05 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1015,7 +1015,7 @@ struct ReadOptions { // can returns entries. Once the bound is reached, Valid() will be false. // "iterate_upper_bound" is exclusive ie the bound value is // not a valid entry. If iterator_extractor is not null, the Seek target - // and iterator_upper_bound need to have the same prefix. + // and iterate_upper_bound need to have the same prefix. // This is because ordering is not guaranteed outside of prefix domain. // // Default: nullptr diff --git a/include/rocksdb/slice_transform.h b/include/rocksdb/slice_transform.h index 39999cc62..5a461b776 100644 --- a/include/rocksdb/slice_transform.h +++ b/include/rocksdb/slice_transform.h @@ -60,6 +60,11 @@ class SliceTransform { // This is currently not used and remains here for backward compatibility. virtual bool InRange(const Slice& /*dst*/) const { return false; } + // Some SliceTransform will have a full length which can be used to + // determine if two keys are consecuitive. Can be disabled by always + // returning 0 + virtual bool FullLengthEnabled(size_t* /*len*/) const { return false; } + // Transform(s)=Transform(`prefix`) for any s with `prefix` as a prefix. // // This function is not used by RocksDB, but for users. If users pass diff --git a/options/options_helper.cc b/options/options_helper.cc index eba08cb85..40f468b43 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -379,6 +379,7 @@ bool ParseStructOptions( } return true; } +} // anonymouse namespace bool ParseSliceTransformHelper( const std::string& kFixedPrefixName, const std::string& kCappedPrefixName, @@ -434,7 +435,6 @@ bool ParseSliceTransform( // SliceTransforms here. return false; } -} // anonymouse namespace bool ParseOptionHelper(char* opt_address, const OptionType& opt_type, const std::string& value) { diff --git a/options/options_helper.h b/options/options_helper.h index ab91109be..2ec22077a 100644 --- a/options/options_helper.h +++ b/options/options_helper.h @@ -130,6 +130,10 @@ Status GetColumnFamilyOptionsFromMapInternal( std::vector* unsupported_options_names = nullptr, bool ignore_unknown_options = false); +bool ParseSliceTransform( + const std::string& value, + std::shared_ptr* slice_transform); + extern Status StringToMap( const std::string& opts_str, std::unordered_map* opts_map); diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index f7b6cc4d7..cb327ad5e 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -249,8 +249,8 @@ class PartitionIndexReader : public IndexReader, public Cleanable { index_block_->NewIterator(icomparator_, icomparator_->user_comparator(), nullptr, true, nullptr, index_key_includes_seq_), - false, - /* prefix_extractor */ nullptr, kIsIndex, index_key_includes_seq_); + false, true, /* prefix_extractor */ nullptr, kIsIndex, + index_key_includes_seq_); } // TODO(myabandeh): Update TwoLevelIterator to be able to make use of // on-stack BlockIter while the state is on heap. Currentlly it assumes @@ -824,6 +824,12 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, ROCKS_LOG_ERROR(rep->ioptions.info_log, "Cannot find Properties block from file."); } +#ifndef ROCKSDB_LITE + if (rep->table_properties) { + ParseSliceTransform(rep->table_properties->prefix_extractor_name, + &(rep->table_prefix_extractor)); + } +#endif // ROCKSDB_LITE // Read the compression dictionary meta block bool found_compression_dict; @@ -898,6 +904,9 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, rep->ioptions.info_log); } + bool need_upper_bound_check = + PrefixExtractorChanged(rep->table_properties.get(), prefix_extractor); + // prefetch both index and filters, down to all partitions const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0; BlockBasedTableOptions::IndexType index_type = new_table->UpdateIndexType(); @@ -932,14 +941,12 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, // Hack: Call NewIndexIterator() to implicitly add index to the // block_cache CachableEntry index_entry; - bool prefix_extractor_changed = false; // check prefix_extractor match only if hash based index is used - if (rep->index_type == BlockBasedTableOptions::kHashSearch) { - prefix_extractor_changed = PrefixExtractorChanged( - rep->table_properties.get(), prefix_extractor); - } + bool disable_prefix_seek = + rep->index_type == BlockBasedTableOptions::kHashSearch && + need_upper_bound_check; unique_ptr iter(new_table->NewIndexIterator( - ReadOptions(), prefix_extractor_changed, nullptr, &index_entry)); + ReadOptions(), disable_prefix_seek, nullptr, &index_entry)); s = iter->status(); if (s.ok()) { // This is the first call to NewIndexIterator() since we're in Open(). @@ -958,9 +965,11 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, } if (s.ok() && prefetch_filter) { // Hack: Call GetFilter() to implicitly add filter to the block_cache - auto filter_entry = new_table->GetFilter(prefix_extractor); + auto filter_entry = + new_table->GetFilter(rep->table_prefix_extractor.get()); if (filter_entry.value != nullptr && prefetch_all) { - filter_entry.value->CacheDependencies(pin_all, prefix_extractor); + filter_entry.value->CacheDependencies( + pin_all, rep->table_prefix_extractor.get()); } // if pin_filter is true then save it in rep_->filter_entry; it will be // released in the destructor only, hence it will be pinned in the @@ -990,14 +999,14 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, // Set filter block if (rep->filter_policy) { const bool is_a_filter_partition = true; - auto filter = - new_table->ReadFilter(prefetch_buffer.get(), rep->filter_handle, - !is_a_filter_partition, prefix_extractor); + auto filter = new_table->ReadFilter( + prefetch_buffer.get(), rep->filter_handle, !is_a_filter_partition, + rep->table_prefix_extractor.get()); rep->filter.reset(filter); // Refer to the comment above about paritioned indexes always being // cached if (filter && (prefetch_index_and_filter_in_cache || level == 0)) { - filter->CacheDependencies(pin_all, prefix_extractor); + filter->CacheDependencies(pin_all, rep->table_prefix_extractor.get()); } } } else { @@ -1784,20 +1793,28 @@ BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator( // Otherwise, this method guarantees no I/O will be incurred. // // REQUIRES: this method shouldn't be called while the DB lock is held. -bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key, - const SliceTransform* prefix_extractor) { +bool BlockBasedTable::PrefixMayMatch( + const Slice& internal_key, const ReadOptions& read_options, + const SliceTransform* options_prefix_extractor, + const bool need_upper_bound_check) { if (!rep_->filter_policy) { return true; } - assert(prefix_extractor != nullptr); + const SliceTransform* prefix_extractor; + + if (rep_->table_prefix_extractor == nullptr) { + if (need_upper_bound_check) { + return true; + } + prefix_extractor = options_prefix_extractor; + } else { + prefix_extractor = rep_->table_prefix_extractor.get(); + } auto user_key = ExtractUserKey(internal_key); if (!prefix_extractor->InDomain(user_key)) { return true; } - assert(rep_->table_properties->prefix_extractor_name.compare( - prefix_extractor->Name()) == 0); - auto prefix = prefix_extractor->Transform(user_key); bool may_match = true; Status s; @@ -1805,12 +1822,23 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key, // First, try check with full filter auto filter_entry = GetFilter(prefix_extractor); FilterBlockReader* filter = filter_entry.value; + bool filter_checked = true; if (filter != nullptr) { if (!filter->IsBlockBased()) { const Slice* const const_ikey_ptr = &internal_key; - may_match = filter->PrefixMayMatch(prefix, prefix_extractor, kNotValid, - false, const_ikey_ptr); + may_match = filter->RangeMayExist( + read_options.iterate_upper_bound, user_key, prefix_extractor, + rep_->internal_comparator.user_comparator(), const_ikey_ptr, + &filter_checked, need_upper_bound_check); } else { + // if prefix_extractor changed for block based filter, skip filter + if (need_upper_bound_check) { + if (!rep_->filter_entry.IsSet()) { + filter_entry.Release(rep_->table_options.block_cache.get()); + } + return true; + } + auto prefix = prefix_extractor->Transform(user_key); InternalKey internal_key_prefix(prefix, kMaxSequenceNumber, kTypeValue); auto internal_prefix = internal_key_prefix.Encode(); @@ -1823,9 +1851,9 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key, // Then, try find it within each block // we already know prefix_extractor and prefix_extractor_name must match // because `CheckPrefixMayMatch` first checks `check_filter_ == true` - bool prefix_extractor_changed = false; unique_ptr iiter( - NewIndexIterator(no_io_read_options, prefix_extractor_changed)); + NewIndexIterator(no_io_read_options, + /* need_upper_bound_check */ false)); iiter->Seek(internal_prefix); if (!iiter->Valid()) { @@ -1866,10 +1894,12 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key, } } - Statistics* statistics = rep_->ioptions.statistics; - RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED); - if (!may_match) { - RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL); + if (filter_checked) { + Statistics* statistics = rep_->ioptions.statistics; + RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED); + if (!may_match) { + RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL); + } } // if rep_->filter_entry is not set, we should call Release(); otherwise @@ -1878,12 +1908,11 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key, if (!rep_->filter_entry.IsSet()) { filter_entry.Release(rep_->table_options.block_cache.get()); } - return may_match; } void BlockBasedTableIterator::Seek(const Slice& target) { - if (!CheckPrefixMayMatch(target, prefix_extractor_)) { + if (!CheckPrefixMayMatch(target)) { ResetDataIter(); return; } @@ -1911,7 +1940,7 @@ void BlockBasedTableIterator::Seek(const Slice& target) { } void BlockBasedTableIterator::SeekForPrev(const Slice& target) { - if (!CheckPrefixMayMatch(target, prefix_extractor_)) { + if (!CheckPrefixMayMatch(target)) { ResetDataIter(); return; } @@ -2101,7 +2130,7 @@ void BlockBasedTableIterator::FindKeyBackward() { InternalIterator* BlockBasedTable::NewIterator( const ReadOptions& read_options, const SliceTransform* prefix_extractor, Arena* arena, bool skip_filters, bool for_compaction) { - bool prefix_extractor_changed = + bool need_upper_bound_check = PrefixExtractorChanged(rep_->table_properties.get(), prefix_extractor); const bool kIsNotIndex = false; if (arena == nullptr) { @@ -2109,21 +2138,21 @@ InternalIterator* BlockBasedTable::NewIterator( this, read_options, rep_->internal_comparator, NewIndexIterator( read_options, - prefix_extractor_changed && + need_upper_bound_check && rep_->index_type == BlockBasedTableOptions::kHashSearch), !skip_filters && !read_options.total_order_seek && - prefix_extractor != nullptr && !prefix_extractor_changed, - prefix_extractor, kIsNotIndex, true /*key_includes_seq*/, - for_compaction); + prefix_extractor != nullptr, + need_upper_bound_check, prefix_extractor, kIsNotIndex, + true /*key_includes_seq*/, for_compaction); } else { auto* mem = arena->AllocateAligned(sizeof(BlockBasedTableIterator)); return new (mem) BlockBasedTableIterator( this, read_options, rep_->internal_comparator, - NewIndexIterator(read_options, prefix_extractor_changed), + NewIndexIterator(read_options, need_upper_bound_check), !skip_filters && !read_options.total_order_seek && - prefix_extractor != nullptr && !prefix_extractor_changed, - prefix_extractor, kIsNotIndex, true /*key_includes_seq*/, - for_compaction); + prefix_extractor != nullptr, + need_upper_bound_check, prefix_extractor, kIsNotIndex, + true /*key_includes_seq*/, for_compaction); } } @@ -2210,14 +2239,14 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, BlockIter iiter_on_stack; // if prefix_extractor found in block differs from options, disable // BlockPrefixIndex. Only do this check when index_type is kHashSearch. - bool prefix_extractor_changed = false; + bool need_upper_bound_check = false; if (rep_->index_type == BlockBasedTableOptions::kHashSearch) { - prefix_extractor_changed = PrefixExtractorChanged( + need_upper_bound_check = PrefixExtractorChanged( rep_->table_properties.get(), prefix_extractor); } - auto iiter = NewIndexIterator(read_options, prefix_extractor_changed, - &iiter_on_stack, /* index_entry */ nullptr, - get_context); + auto iiter = + NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack, + /* index_entry */ nullptr, get_context); std::unique_ptr iiter_unique_ptr; if (iiter != &iiter_on_stack) { iiter_unique_ptr.reset(iiter); @@ -2479,7 +2508,7 @@ Status BlockBasedTable::CreateIndexReader( // kHashSearch requires non-empty prefix_extractor but bypass checking // prefix_extractor here since we have no access to MutableCFOptions. - // Add prefix_extractor_changed flag in BlockBasedTable::NewIndexIterator. + // Add need_upper_bound_check flag in BlockBasedTable::NewIndexIterator. // If prefix_extractor does not match prefix_extractor_name from table // properties, turn off Hash Index by setting total_order_seek to true diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index 53ba5e197..821e672a6 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -95,7 +95,9 @@ class BlockBasedTable : public TableReader { bool skip_filters = false, int level = -1); bool PrefixMayMatch(const Slice& internal_key, - const SliceTransform* prefix_extractor = nullptr); + const ReadOptions& read_options, + const SliceTransform* options_prefix_extractor, + const bool need_upper_bound_check); // Returns a new iterator over the table contents. // The result of NewIterator() is initially invalid (caller must @@ -277,7 +279,7 @@ class BlockBasedTable : public TableReader { // 3. We disallowed any io to be performed, that is, read_options == // kBlockCacheTier InternalIterator* NewIndexIterator( - const ReadOptions& read_options, bool prefix_extractor_changed = false, + const ReadOptions& read_options, bool need_upper_bound_check = false, BlockIter* input_iter = nullptr, CachableEntry* index_entry = nullptr, GetContext* get_context = nullptr); @@ -481,6 +483,7 @@ struct BlockBasedTable::Rep { // and compatible with existing code, we introduce a wrapper that allows // block to extract prefix without knowing if a key is internal or not. unique_ptr internal_prefix_transform; + std::shared_ptr table_prefix_extractor; // only used in level 0 files when pin_l0_filter_and_index_blocks_in_cache is // true or in all levels when pin_top_level_index_and_filter is set in @@ -515,6 +518,7 @@ class BlockBasedTableIterator : public InternalIterator { const ReadOptions& read_options, const InternalKeyComparator& icomp, InternalIterator* index_iter, bool check_filter, + bool need_upper_bound_check, const SliceTransform* prefix_extractor, bool is_index, bool key_includes_seq = true, bool for_compaction = false) @@ -525,10 +529,11 @@ class BlockBasedTableIterator : public InternalIterator { pinned_iters_mgr_(nullptr), block_iter_points_to_real_block_(false), check_filter_(check_filter), + need_upper_bound_check_(need_upper_bound_check), + prefix_extractor_(prefix_extractor), is_index_(is_index), key_includes_seq_(key_includes_seq), - for_compaction_(for_compaction), - prefix_extractor_(prefix_extractor) {} + for_compaction_(for_compaction) {} ~BlockBasedTableIterator() { delete index_iter_; } @@ -575,9 +580,10 @@ class BlockBasedTableIterator : public InternalIterator { block_iter_points_to_real_block_; } - bool CheckPrefixMayMatch(const Slice& ikey, - const SliceTransform* prefix_extractor = nullptr) { - if (check_filter_ && !table_->PrefixMayMatch(ikey, prefix_extractor)) { + bool CheckPrefixMayMatch(const Slice& ikey) { + if (check_filter_ && + !table_->PrefixMayMatch(ikey, read_options_, prefix_extractor_, + need_upper_bound_check_)) { // TODO remember the iterator is invalidated because of prefix // match. This can avoid the upper level file iterator to falsely // believe the position is the end of the SST file and move to @@ -621,6 +627,9 @@ class BlockBasedTableIterator : public InternalIterator { bool block_iter_points_to_real_block_; bool is_out_of_bound_ = false; bool check_filter_; + // TODO(Zhongyi): pick a better name + bool need_upper_bound_check_; + const SliceTransform* prefix_extractor_; // If the blocks over which we iterate are index blocks bool is_index_; // If the keys in the blocks over which we iterate include 8 byte sequence @@ -629,7 +638,6 @@ class BlockBasedTableIterator : public InternalIterator { bool for_compaction_; // TODO use block offset instead std::string prev_index_value_; - const SliceTransform* prefix_extractor_; static const size_t kInitReadaheadSize = 8 * 1024; // Found that 256 KB readahead size provides the best performance, based on diff --git a/table/filter_block.h b/table/filter_block.h index 8f3666e1b..a93049547 100644 --- a/table/filter_block.h +++ b/table/filter_block.h @@ -123,6 +123,17 @@ class FilterBlockReader { virtual void CacheDependencies(bool /*pin*/, const SliceTransform* /*prefix_extractor*/) {} + virtual bool RangeMayExist( + const Slice* /*iterate_upper_bound*/, const Slice& user_key, + const SliceTransform* prefix_extractor, + const Comparator* /*comparator*/, const Slice* const const_ikey_ptr, + bool* filter_checked, bool /*need_upper_bound_check*/) { + *filter_checked = true; + Slice prefix = prefix_extractor->Transform(user_key); + return PrefixMayMatch(prefix, prefix_extractor, kNotValid, false, + const_ikey_ptr); + } + protected: bool whole_key_filtering_; diff --git a/table/full_filter_block.cc b/table/full_filter_block.cc index 150caa6d1..fec42c407 100644 --- a/table/full_filter_block.cc +++ b/table/full_filter_block.cc @@ -98,6 +98,10 @@ FullFilterBlockReader::FullFilterBlockReader( contents_(contents) { assert(filter_bits_reader != nullptr); filter_bits_reader_.reset(filter_bits_reader); + if (prefix_extractor_ != nullptr) { + full_length_enabled_ = + prefix_extractor_->FullLengthEnabled(&prefix_extractor_full_length_); + } } FullFilterBlockReader::FullFilterBlockReader( @@ -153,4 +157,57 @@ bool FullFilterBlockReader::MayMatch(const Slice& entry) { size_t FullFilterBlockReader::ApproximateMemoryUsage() const { return contents_.size(); } + +bool FullFilterBlockReader::RangeMayExist(const Slice* iterate_upper_bound, + const Slice& user_key, const SliceTransform* prefix_extractor, + const Comparator* comparator, const Slice* const const_ikey_ptr, + bool* filter_checked, bool need_upper_bound_check) { + if (!prefix_extractor || !prefix_extractor->InDomain(user_key)) { + *filter_checked = false; + return true; + } + Slice prefix = prefix_extractor->Transform(user_key); + if (need_upper_bound_check && + !IsFilterCompatible(iterate_upper_bound, prefix, comparator)) { + *filter_checked = false; + return true; + } else { + *filter_checked = true; + return PrefixMayMatch(prefix, prefix_extractor, kNotValid, false, + const_ikey_ptr); + } +} + +bool FullFilterBlockReader::IsFilterCompatible( + const Slice* iterate_upper_bound, const Slice& prefix, + const Comparator* comparator) { + // Try to reuse the bloom filter in the SST table if prefix_extractor in + // mutable_cf_options has changed. If range [user_key, upper_bound) all + // share the same prefix then we may still be able to use the bloom filter. + if (iterate_upper_bound != nullptr && prefix_extractor_) { + if (!prefix_extractor_->InDomain(*iterate_upper_bound)) { + return false; + } + Slice upper_bound_xform = + prefix_extractor_->Transform(*iterate_upper_bound); + // first check if user_key and upper_bound all share the same prefix + if (!comparator->Equal(prefix, upper_bound_xform)) { + // second check if user_key's prefix is the immediate predecessor of + // upper_bound and have the same length. If so, we know for sure all + // keys in the range [user_key, upper_bound) share the same prefix. + // Also need to make sure upper_bound are full length to ensure + // correctness + if (!full_length_enabled_ || + iterate_upper_bound->size() != prefix_extractor_full_length_ || + !comparator->IsSameLengthImmediateSuccessor(prefix, + *iterate_upper_bound)) { + return false; + } + } + return true; + } else { + return false; + } +} + } // namespace rocksdb diff --git a/table/full_filter_block.h b/table/full_filter_block.h index efa93da12..aed4cfd7d 100644 --- a/table/full_filter_block.h +++ b/table/full_filter_block.h @@ -108,18 +108,27 @@ class FullFilterBlockReader : public FilterBlockReader { uint64_t block_offset = kNotValid, const bool no_io = false, const Slice* const const_ikey_ptr = nullptr) override; virtual size_t ApproximateMemoryUsage() const override; - + virtual bool RangeMayExist(const Slice* iterate_upper_bound, const Slice& user_key, + const SliceTransform* prefix_extractor, + const Comparator* comparator, + const Slice* const const_ikey_ptr, bool* filter_checked, + bool need_upper_bound_check) override; private: const SliceTransform* prefix_extractor_; Slice contents_; std::unique_ptr filter_bits_reader_; BlockContents block_contents_; std::unique_ptr filter_data_; + bool full_length_enabled_; + size_t prefix_extractor_full_length_; // No copying allowed FullFilterBlockReader(const FullFilterBlockReader&); bool MayMatch(const Slice& entry); void operator=(const FullFilterBlockReader&); + bool IsFilterCompatible(const Slice* iterate_upper_bound, + const Slice& prefix, const Comparator* comparator); + }; } // namespace rocksdb diff --git a/table/table_test.cc b/table/table_test.cc index bada1a2c4..2054d036d 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -692,6 +692,9 @@ class FixedOrLessPrefixTransform : public SliceTransform { virtual bool InRange(const Slice& dst) const override { return (dst.size() <= prefix_len_); } + virtual bool FullLengthEnabled(size_t* /*len*/) const override { + return false; + } }; class HarnessTest : public testing::Test { diff --git a/util/comparator.cc b/util/comparator.cc index 007f449a8..2c427a7ac 100644 --- a/util/comparator.cc +++ b/util/comparator.cc @@ -98,6 +98,32 @@ class BytewiseComparatorImpl : public Comparator { } // *key is a run of 0xffs. Leave it alone. } + + virtual bool IsSameLengthImmediateSuccessor(const Slice& s, + const Slice& t) const override { + if (s.size() != t.size() || s.size() == 0) { + return false; + } + size_t diff_ind = s.difference_offset(t); + // same slice + if (diff_ind >= s.size()) return false; + uint8_t byte_s = static_cast(s[diff_ind]); + uint8_t byte_t = static_cast(t[diff_ind]); + // first different byte must be consecutive, and remaining bytes must be + // 0xff for s and 0x00 for t + if (byte_s != uint8_t{0xff} && byte_s + 1 == byte_t) { + for (size_t i = diff_ind + 1; i < s.size(); ++i) { + byte_s = static_cast(s[i]); + byte_t = static_cast(t[i]); + if (byte_s != uint8_t{0xff} || byte_t != uint8_t{0x00}) { + return false; + } + } + return true; + } else { + return false; + } + } }; class ReverseBytewiseComparatorImpl : public BytewiseComparatorImpl { diff --git a/util/slice.cc b/util/slice.cc index d344fbacf..ed8c48169 100644 --- a/util/slice.cc +++ b/util/slice.cc @@ -47,6 +47,11 @@ class FixedPrefixTransform : public SliceTransform { return (dst.size() == prefix_len_); } + virtual bool FullLengthEnabled(size_t* len) const override { + *len = prefix_len_; + return true; + } + virtual bool SameResultWhenAppended(const Slice& prefix) const override { return InDomain(prefix); } @@ -80,6 +85,11 @@ class CappedPrefixTransform : public SliceTransform { return (dst.size() <= cap_len_); } + virtual bool FullLengthEnabled(size_t* len) const override { + *len = cap_len_; + return true; + } + virtual bool SameResultWhenAppended(const Slice& prefix) const override { return prefix.size() >= cap_len_; }