diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc index 3ed9947c2..48bcaa254 100644 --- a/db/db_bloom_filter_test.cc +++ b/db/db_bloom_filter_test.cc @@ -3113,6 +3113,382 @@ TEST_F(DBBloomFilterTest, SeekForPrevWithPartitionedFilters) { it.reset(); } +namespace { +class BackwardBytewiseComparator : public Comparator { + public: + const char* Name() const override { return "BackwardBytewiseComparator"; } + + int Compare(const Slice& a, const Slice& b) const override { + int min_size_neg = -static_cast(std::min(a.size(), b.size())); + const char* a_end = a.data() + a.size(); + const char* b_end = b.data() + b.size(); + for (int i = -1; i >= min_size_neg; --i) { + if (a_end[i] != b_end[i]) { + if (static_cast(a_end[i]) < + static_cast(b_end[i])) { + return -1; + } else { + return 1; + } + } + } + return static_cast(a.size()) - static_cast(b.size()); + } + + void FindShortestSeparator(std::string* /*start*/, + const Slice& /*limit*/) const override {} + + void FindShortSuccessor(std::string* /*key*/) const override {} +}; + +const BackwardBytewiseComparator kBackwardBytewiseComparator{}; + +class FixedSuffix4Transform : public SliceTransform { + const char* Name() const override { return "FixedSuffixTransform"; } + + Slice Transform(const Slice& src) const override { + return Slice(src.data() + src.size() - 4, 4); + } + + bool InDomain(const Slice& src) const override { return src.size() >= 4; } +}; + +std::pair GetBloomStat(const Options& options, bool sst) { + if (sst) { + return { + options.statistics->getAndResetTickerCount(BLOOM_FILTER_PREFIX_CHECKED), + options.statistics->getAndResetTickerCount(BLOOM_FILTER_PREFIX_USEFUL)}; + } else { + auto hit = std::exchange(get_perf_context()->bloom_memtable_hit_count, 0); + auto miss = std::exchange(get_perf_context()->bloom_memtable_miss_count, 0); + return {hit + miss, miss}; + } +} + +std::pair CheckedAndUseful(uint64_t checked, + uint64_t useful) { + return {checked, useful}; +} +} // namespace + +// This uses a prefix_extractor + comparator combination that violates +// one of the old obsolete, unnecessary axioms of prefix extraction: +// * key.starts_with(prefix(key)) +// This axiom is not really needed, and we validate that here. +TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter1) { + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10)); + bbto.whole_key_filtering = false; + + Options options = CurrentOptions(); + options.comparator = &kBackwardBytewiseComparator; + options.prefix_extractor = std::make_shared(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + options.memtable_prefix_bloom_size_ratio = 0.1; + options.statistics = CreateDBStatistics(); + + DestroyAndReopen(options); + + ASSERT_OK(Put("321aaaa", "val1")); + ASSERT_OK(Put("112aaaa", "val2")); + ASSERT_OK(Put("009aaaa", "val3")); + ASSERT_OK(Put("baa", "val4")); // out of domain + ASSERT_OK(Put("321abaa", "val5")); + ASSERT_OK(Put("zzz", "val6")); // out of domain + + for (auto flushed : {false, true}) { + SCOPED_TRACE("flushed=" + std::to_string(flushed)); + if (flushed) { + ASSERT_OK(Flush()); + } + ReadOptions read_options; + if (flushed) { // TODO: support auto_prefix_mode in memtable? + read_options.auto_prefix_mode = true; + } + EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0)); + { + Slice ub("999aaaa"); + read_options.iterate_upper_bound = &ub; + std::unique_ptr iter(db_->NewIterator(read_options)); + EXPECT_EQ(CountIter(iter, "aaaa"), 3); + EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0)); + } + { + Slice ub("999abaa"); + read_options.iterate_upper_bound = &ub; + std::unique_ptr iter(db_->NewIterator(read_options)); + EXPECT_EQ(CountIter(iter, "abaa"), 1); + EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0)); + } + { + Slice ub("999acaa"); + read_options.iterate_upper_bound = &ub; + std::unique_ptr iter(db_->NewIterator(read_options)); + EXPECT_EQ(CountIter(iter, "acaa"), 0); + EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 1)); + } + { + Slice ub("zzzz"); + read_options.iterate_upper_bound = &ub; + std::unique_ptr iter(db_->NewIterator(read_options)); + EXPECT_EQ(CountIter(iter, "baa"), 3); + if (flushed) { // TODO: fix memtable case + EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0)); + } + } + } +} + +// This uses a prefix_extractor + comparator combination that violates +// one of the old obsolete, unnecessary axioms of prefix extraction: +// * Compare(prefix(key), key) <= 0 +// This axiom is not really needed, and we validate that here. +TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter2) { + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10)); + bbto.whole_key_filtering = false; + + Options options = CurrentOptions(); + options.comparator = ReverseBytewiseComparator(); + options.prefix_extractor.reset(NewFixedPrefixTransform(4)); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + options.memtable_prefix_bloom_size_ratio = 0.1; + options.statistics = CreateDBStatistics(); + + DestroyAndReopen(options); + + ASSERT_OK(Put("aaaa123", "val1")); + ASSERT_OK(Put("aaaa211", "val2")); + ASSERT_OK(Put("aaaa900", "val3")); + ASSERT_OK(Put("aab", "val4")); // out of domain + ASSERT_OK(Put("aaba123", "val5")); + ASSERT_OK(Put("qqqq123", "val7")); + ASSERT_OK(Put("qqqq", "val8")); + ASSERT_OK(Put("zzz", "val8")); // out of domain + + for (auto flushed : {false, true}) { + SCOPED_TRACE("flushed=" + std::to_string(flushed)); + if (flushed) { + ASSERT_OK(Flush()); + } + ReadOptions read_options; + if (flushed) { // TODO: support auto_prefix_mode in memtable? + read_options.auto_prefix_mode = true; + } else { + // TODO: why needed? + get_perf_context()->bloom_memtable_hit_count = 0; + get_perf_context()->bloom_memtable_miss_count = 0; + } + EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0)); + { + Slice ub("aaaa000"); + read_options.iterate_upper_bound = &ub; + std::unique_ptr iter(db_->NewIterator(read_options)); + EXPECT_EQ(CountIter(iter, "aaaa999"), 3); + EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0)); + } + { + // Note: prefix does work as upper bound + Slice ub("aaaa"); + read_options.iterate_upper_bound = &ub; + std::unique_ptr iter(db_->NewIterator(read_options)); + EXPECT_EQ(CountIter(iter, "aaaa999"), 3); + EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0)); + } + { + // Note: prefix does not work here as seek key + Slice ub("aaaa500"); + read_options.iterate_upper_bound = &ub; + std::unique_ptr iter(db_->NewIterator(read_options)); + EXPECT_EQ(CountIter(iter, "aaaa"), 0); + EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0)); + } + { + Slice ub("aaba000"); + read_options.iterate_upper_bound = &ub; + std::unique_ptr iter(db_->NewIterator(read_options)); + EXPECT_EQ(CountIter(iter, "aaba999"), 1); + EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0)); + } + { + Slice ub("aaca000"); + read_options.iterate_upper_bound = &ub; + std::unique_ptr iter(db_->NewIterator(read_options)); + EXPECT_EQ(CountIter(iter, "aaca999"), 0); + EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 1)); + } + { + Slice ub("aaaz"); + read_options.iterate_upper_bound = &ub; + std::unique_ptr iter(db_->NewIterator(read_options)); + EXPECT_EQ(CountIter(iter, "zzz"), 5); + EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0)); + } + { + // Note: prefix does work here as seek key, but only finds key equal + // to prefix (others with same prefix are less) + read_options.auto_prefix_mode = false; + read_options.iterate_upper_bound = nullptr; + read_options.prefix_same_as_start = true; + std::unique_ptr iter(db_->NewIterator(read_options)); + EXPECT_EQ(CountIter(iter, "qqqq"), 1); + EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0)); + } + } +} + +namespace { +// A weird comparator that in combination with NonIdempotentFixed4Transform +// breaks an old axiom of prefix filtering. +class WeirdComparator : public Comparator { + public: + const char* Name() const override { return "WeirdComparator"; } + + int Compare(const Slice& a, const Slice& b) const override { + bool a_in = a.size() >= 5; + bool b_in = b.size() >= 5; + if (a_in != b_in) { + // Order keys after prefixes + return a_in - b_in; + } + if (a_in) { + return BytewiseComparator()->Compare(a, b); + } else { + // Different ordering on the prefixes + return ReverseBytewiseComparator()->Compare(a, b); + } + } + + void FindShortestSeparator(std::string* /*start*/, + const Slice& /*limit*/) const override {} + + void FindShortSuccessor(std::string* /*key*/) const override {} +}; +const WeirdComparator kWeirdComparator{}; + +// Non-idempotentent because prefix is always 4 bytes, but this is +// out-of-domain for keys to be assigned prefixes (>= 5 bytes) +class NonIdempotentFixed4Transform : public SliceTransform { + const char* Name() const override { return "NonIdempotentFixed4Transform"; } + + Slice Transform(const Slice& src) const override { + return Slice(src.data(), 4); + } + + bool InDomain(const Slice& src) const override { return src.size() >= 5; } +}; +} // namespace + +// This uses a prefix_extractor + comparator combination that violates +// two of the old obsolete, unnecessary axioms of prefix extraction: +// * prefix(prefix(key)) == prefix(key) +// * If Compare(k1, k2) <= 0, then Compare(prefix(k1), prefix(k2)) <= 0 +// This axiom is not really needed, and we validate that here. +TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter3) { + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10)); + bbto.whole_key_filtering = false; + + Options options = CurrentOptions(); + options.prefix_extractor = std::make_shared(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + options.memtable_prefix_bloom_size_ratio = 0.1; + options.statistics = CreateDBStatistics(); + + for (auto weird_comparator : {false, true}) { + if (weird_comparator) { + options.comparator = &kWeirdComparator; + } + DestroyAndReopen(options); + + ASSERT_OK(Put("aaaa123", "val1")); + ASSERT_OK(Put("aaaa211", "val2")); + ASSERT_OK(Put("aaaa900", "val3")); + ASSERT_OK(Put("aab", "val4")); // out of domain + ASSERT_OK(Put("aaba123", "val5")); + ASSERT_OK(Put("qqqq123", "val7")); + ASSERT_OK(Put("qqqq", "val8")); // out of domain + ASSERT_OK(Put("zzzz", "val8")); // out of domain + + for (auto flushed : {false, true}) { + SCOPED_TRACE("flushed=" + std::to_string(flushed)); + if (flushed) { + ASSERT_OK(Flush()); + } + ReadOptions read_options; + if (flushed) { // TODO: support auto_prefix_mode in memtable? + read_options.auto_prefix_mode = true; + } else { + // TODO: why needed? + get_perf_context()->bloom_memtable_hit_count = 0; + get_perf_context()->bloom_memtable_miss_count = 0; + } + EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0)); + { + Slice ub("aaaa999"); + read_options.iterate_upper_bound = &ub; + std::unique_ptr iter(db_->NewIterator(read_options)); + EXPECT_EQ(CountIter(iter, "aaaa000"), 3); + EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0)); + } + { + // Note: prefix as seek key is not bloom-optimized + // Note: the count works with weird_comparator because "aaaa" is + // ordered as the last of the prefixes + Slice ub("aaaa999"); + read_options.iterate_upper_bound = &ub; + std::unique_ptr iter(db_->NewIterator(read_options)); + EXPECT_EQ(CountIter(iter, "aaaa"), 3); + EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0)); + } + { + Slice ub("aaba9"); + read_options.iterate_upper_bound = &ub; + std::unique_ptr iter(db_->NewIterator(read_options)); + EXPECT_EQ(CountIter(iter, "aaba0"), 1); + EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0)); + } + { + Slice ub("aaca9"); + read_options.iterate_upper_bound = &ub; + std::unique_ptr iter(db_->NewIterator(read_options)); + EXPECT_EQ(CountIter(iter, "aaca0"), 0); + EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 1)); + } + { + Slice ub("qqqq9"); + read_options.iterate_upper_bound = &ub; + std::unique_ptr iter(db_->NewIterator(read_options)); + EXPECT_EQ(CountIter(iter, "qqqq0"), 1); + EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0)); + } + { + // Note: prefix as seek key is not bloom-optimized + Slice ub("qqqq9"); + read_options.iterate_upper_bound = &ub; + std::unique_ptr iter(db_->NewIterator(read_options)); + EXPECT_EQ(CountIter(iter, "qqqq"), weird_comparator ? 7 : 2); + EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0)); + } + { + // Note: prefix as seek key is not bloom-optimized + Slice ub("zzzz9"); + read_options.iterate_upper_bound = &ub; + std::unique_ptr iter(db_->NewIterator(read_options)); + EXPECT_EQ(CountIter(iter, "zzzz"), weird_comparator ? 8 : 1); + EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0)); + } + { + Slice ub("zzzz9"); + read_options.iterate_upper_bound = &ub; + std::unique_ptr iter(db_->NewIterator(read_options)); + EXPECT_EQ(CountIter(iter, "aab"), weird_comparator ? 6 : 5); + EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0)); + } + } + } +} + #endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 2d273c51d..18016e734 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -240,18 +240,36 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { // Dynamically changeable through SetOptions() API int level0_file_num_compaction_trigger = 4; - // If non-nullptr, use the specified function to determine the - // prefixes for keys. These prefixes will be placed in the filter. - // Depending on the workload, this can reduce the number of read-IOP - // cost for scans when a prefix is passed via ReadOptions to - // db.NewIterator(). For prefix filtering to work properly, - // "prefix_extractor" and "comparator" must be such that the following - // properties hold: - // - // 1) key.starts_with(prefix(key)) - // 2) Compare(prefix(key), key) <= 0. - // 3) If Compare(k1, k2) <= 0, then Compare(prefix(k1), prefix(k2)) <= 0 - // 4) prefix(prefix(key)) == prefix(key) + // If non-nullptr, use the specified function to put keys in contiguous + // groups called "prefixes". These prefixes are used to place one + // representative entry for the group into the Bloom filter + // rather than an entry for each key (see whole_key_filtering). + // Under certain conditions, this enables optimizing some range queries + // (Iterators) in addition to some point lookups (Get/MultiGet). + // + // Together `prefix_extractor` and `comparator` must satisfy one essential + // property for valid prefix filtering of range queries: + // If Compare(k1, k2) <= 0 and Compare(k2, k3) <= 0 and + // InDomain(k1) and InDomain(k3) and prefix(k1) == prefix(k3), + // Then InDomain(k2) and prefix(k2) == prefix(k1) + // + // In other words, all keys with the same prefix must be in a contiguous + // group by comparator order, and cannot be interrupted by keys with no + // prefix ("out of domain"). (This makes it valid to conclude that no + // entries within some bounds are present if the upper and lower bounds + // have a common prefix and no entries with that same prefix are present.) + // + // Some other properties are recommended but not strictly required. Under + // most sensible comparators, the following will need to hold true to + // satisfy the essential property above: + // * "Prefix is a prefix": key.starts_with(prefix(key)) + // * "Prefixes preserve ordering": If Compare(k1, k2) <= 0, then + // Compare(prefix(k1), prefix(k2)) <= 0 + // + // The next two properties ensure that seeking to a prefix allows + // enumerating all entries with that prefix: + // * "Prefix starts the group": Compare(prefix(key), key) <= 0 + // * "Prefix idempotent": prefix(prefix(key)) == prefix(key) // // Default: nullptr std::shared_ptr prefix_extractor = nullptr;