use user_key and iterate_upper_bound to determine compatibility of bloom filters (#3899)

Summary:
Previously in https://github.com/facebook/rocksdb/pull/3601 bloom filter will only be checked if `prefix_extractor` in the mutable_cf_options matches the one found in the SST file.
This PR relaxes the requirement by checking if all keys in the range [user_key, iterate_upper_bound) all share the same prefix after transforming using the BF in the SST file. If so, the bloom filter is considered compatible and will continue to be looked at.
Closes https://github.com/facebook/rocksdb/pull/3899

Differential Revision: D8157459

Pulled By: miasantreble

fbshipit-source-id: 18d17cba56a1005162f8d5db7a27aba277089c41
main
Zhongyi Xie 6 years ago committed by Facebook Github Bot
parent 967aa8157a
commit 408205a36b
  1. 64
      db/comparator_db_test.cc
  2. 407
      db/db_bloom_filter_test.cc
  3. 174
      db/db_test.cc
  4. 4
      db/prefix_test.cc
  5. 6
      include/rocksdb/comparator.h
  6. 2
      include/rocksdb/options.h
  7. 5
      include/rocksdb/slice_transform.h
  8. 2
      options/options_helper.cc
  9. 4
      options/options_helper.h
  10. 121
      table/block_based_table_reader.cc
  11. 24
      table/block_based_table_reader.h
  12. 11
      table/filter_block.h
  13. 57
      table/full_filter_block.cc
  14. 11
      table/full_filter_block.h
  15. 3
      table/table_test.cc
  16. 26
      util/comparator.cc
  17. 10
      util/slice.cc

@ -449,6 +449,70 @@ TEST_P(ComparatorDBTest, TwoStrComparator) {
} }
} }
TEST_P(ComparatorDBTest, IsSameLengthImmediateSuccessor) {
{
// different length
Slice s("abcxy");
Slice t("abcxyz");
ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
}
{
Slice s("abcxyz");
Slice t("abcxy");
ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
}
{
// not last byte different
Slice s("abc1xyz");
Slice t("abc2xyz");
ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
}
{
// same string
Slice s("abcxyz");
Slice t("abcxyz");
ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
}
{
Slice s("abcxy");
Slice t("abcxz");
ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
}
{
Slice s("abcxz");
Slice t("abcxy");
ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
}
{
const char s_array[] = "\x50\x8a\xac";
const char t_array[] = "\x50\x8a\xad";
Slice s(s_array);
Slice t(t_array);
ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
}
{
const char s_array[] = "\x50\x8a\xff";
const char t_array[] = "\x50\x8b\x00";
Slice s(s_array, 3);
Slice t(t_array, 3);
ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
}
{
const char s_array[] = "\x50\x8a\xff\xff";
const char t_array[] = "\x50\x8b\x00\x00";
Slice s(s_array, 4);
Slice t(t_array, 4);
ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
}
{
const char s_array[] = "\x50\x8a\xff\xff";
const char t_array[] = "\x50\x8b\x00\x01";
Slice s(s_array, 4);
Slice t(t_array, 4);
ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t));
}
}
TEST_P(ComparatorDBTest, FindShortestSeparator) { TEST_P(ComparatorDBTest, FindShortestSeparator) {
std::string s1 = "abc1xyz"; std::string s1 = "abc1xyz";
std::string s2 = "abc3xy"; std::string s2 = "abc3xy";

@ -1097,6 +1097,413 @@ TEST_F(DBBloomFilterTest, OptimizeFiltersForHits) {
TestGetTickerCount(options, BLOCK_CACHE_ADD)); TestGetTickerCount(options, BLOCK_CACHE_ADD));
} }
int CountIter(std::unique_ptr<Iterator>& iter, const Slice& key) {
int count = 0;
for (iter->Seek(key); iter->Valid() && iter->status() == Status::OK();
iter->Next()) {
count++;
}
return count;
}
// use iterate_upper_bound to hint compatiability of existing bloom filters.
// The BF is considered compatible if 1) upper bound and seek key transform
// into the same string, or 2) the transformed seek key is of the same length
// as the upper bound and two keys are adjacent according to the comparator.
TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) {
int iteration = 0;
for (bool use_block_based_builder : {true, false}) {
Options options;
options.create_if_missing = true;
options.prefix_extractor.reset(NewCappedPrefixTransform(4));
options.disable_auto_compactions = true;
options.statistics = CreateDBStatistics();
// Enable prefix bloom for SST files
BlockBasedTableOptions table_options;
table_options.cache_index_and_filter_blocks = true;
table_options.filter_policy.reset(
NewBloomFilterPolicy(10, use_block_based_builder));
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
DestroyAndReopen(options);
ASSERT_OK(Put("abcdxxx0", "val1"));
ASSERT_OK(Put("abcdxxx1", "val2"));
ASSERT_OK(Put("abcdxxx2", "val3"));
ASSERT_OK(Put("abcdxxx3", "val4"));
dbfull()->Flush(FlushOptions());
{
// prefix_extractor has not changed, BF will always be read
Slice upper_bound("abce");
ReadOptions read_options;
read_options.prefix_same_as_start = true;
read_options.iterate_upper_bound = &upper_bound;
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
ASSERT_EQ(CountIter(iter, "abcd0000"), 4);
}
{
Slice upper_bound("abcdzzzz");
ReadOptions read_options;
read_options.prefix_same_as_start = true;
read_options.iterate_upper_bound = &upper_bound;
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
ASSERT_EQ(CountIter(iter, "abcd0000"), 4);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 2);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
}
ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:5"}}));
ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(),
"rocksdb.FixedPrefix.5"));
{
// BF changed, [abcdxx00, abce) is a valid bound, will trigger BF read
Slice upper_bound("abce");
ReadOptions read_options;
read_options.prefix_same_as_start = true;
read_options.iterate_upper_bound = &upper_bound;
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
ASSERT_EQ(CountIter(iter, "abcdxx00"), 4);
// should check bloom filter since upper bound meets requirement
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
2 + iteration);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
}
{
// [abcdxx01, abcey) is not valid bound since upper bound is too long for
// the BF in SST (capped:4)
Slice upper_bound("abcey");
ReadOptions read_options;
read_options.prefix_same_as_start = true;
read_options.iterate_upper_bound = &upper_bound;
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
ASSERT_EQ(CountIter(iter, "abcdxx01"), 4);
// should skip bloom filter since upper bound is too long
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
2 + iteration);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
}
{
// [abcdxx02, abcdy) is a valid bound since the prefix is the same
Slice upper_bound("abcdy");
ReadOptions read_options;
read_options.prefix_same_as_start = true;
read_options.iterate_upper_bound = &upper_bound;
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
ASSERT_EQ(CountIter(iter, "abcdxx02"), 4);
// should check bloom filter since upper bound matches transformed seek
// key
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
2 + iteration * 2);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
}
{
// [aaaaaaaa, abce) is not a valid bound since 1) they don't share the
// same prefix, 2) the prefixes are not consecutive
Slice upper_bound("abce");
ReadOptions read_options;
read_options.prefix_same_as_start = true;
read_options.iterate_upper_bound = &upper_bound;
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
ASSERT_EQ(CountIter(iter, "aaaaaaaa"), 0);
// should skip bloom filter since mismatch is found
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
2 + iteration * 2);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
}
ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:3"}}));
{
// [abc, abd) is not a valid bound since the upper bound is too short
// for BF (capped:4)
Slice upper_bound("abd");
ReadOptions read_options;
read_options.prefix_same_as_start = true;
read_options.iterate_upper_bound = &upper_bound;
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
ASSERT_EQ(CountIter(iter, "abc"), 4);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
2 + iteration * 2);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
}
ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:4"}}));
{
// set back to capped:4 and verify BF is always read
Slice upper_bound("abd");
ReadOptions read_options;
read_options.prefix_same_as_start = true;
read_options.iterate_upper_bound = &upper_bound;
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
ASSERT_EQ(CountIter(iter, "abc"), 0);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
3 + iteration * 2);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1);
}
iteration++;
}
}
// Create multiple SST files each with a different prefix_extractor config,
// verify iterators can read all SST files using the latest config.
TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) {
int iteration = 0;
for (bool use_block_based_builder : {true, false}) {
Options options;
options.create_if_missing = true;
options.prefix_extractor.reset(NewFixedPrefixTransform(1));
options.disable_auto_compactions = true;
options.statistics = CreateDBStatistics();
// Enable prefix bloom for SST files
BlockBasedTableOptions table_options;
table_options.filter_policy.reset(
NewBloomFilterPolicy(10, use_block_based_builder));
table_options.cache_index_and_filter_blocks = true;
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
DestroyAndReopen(options);
Slice upper_bound("foz90000");
ReadOptions read_options;
read_options.prefix_same_as_start = true;
// first SST with fixed:1 BF
ASSERT_OK(Put("foo2", "bar2"));
ASSERT_OK(Put("foo", "bar"));
ASSERT_OK(Put("foq1", "bar1"));
ASSERT_OK(Put("fpa", "0"));
dbfull()->Flush(FlushOptions());
std::unique_ptr<Iterator> iter_old(db_->NewIterator(read_options));
ASSERT_EQ(CountIter(iter_old, "foo"), 4);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 1);
ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}}));
ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(),
"rocksdb.CappedPrefix.3"));
read_options.iterate_upper_bound = &upper_bound;
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
ASSERT_EQ(CountIter(iter, "foo"), 2);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
1 + iteration);
ASSERT_EQ(CountIter(iter, "gpk"), 0);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
1 + iteration);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
// second SST with capped:3 BF
ASSERT_OK(Put("foo3", "bar3"));
ASSERT_OK(Put("foo4", "bar4"));
ASSERT_OK(Put("foq5", "bar5"));
ASSERT_OK(Put("fpb", "1"));
dbfull()->Flush(FlushOptions());
{
// BF is cappped:3 now
std::unique_ptr<Iterator> iter_tmp(db_->NewIterator(read_options));
ASSERT_EQ(CountIter(iter_tmp, "foo"), 4);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
2 + iteration * 2);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
ASSERT_EQ(CountIter(iter_tmp, "gpk"), 0);
// both counters are incremented because BF is "not changed" for 1 of the
// 2 SST files, so filter is checked once and found no match.
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
3 + iteration * 2);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1);
}
ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:2"}}));
ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(),
"rocksdb.FixedPrefix.2"));
// third SST with fixed:2 BF
ASSERT_OK(Put("foo6", "bar6"));
ASSERT_OK(Put("foo7", "bar7"));
ASSERT_OK(Put("foq8", "bar8"));
ASSERT_OK(Put("fpc", "2"));
dbfull()->Flush(FlushOptions());
{
// BF is fixed:2 now
std::unique_ptr<Iterator> iter_tmp(db_->NewIterator(read_options));
ASSERT_EQ(CountIter(iter_tmp, "foo"), 9);
// the first and last BF are checked
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
4 + iteration * 3);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1);
ASSERT_EQ(CountIter(iter_tmp, "gpk"), 0);
// only last BF is checked and not found
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
5 + iteration * 3);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2);
}
// iter_old can only see the first SST, so checked plus 1
ASSERT_EQ(CountIter(iter_old, "foo"), 4);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
6 + iteration * 3);
// iter was created after the first setoptions call so only full filter
// will check the filter
ASSERT_EQ(CountIter(iter, "foo"), 2);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
6 + iteration * 4);
{
// keys in all three SSTs are visible to iterator
// The range of [foo, foz90000] is compatible with (fixed:1) and (fixed:2)
// so +2 for checked counter
std::unique_ptr<Iterator> iter_all(db_->NewIterator(read_options));
ASSERT_EQ(CountIter(iter_all, "foo"), 9);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
7 + iteration * 5);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2);
ASSERT_EQ(CountIter(iter_all, "gpk"), 0);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
8 + iteration * 5);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3);
}
ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}}));
ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(),
"rocksdb.CappedPrefix.3"));
{
std::unique_ptr<Iterator> iter_all(db_->NewIterator(read_options));
ASSERT_EQ(CountIter(iter_all, "foo"), 6);
// all three SST are checked because the current options has the same as
// the remaining SST (capped:3)
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
9 + iteration * 7);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3);
ASSERT_EQ(CountIter(iter_all, "gpk"), 0);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
10 + iteration * 7);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 4);
}
// TODO(Zhongyi): Maybe also need to add Get calls to test point look up?
iteration++;
}
}
// Create a new column family in a running DB, change prefix_extractor
// dynamically, verify the iterator created on the new column family behaves
// as expected
TEST_F(DBBloomFilterTest, DynamicBloomFilterNewColumnFamily) {
int iteration = 0;
for (bool use_block_based_builder : {true, false}) {
Options options = CurrentOptions();
options.create_if_missing = true;
options.prefix_extractor.reset(NewFixedPrefixTransform(1));
options.disable_auto_compactions = true;
options.statistics = CreateDBStatistics();
// Enable prefix bloom for SST files
BlockBasedTableOptions table_options;
table_options.cache_index_and_filter_blocks = true;
table_options.filter_policy.reset(
NewBloomFilterPolicy(10, use_block_based_builder));
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
CreateAndReopenWithCF({"pikachu" + std::to_string(iteration)}, options);
ReadOptions read_options;
read_options.prefix_same_as_start = true;
// create a new CF and set prefix_extractor dynamically
options.prefix_extractor.reset(NewCappedPrefixTransform(3));
CreateColumnFamilies({"ramen_dojo_" + std::to_string(iteration)}, options);
ASSERT_EQ(0,
strcmp(dbfull()->GetOptions(handles_[2]).prefix_extractor->Name(),
"rocksdb.CappedPrefix.3"));
ASSERT_OK(Put(2, "foo3", "bar3"));
ASSERT_OK(Put(2, "foo4", "bar4"));
ASSERT_OK(Put(2, "foo5", "bar5"));
ASSERT_OK(Put(2, "foq6", "bar6"));
ASSERT_OK(Put(2, "fpq7", "bar7"));
dbfull()->Flush(FlushOptions());
{
std::unique_ptr<Iterator> iter(
db_->NewIterator(read_options, handles_[2]));
ASSERT_EQ(CountIter(iter, "foo"), 3);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 0);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
}
ASSERT_OK(
dbfull()->SetOptions(handles_[2], {{"prefix_extractor", "fixed:2"}}));
ASSERT_EQ(0,
strcmp(dbfull()->GetOptions(handles_[2]).prefix_extractor->Name(),
"rocksdb.FixedPrefix.2"));
{
std::unique_ptr<Iterator> iter(
db_->NewIterator(read_options, handles_[2]));
ASSERT_EQ(CountIter(iter, "foo"), 4);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 0);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
}
ASSERT_OK(dbfull()->DropColumnFamily(handles_[2]));
dbfull()->DestroyColumnFamilyHandle(handles_[2]);
handles_[2] = nullptr;
ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
dbfull()->DestroyColumnFamilyHandle(handles_[1]);
handles_[1] = nullptr;
iteration++;
}
}
// Verify it's possible to change prefix_extractor at runtime and iterators
// behaves as expected
TEST_F(DBBloomFilterTest, DynamicBloomFilterOptions) {
int iteration = 0;
for (bool use_block_based_builder : {true, false}) {
Options options;
options.create_if_missing = true;
options.prefix_extractor.reset(NewFixedPrefixTransform(1));
options.disable_auto_compactions = true;
options.statistics = CreateDBStatistics();
// Enable prefix bloom for SST files
BlockBasedTableOptions table_options;
table_options.cache_index_and_filter_blocks = true;
table_options.filter_policy.reset(
NewBloomFilterPolicy(10, use_block_based_builder));
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
DestroyAndReopen(options);
ASSERT_OK(Put("foo2", "bar2"));
ASSERT_OK(Put("foo", "bar"));
ASSERT_OK(Put("foo1", "bar1"));
ASSERT_OK(Put("fpa", "0"));
dbfull()->Flush(FlushOptions());
ASSERT_OK(Put("foo3", "bar3"));
ASSERT_OK(Put("foo4", "bar4"));
ASSERT_OK(Put("foo5", "bar5"));
ASSERT_OK(Put("fpb", "1"));
dbfull()->Flush(FlushOptions());
ASSERT_OK(Put("foo6", "bar6"));
ASSERT_OK(Put("foo7", "bar7"));
ASSERT_OK(Put("foo8", "bar8"));
ASSERT_OK(Put("fpc", "2"));
dbfull()->Flush(FlushOptions());
ReadOptions read_options;
read_options.prefix_same_as_start = true;
{
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
ASSERT_EQ(CountIter(iter, "foo"), 12);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 3);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
}
std::unique_ptr<Iterator> iter_old(db_->NewIterator(read_options));
ASSERT_EQ(CountIter(iter_old, "foo"), 12);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 6);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}}));
ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(),
"rocksdb.CappedPrefix.3"));
{
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
// "fp*" should be skipped
ASSERT_EQ(CountIter(iter, "foo"), 9);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 6);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
}
// iterator created before should not be affected and see all keys
ASSERT_EQ(CountIter(iter_old, "foo"), 12);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 9);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
ASSERT_EQ(CountIter(iter_old, "abc"), 0);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 12);
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3);
iteration++;
}
}
#endif // ROCKSDB_LITE #endif // ROCKSDB_LITE
} // namespace rocksdb } // namespace rocksdb

@ -4607,180 +4607,6 @@ TEST_F(DBTest, FileCreationRandomFailure) {
} }
#ifndef ROCKSDB_LITE #ifndef ROCKSDB_LITE
int CountIter(Iterator* iter, const Slice& key) {
int count = 0;
for (iter->Seek(key); iter->Valid() && iter->status() == Status::OK();
iter->Next()) {
count++;
}
return count;
}
// Create multiple SST files each with a different prefix_extractor config,
// verify iterators can read all SST files using the latest config.
TEST_F(DBTest, DynamicBloomFilterMultipleSST) {
Options options;
options.create_if_missing = true;
options.prefix_extractor.reset(NewFixedPrefixTransform(1));
options.disable_auto_compactions = true;
// Enable prefix bloom for SST files
BlockBasedTableOptions table_options;
table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
DestroyAndReopen(options);
ReadOptions read_options;
read_options.prefix_same_as_start = true;
// first SST with fixed:1 BF
ASSERT_OK(Put("foo2", "bar2"));
ASSERT_OK(Put("foo", "bar"));
ASSERT_OK(Put("foq1", "bar1"));
ASSERT_OK(Put("fpa", "0"));
dbfull()->Flush(FlushOptions());
Iterator* iter_old = db_->NewIterator(read_options);
ASSERT_EQ(CountIter(iter_old, "foo"), 4);
ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}}));
ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(),
"rocksdb.CappedPrefix.3"));
Iterator* iter = db_->NewIterator(read_options);
ASSERT_EQ(CountIter(iter, "foo"), 2);
// second SST with capped:3 BF
ASSERT_OK(Put("foo3", "bar3"));
ASSERT_OK(Put("foo4", "bar4"));
ASSERT_OK(Put("foq5", "bar5"));
ASSERT_OK(Put("fpb", "1"));
dbfull()->Flush(FlushOptions());
// BF is cappped:3 now
Iterator* iter_tmp = db_->NewIterator(read_options);
ASSERT_EQ(CountIter(iter_tmp, "foo"), 4);
delete iter_tmp;
ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:2"}}));
ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(),
"rocksdb.FixedPrefix.2"));
// third SST with fixed:2 BF
ASSERT_OK(Put("foo6", "bar6"));
ASSERT_OK(Put("foo7", "bar7"));
ASSERT_OK(Put("foq8", "bar8"));
ASSERT_OK(Put("fpc", "2"));
dbfull()->Flush(FlushOptions());
// BF is fixed:2 now
iter_tmp = db_->NewIterator(read_options);
ASSERT_EQ(CountIter(iter_tmp, "foo"), 9);
delete iter_tmp;
// TODO(Zhongyi): verify existing iterator cannot see newly inserted keys
ASSERT_EQ(CountIter(iter_old, "foo"), 4);
ASSERT_EQ(CountIter(iter, "foo"), 2);
delete iter;
delete iter_old;
// keys in all three SSTs are visible to iterator
Iterator* iter_all = db_->NewIterator(read_options);
ASSERT_EQ(CountIter(iter_all, "foo"), 9);
delete iter_all;
ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}}));
ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(),
"rocksdb.CappedPrefix.3"));
iter_all = db_->NewIterator(read_options);
ASSERT_EQ(CountIter(iter_all, "foo"), 6);
delete iter_all;
// TODO(Zhongyi): add test for cases where certain SST are skipped
// Also verify BF related counters like BLOOM_FILTER_USEFUL
}
// Create a new column family in a running DB, change prefix_extractor
// dynamically, verify the iterator created on the new column family behaves
// as expected
TEST_F(DBTest, DynamicBloomFilterNewColumnFamily) {
Options options = CurrentOptions();
options.create_if_missing = true;
options.prefix_extractor.reset(NewFixedPrefixTransform(1));
options.disable_auto_compactions = true;
// Enable prefix bloom for SST files
BlockBasedTableOptions table_options;
table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
CreateAndReopenWithCF({"pikachu"}, options);
ReadOptions read_options;
read_options.prefix_same_as_start = true;
// create a new CF and set prefix_extractor dynamically
options.prefix_extractor.reset(NewCappedPrefixTransform(3));
CreateColumnFamilies({"ramen_dojo"}, options);
ASSERT_EQ(0,
strcmp(dbfull()->GetOptions(handles_[2]).prefix_extractor->Name(),
"rocksdb.CappedPrefix.3"));
ASSERT_OK(Put(2, "foo3", "bar3"));
ASSERT_OK(Put(2, "foo4", "bar4"));
ASSERT_OK(Put(2, "foo5", "bar5"));
ASSERT_OK(Put(2, "foq6", "bar6"));
ASSERT_OK(Put(2, "fpq7", "bar7"));
dbfull()->Flush(FlushOptions());
Iterator* iter = db_->NewIterator(read_options, handles_[2]);
ASSERT_EQ(CountIter(iter, "foo"), 3);
delete iter;
ASSERT_OK(
dbfull()->SetOptions(handles_[2], {{"prefix_extractor", "fixed:2"}}));
ASSERT_EQ(0,
strcmp(dbfull()->GetOptions(handles_[2]).prefix_extractor->Name(),
"rocksdb.FixedPrefix.2"));
iter = db_->NewIterator(read_options, handles_[2]);
ASSERT_EQ(CountIter(iter, "foo"), 4);
delete iter;
}
// Verify it's possible to change prefix_extractor at runtime and iterators
// behaves as expected
TEST_F(DBTest, DynamicBloomFilterOptions) {
Options options;
options.create_if_missing = true;
options.prefix_extractor.reset(NewFixedPrefixTransform(1));
options.disable_auto_compactions = true;
// Enable prefix bloom for SST files
BlockBasedTableOptions table_options;
table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
DestroyAndReopen(options);
ASSERT_OK(Put("foo2", "bar2"));
ASSERT_OK(Put("foo", "bar"));
ASSERT_OK(Put("foo1", "bar1"));
ASSERT_OK(Put("fpa", "0"));
dbfull()->Flush(FlushOptions());
ASSERT_OK(Put("foo3", "bar3"));
ASSERT_OK(Put("foo4", "bar4"));
ASSERT_OK(Put("foo5", "bar5"));
ASSERT_OK(Put("fpb", "1"));
dbfull()->Flush(FlushOptions());
ASSERT_OK(Put("foo6", "bar6"));
ASSERT_OK(Put("foo7", "bar7"));
ASSERT_OK(Put("foo8", "bar8"));
ASSERT_OK(Put("fpc", "2"));
dbfull()->Flush(FlushOptions());
ReadOptions read_options;
read_options.prefix_same_as_start = true;
Iterator* iter = db_->NewIterator(read_options);
ASSERT_EQ(CountIter(iter, "foo"), 12);
delete iter;
Iterator* iter_old = db_->NewIterator(read_options);
ASSERT_EQ(CountIter(iter_old, "foo"), 12);
ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}}));
ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(),
"rocksdb.CappedPrefix.3"));
iter = db_->NewIterator(read_options);
// "fp*" should be skipped
ASSERT_EQ(CountIter(iter, "foo"), 9);
delete iter;
// iterator created before should not be affected and see all keys
ASSERT_EQ(CountIter(iter_old, "foo"), 12);
delete iter_old;
}
TEST_F(DBTest, DynamicMiscOptions) { TEST_F(DBTest, DynamicMiscOptions) {
// Test max_sequential_skip_in_iterations // Test max_sequential_skip_in_iterations

@ -212,6 +212,10 @@ class SamePrefixTransform : public SliceTransform {
virtual bool InRange(const Slice& dst) const override { virtual bool InRange(const Slice& dst) const override {
return dst == prefix_; return dst == prefix_;
} }
virtual bool FullLengthEnabled(size_t* /*len*/) const override {
return false;
}
}; };
} // namespace } // namespace

@ -68,6 +68,12 @@ class Comparator {
// if it is a wrapped comparator, may return the root one. // if it is a wrapped comparator, may return the root one.
// return itself it is not wrapped. // return itself it is not wrapped.
virtual const Comparator* GetRootComparator() const { return this; } virtual const Comparator* GetRootComparator() const { return this; }
// given two keys, determine if t is the successor of s
virtual bool IsSameLengthImmediateSuccessor(const Slice& /*s*/,
const Slice& /*t*/) const {
return false;
}
}; };
// Return a builtin comparator that uses lexicographic byte-wise // Return a builtin comparator that uses lexicographic byte-wise

@ -1015,7 +1015,7 @@ struct ReadOptions {
// can returns entries. Once the bound is reached, Valid() will be false. // can returns entries. Once the bound is reached, Valid() will be false.
// "iterate_upper_bound" is exclusive ie the bound value is // "iterate_upper_bound" is exclusive ie the bound value is
// not a valid entry. If iterator_extractor is not null, the Seek target // not a valid entry. If iterator_extractor is not null, the Seek target
// and iterator_upper_bound need to have the same prefix. // and iterate_upper_bound need to have the same prefix.
// This is because ordering is not guaranteed outside of prefix domain. // This is because ordering is not guaranteed outside of prefix domain.
// //
// Default: nullptr // Default: nullptr

@ -60,6 +60,11 @@ class SliceTransform {
// This is currently not used and remains here for backward compatibility. // This is currently not used and remains here for backward compatibility.
virtual bool InRange(const Slice& /*dst*/) const { return false; } virtual bool InRange(const Slice& /*dst*/) const { return false; }
// Some SliceTransform will have a full length which can be used to
// determine if two keys are consecuitive. Can be disabled by always
// returning 0
virtual bool FullLengthEnabled(size_t* /*len*/) const { return false; }
// Transform(s)=Transform(`prefix`) for any s with `prefix` as a prefix. // Transform(s)=Transform(`prefix`) for any s with `prefix` as a prefix.
// //
// This function is not used by RocksDB, but for users. If users pass // This function is not used by RocksDB, but for users. If users pass

@ -379,6 +379,7 @@ bool ParseStructOptions(
} }
return true; return true;
} }
} // anonymouse namespace
bool ParseSliceTransformHelper( bool ParseSliceTransformHelper(
const std::string& kFixedPrefixName, const std::string& kCappedPrefixName, const std::string& kFixedPrefixName, const std::string& kCappedPrefixName,
@ -434,7 +435,6 @@ bool ParseSliceTransform(
// SliceTransforms here. // SliceTransforms here.
return false; return false;
} }
} // anonymouse namespace
bool ParseOptionHelper(char* opt_address, const OptionType& opt_type, bool ParseOptionHelper(char* opt_address, const OptionType& opt_type,
const std::string& value) { const std::string& value) {

@ -130,6 +130,10 @@ Status GetColumnFamilyOptionsFromMapInternal(
std::vector<std::string>* unsupported_options_names = nullptr, std::vector<std::string>* unsupported_options_names = nullptr,
bool ignore_unknown_options = false); bool ignore_unknown_options = false);
bool ParseSliceTransform(
const std::string& value,
std::shared_ptr<const SliceTransform>* slice_transform);
extern Status StringToMap( extern Status StringToMap(
const std::string& opts_str, const std::string& opts_str,
std::unordered_map<std::string, std::string>* opts_map); std::unordered_map<std::string, std::string>* opts_map);

@ -249,8 +249,8 @@ class PartitionIndexReader : public IndexReader, public Cleanable {
index_block_->NewIterator(icomparator_, index_block_->NewIterator(icomparator_,
icomparator_->user_comparator(), nullptr, icomparator_->user_comparator(), nullptr,
true, nullptr, index_key_includes_seq_), true, nullptr, index_key_includes_seq_),
false, false, true, /* prefix_extractor */ nullptr, kIsIndex,
/* prefix_extractor */ nullptr, kIsIndex, index_key_includes_seq_); index_key_includes_seq_);
} }
// TODO(myabandeh): Update TwoLevelIterator to be able to make use of // TODO(myabandeh): Update TwoLevelIterator to be able to make use of
// on-stack BlockIter while the state is on heap. Currentlly it assumes // on-stack BlockIter while the state is on heap. Currentlly it assumes
@ -824,6 +824,12 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
ROCKS_LOG_ERROR(rep->ioptions.info_log, ROCKS_LOG_ERROR(rep->ioptions.info_log,
"Cannot find Properties block from file."); "Cannot find Properties block from file.");
} }
#ifndef ROCKSDB_LITE
if (rep->table_properties) {
ParseSliceTransform(rep->table_properties->prefix_extractor_name,
&(rep->table_prefix_extractor));
}
#endif // ROCKSDB_LITE
// Read the compression dictionary meta block // Read the compression dictionary meta block
bool found_compression_dict; bool found_compression_dict;
@ -898,6 +904,9 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
rep->ioptions.info_log); rep->ioptions.info_log);
} }
bool need_upper_bound_check =
PrefixExtractorChanged(rep->table_properties.get(), prefix_extractor);
// prefetch both index and filters, down to all partitions // prefetch both index and filters, down to all partitions
const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0; const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0;
BlockBasedTableOptions::IndexType index_type = new_table->UpdateIndexType(); BlockBasedTableOptions::IndexType index_type = new_table->UpdateIndexType();
@ -932,14 +941,12 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
// Hack: Call NewIndexIterator() to implicitly add index to the // Hack: Call NewIndexIterator() to implicitly add index to the
// block_cache // block_cache
CachableEntry<IndexReader> index_entry; CachableEntry<IndexReader> index_entry;
bool prefix_extractor_changed = false;
// check prefix_extractor match only if hash based index is used // check prefix_extractor match only if hash based index is used
if (rep->index_type == BlockBasedTableOptions::kHashSearch) { bool disable_prefix_seek =
prefix_extractor_changed = PrefixExtractorChanged( rep->index_type == BlockBasedTableOptions::kHashSearch &&
rep->table_properties.get(), prefix_extractor); need_upper_bound_check;
}
unique_ptr<InternalIterator> iter(new_table->NewIndexIterator( unique_ptr<InternalIterator> iter(new_table->NewIndexIterator(
ReadOptions(), prefix_extractor_changed, nullptr, &index_entry)); ReadOptions(), disable_prefix_seek, nullptr, &index_entry));
s = iter->status(); s = iter->status();
if (s.ok()) { if (s.ok()) {
// This is the first call to NewIndexIterator() since we're in Open(). // This is the first call to NewIndexIterator() since we're in Open().
@ -958,9 +965,11 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
} }
if (s.ok() && prefetch_filter) { if (s.ok() && prefetch_filter) {
// Hack: Call GetFilter() to implicitly add filter to the block_cache // Hack: Call GetFilter() to implicitly add filter to the block_cache
auto filter_entry = new_table->GetFilter(prefix_extractor); auto filter_entry =
new_table->GetFilter(rep->table_prefix_extractor.get());
if (filter_entry.value != nullptr && prefetch_all) { if (filter_entry.value != nullptr && prefetch_all) {
filter_entry.value->CacheDependencies(pin_all, prefix_extractor); filter_entry.value->CacheDependencies(
pin_all, rep->table_prefix_extractor.get());
} }
// if pin_filter is true then save it in rep_->filter_entry; it will be // if pin_filter is true then save it in rep_->filter_entry; it will be
// released in the destructor only, hence it will be pinned in the // released in the destructor only, hence it will be pinned in the
@ -990,14 +999,14 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
// Set filter block // Set filter block
if (rep->filter_policy) { if (rep->filter_policy) {
const bool is_a_filter_partition = true; const bool is_a_filter_partition = true;
auto filter = auto filter = new_table->ReadFilter(
new_table->ReadFilter(prefetch_buffer.get(), rep->filter_handle, prefetch_buffer.get(), rep->filter_handle, !is_a_filter_partition,
!is_a_filter_partition, prefix_extractor); rep->table_prefix_extractor.get());
rep->filter.reset(filter); rep->filter.reset(filter);
// Refer to the comment above about paritioned indexes always being // Refer to the comment above about paritioned indexes always being
// cached // cached
if (filter && (prefetch_index_and_filter_in_cache || level == 0)) { if (filter && (prefetch_index_and_filter_in_cache || level == 0)) {
filter->CacheDependencies(pin_all, prefix_extractor); filter->CacheDependencies(pin_all, rep->table_prefix_extractor.get());
} }
} }
} else { } else {
@ -1784,20 +1793,28 @@ BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator(
// Otherwise, this method guarantees no I/O will be incurred. // Otherwise, this method guarantees no I/O will be incurred.
// //
// REQUIRES: this method shouldn't be called while the DB lock is held. // REQUIRES: this method shouldn't be called while the DB lock is held.
bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key, bool BlockBasedTable::PrefixMayMatch(
const SliceTransform* prefix_extractor) { const Slice& internal_key, const ReadOptions& read_options,
const SliceTransform* options_prefix_extractor,
const bool need_upper_bound_check) {
if (!rep_->filter_policy) { if (!rep_->filter_policy) {
return true; return true;
} }
assert(prefix_extractor != nullptr); const SliceTransform* prefix_extractor;
if (rep_->table_prefix_extractor == nullptr) {
if (need_upper_bound_check) {
return true;
}
prefix_extractor = options_prefix_extractor;
} else {
prefix_extractor = rep_->table_prefix_extractor.get();
}
auto user_key = ExtractUserKey(internal_key); auto user_key = ExtractUserKey(internal_key);
if (!prefix_extractor->InDomain(user_key)) { if (!prefix_extractor->InDomain(user_key)) {
return true; return true;
} }
assert(rep_->table_properties->prefix_extractor_name.compare(
prefix_extractor->Name()) == 0);
auto prefix = prefix_extractor->Transform(user_key);
bool may_match = true; bool may_match = true;
Status s; Status s;
@ -1805,12 +1822,23 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key,
// First, try check with full filter // First, try check with full filter
auto filter_entry = GetFilter(prefix_extractor); auto filter_entry = GetFilter(prefix_extractor);
FilterBlockReader* filter = filter_entry.value; FilterBlockReader* filter = filter_entry.value;
bool filter_checked = true;
if (filter != nullptr) { if (filter != nullptr) {
if (!filter->IsBlockBased()) { if (!filter->IsBlockBased()) {
const Slice* const const_ikey_ptr = &internal_key; const Slice* const const_ikey_ptr = &internal_key;
may_match = filter->PrefixMayMatch(prefix, prefix_extractor, kNotValid, may_match = filter->RangeMayExist(
false, const_ikey_ptr); read_options.iterate_upper_bound, user_key, prefix_extractor,
rep_->internal_comparator.user_comparator(), const_ikey_ptr,
&filter_checked, need_upper_bound_check);
} else { } else {
// if prefix_extractor changed for block based filter, skip filter
if (need_upper_bound_check) {
if (!rep_->filter_entry.IsSet()) {
filter_entry.Release(rep_->table_options.block_cache.get());
}
return true;
}
auto prefix = prefix_extractor->Transform(user_key);
InternalKey internal_key_prefix(prefix, kMaxSequenceNumber, kTypeValue); InternalKey internal_key_prefix(prefix, kMaxSequenceNumber, kTypeValue);
auto internal_prefix = internal_key_prefix.Encode(); auto internal_prefix = internal_key_prefix.Encode();
@ -1823,9 +1851,9 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key,
// Then, try find it within each block // Then, try find it within each block
// we already know prefix_extractor and prefix_extractor_name must match // we already know prefix_extractor and prefix_extractor_name must match
// because `CheckPrefixMayMatch` first checks `check_filter_ == true` // because `CheckPrefixMayMatch` first checks `check_filter_ == true`
bool prefix_extractor_changed = false;
unique_ptr<InternalIterator> iiter( unique_ptr<InternalIterator> iiter(
NewIndexIterator(no_io_read_options, prefix_extractor_changed)); NewIndexIterator(no_io_read_options,
/* need_upper_bound_check */ false));
iiter->Seek(internal_prefix); iiter->Seek(internal_prefix);
if (!iiter->Valid()) { if (!iiter->Valid()) {
@ -1866,10 +1894,12 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key,
} }
} }
Statistics* statistics = rep_->ioptions.statistics; if (filter_checked) {
RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED); Statistics* statistics = rep_->ioptions.statistics;
if (!may_match) { RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED);
RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL); if (!may_match) {
RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL);
}
} }
// if rep_->filter_entry is not set, we should call Release(); otherwise // if rep_->filter_entry is not set, we should call Release(); otherwise
@ -1878,12 +1908,11 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key,
if (!rep_->filter_entry.IsSet()) { if (!rep_->filter_entry.IsSet()) {
filter_entry.Release(rep_->table_options.block_cache.get()); filter_entry.Release(rep_->table_options.block_cache.get());
} }
return may_match; return may_match;
} }
void BlockBasedTableIterator::Seek(const Slice& target) { void BlockBasedTableIterator::Seek(const Slice& target) {
if (!CheckPrefixMayMatch(target, prefix_extractor_)) { if (!CheckPrefixMayMatch(target)) {
ResetDataIter(); ResetDataIter();
return; return;
} }
@ -1911,7 +1940,7 @@ void BlockBasedTableIterator::Seek(const Slice& target) {
} }
void BlockBasedTableIterator::SeekForPrev(const Slice& target) { void BlockBasedTableIterator::SeekForPrev(const Slice& target) {
if (!CheckPrefixMayMatch(target, prefix_extractor_)) { if (!CheckPrefixMayMatch(target)) {
ResetDataIter(); ResetDataIter();
return; return;
} }
@ -2101,7 +2130,7 @@ void BlockBasedTableIterator::FindKeyBackward() {
InternalIterator* BlockBasedTable::NewIterator( InternalIterator* BlockBasedTable::NewIterator(
const ReadOptions& read_options, const SliceTransform* prefix_extractor, const ReadOptions& read_options, const SliceTransform* prefix_extractor,
Arena* arena, bool skip_filters, bool for_compaction) { Arena* arena, bool skip_filters, bool for_compaction) {
bool prefix_extractor_changed = bool need_upper_bound_check =
PrefixExtractorChanged(rep_->table_properties.get(), prefix_extractor); PrefixExtractorChanged(rep_->table_properties.get(), prefix_extractor);
const bool kIsNotIndex = false; const bool kIsNotIndex = false;
if (arena == nullptr) { if (arena == nullptr) {
@ -2109,21 +2138,21 @@ InternalIterator* BlockBasedTable::NewIterator(
this, read_options, rep_->internal_comparator, this, read_options, rep_->internal_comparator,
NewIndexIterator( NewIndexIterator(
read_options, read_options,
prefix_extractor_changed && need_upper_bound_check &&
rep_->index_type == BlockBasedTableOptions::kHashSearch), rep_->index_type == BlockBasedTableOptions::kHashSearch),
!skip_filters && !read_options.total_order_seek && !skip_filters && !read_options.total_order_seek &&
prefix_extractor != nullptr && !prefix_extractor_changed, prefix_extractor != nullptr,
prefix_extractor, kIsNotIndex, true /*key_includes_seq*/, need_upper_bound_check, prefix_extractor, kIsNotIndex,
for_compaction); true /*key_includes_seq*/, for_compaction);
} else { } else {
auto* mem = arena->AllocateAligned(sizeof(BlockBasedTableIterator)); auto* mem = arena->AllocateAligned(sizeof(BlockBasedTableIterator));
return new (mem) BlockBasedTableIterator( return new (mem) BlockBasedTableIterator(
this, read_options, rep_->internal_comparator, this, read_options, rep_->internal_comparator,
NewIndexIterator(read_options, prefix_extractor_changed), NewIndexIterator(read_options, need_upper_bound_check),
!skip_filters && !read_options.total_order_seek && !skip_filters && !read_options.total_order_seek &&
prefix_extractor != nullptr && !prefix_extractor_changed, prefix_extractor != nullptr,
prefix_extractor, kIsNotIndex, true /*key_includes_seq*/, need_upper_bound_check, prefix_extractor, kIsNotIndex,
for_compaction); true /*key_includes_seq*/, for_compaction);
} }
} }
@ -2210,14 +2239,14 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
BlockIter iiter_on_stack; BlockIter iiter_on_stack;
// if prefix_extractor found in block differs from options, disable // if prefix_extractor found in block differs from options, disable
// BlockPrefixIndex. Only do this check when index_type is kHashSearch. // BlockPrefixIndex. Only do this check when index_type is kHashSearch.
bool prefix_extractor_changed = false; bool need_upper_bound_check = false;
if (rep_->index_type == BlockBasedTableOptions::kHashSearch) { if (rep_->index_type == BlockBasedTableOptions::kHashSearch) {
prefix_extractor_changed = PrefixExtractorChanged( need_upper_bound_check = PrefixExtractorChanged(
rep_->table_properties.get(), prefix_extractor); rep_->table_properties.get(), prefix_extractor);
} }
auto iiter = NewIndexIterator(read_options, prefix_extractor_changed, auto iiter =
&iiter_on_stack, /* index_entry */ nullptr, NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack,
get_context); /* index_entry */ nullptr, get_context);
std::unique_ptr<InternalIterator> iiter_unique_ptr; std::unique_ptr<InternalIterator> iiter_unique_ptr;
if (iiter != &iiter_on_stack) { if (iiter != &iiter_on_stack) {
iiter_unique_ptr.reset(iiter); iiter_unique_ptr.reset(iiter);
@ -2479,7 +2508,7 @@ Status BlockBasedTable::CreateIndexReader(
// kHashSearch requires non-empty prefix_extractor but bypass checking // kHashSearch requires non-empty prefix_extractor but bypass checking
// prefix_extractor here since we have no access to MutableCFOptions. // prefix_extractor here since we have no access to MutableCFOptions.
// Add prefix_extractor_changed flag in BlockBasedTable::NewIndexIterator. // Add need_upper_bound_check flag in BlockBasedTable::NewIndexIterator.
// If prefix_extractor does not match prefix_extractor_name from table // If prefix_extractor does not match prefix_extractor_name from table
// properties, turn off Hash Index by setting total_order_seek to true // properties, turn off Hash Index by setting total_order_seek to true

@ -95,7 +95,9 @@ class BlockBasedTable : public TableReader {
bool skip_filters = false, int level = -1); bool skip_filters = false, int level = -1);
bool PrefixMayMatch(const Slice& internal_key, bool PrefixMayMatch(const Slice& internal_key,
const SliceTransform* prefix_extractor = nullptr); const ReadOptions& read_options,
const SliceTransform* options_prefix_extractor,
const bool need_upper_bound_check);
// Returns a new iterator over the table contents. // Returns a new iterator over the table contents.
// The result of NewIterator() is initially invalid (caller must // The result of NewIterator() is initially invalid (caller must
@ -277,7 +279,7 @@ class BlockBasedTable : public TableReader {
// 3. We disallowed any io to be performed, that is, read_options == // 3. We disallowed any io to be performed, that is, read_options ==
// kBlockCacheTier // kBlockCacheTier
InternalIterator* NewIndexIterator( InternalIterator* NewIndexIterator(
const ReadOptions& read_options, bool prefix_extractor_changed = false, const ReadOptions& read_options, bool need_upper_bound_check = false,
BlockIter* input_iter = nullptr, BlockIter* input_iter = nullptr,
CachableEntry<IndexReader>* index_entry = nullptr, CachableEntry<IndexReader>* index_entry = nullptr,
GetContext* get_context = nullptr); GetContext* get_context = nullptr);
@ -481,6 +483,7 @@ struct BlockBasedTable::Rep {
// and compatible with existing code, we introduce a wrapper that allows // and compatible with existing code, we introduce a wrapper that allows
// block to extract prefix without knowing if a key is internal or not. // block to extract prefix without knowing if a key is internal or not.
unique_ptr<SliceTransform> internal_prefix_transform; unique_ptr<SliceTransform> internal_prefix_transform;
std::shared_ptr<const SliceTransform> table_prefix_extractor;
// only used in level 0 files when pin_l0_filter_and_index_blocks_in_cache is // only used in level 0 files when pin_l0_filter_and_index_blocks_in_cache is
// true or in all levels when pin_top_level_index_and_filter is set in // true or in all levels when pin_top_level_index_and_filter is set in
@ -515,6 +518,7 @@ class BlockBasedTableIterator : public InternalIterator {
const ReadOptions& read_options, const ReadOptions& read_options,
const InternalKeyComparator& icomp, const InternalKeyComparator& icomp,
InternalIterator* index_iter, bool check_filter, InternalIterator* index_iter, bool check_filter,
bool need_upper_bound_check,
const SliceTransform* prefix_extractor, bool is_index, const SliceTransform* prefix_extractor, bool is_index,
bool key_includes_seq = true, bool key_includes_seq = true,
bool for_compaction = false) bool for_compaction = false)
@ -525,10 +529,11 @@ class BlockBasedTableIterator : public InternalIterator {
pinned_iters_mgr_(nullptr), pinned_iters_mgr_(nullptr),
block_iter_points_to_real_block_(false), block_iter_points_to_real_block_(false),
check_filter_(check_filter), check_filter_(check_filter),
need_upper_bound_check_(need_upper_bound_check),
prefix_extractor_(prefix_extractor),
is_index_(is_index), is_index_(is_index),
key_includes_seq_(key_includes_seq), key_includes_seq_(key_includes_seq),
for_compaction_(for_compaction), for_compaction_(for_compaction) {}
prefix_extractor_(prefix_extractor) {}
~BlockBasedTableIterator() { delete index_iter_; } ~BlockBasedTableIterator() { delete index_iter_; }
@ -575,9 +580,10 @@ class BlockBasedTableIterator : public InternalIterator {
block_iter_points_to_real_block_; block_iter_points_to_real_block_;
} }
bool CheckPrefixMayMatch(const Slice& ikey, bool CheckPrefixMayMatch(const Slice& ikey) {
const SliceTransform* prefix_extractor = nullptr) { if (check_filter_ &&
if (check_filter_ && !table_->PrefixMayMatch(ikey, prefix_extractor)) { !table_->PrefixMayMatch(ikey, read_options_, prefix_extractor_,
need_upper_bound_check_)) {
// TODO remember the iterator is invalidated because of prefix // TODO remember the iterator is invalidated because of prefix
// match. This can avoid the upper level file iterator to falsely // match. This can avoid the upper level file iterator to falsely
// believe the position is the end of the SST file and move to // believe the position is the end of the SST file and move to
@ -621,6 +627,9 @@ class BlockBasedTableIterator : public InternalIterator {
bool block_iter_points_to_real_block_; bool block_iter_points_to_real_block_;
bool is_out_of_bound_ = false; bool is_out_of_bound_ = false;
bool check_filter_; bool check_filter_;
// TODO(Zhongyi): pick a better name
bool need_upper_bound_check_;
const SliceTransform* prefix_extractor_;
// If the blocks over which we iterate are index blocks // If the blocks over which we iterate are index blocks
bool is_index_; bool is_index_;
// If the keys in the blocks over which we iterate include 8 byte sequence // If the keys in the blocks over which we iterate include 8 byte sequence
@ -629,7 +638,6 @@ class BlockBasedTableIterator : public InternalIterator {
bool for_compaction_; bool for_compaction_;
// TODO use block offset instead // TODO use block offset instead
std::string prev_index_value_; std::string prev_index_value_;
const SliceTransform* prefix_extractor_;
static const size_t kInitReadaheadSize = 8 * 1024; static const size_t kInitReadaheadSize = 8 * 1024;
// Found that 256 KB readahead size provides the best performance, based on // Found that 256 KB readahead size provides the best performance, based on

@ -123,6 +123,17 @@ class FilterBlockReader {
virtual void CacheDependencies(bool /*pin*/, virtual void CacheDependencies(bool /*pin*/,
const SliceTransform* /*prefix_extractor*/) {} const SliceTransform* /*prefix_extractor*/) {}
virtual bool RangeMayExist(
const Slice* /*iterate_upper_bound*/, const Slice& user_key,
const SliceTransform* prefix_extractor,
const Comparator* /*comparator*/, const Slice* const const_ikey_ptr,
bool* filter_checked, bool /*need_upper_bound_check*/) {
*filter_checked = true;
Slice prefix = prefix_extractor->Transform(user_key);
return PrefixMayMatch(prefix, prefix_extractor, kNotValid, false,
const_ikey_ptr);
}
protected: protected:
bool whole_key_filtering_; bool whole_key_filtering_;

@ -98,6 +98,10 @@ FullFilterBlockReader::FullFilterBlockReader(
contents_(contents) { contents_(contents) {
assert(filter_bits_reader != nullptr); assert(filter_bits_reader != nullptr);
filter_bits_reader_.reset(filter_bits_reader); filter_bits_reader_.reset(filter_bits_reader);
if (prefix_extractor_ != nullptr) {
full_length_enabled_ =
prefix_extractor_->FullLengthEnabled(&prefix_extractor_full_length_);
}
} }
FullFilterBlockReader::FullFilterBlockReader( FullFilterBlockReader::FullFilterBlockReader(
@ -153,4 +157,57 @@ bool FullFilterBlockReader::MayMatch(const Slice& entry) {
size_t FullFilterBlockReader::ApproximateMemoryUsage() const { size_t FullFilterBlockReader::ApproximateMemoryUsage() const {
return contents_.size(); return contents_.size();
} }
bool FullFilterBlockReader::RangeMayExist(const Slice* iterate_upper_bound,
const Slice& user_key, const SliceTransform* prefix_extractor,
const Comparator* comparator, const Slice* const const_ikey_ptr,
bool* filter_checked, bool need_upper_bound_check) {
if (!prefix_extractor || !prefix_extractor->InDomain(user_key)) {
*filter_checked = false;
return true;
}
Slice prefix = prefix_extractor->Transform(user_key);
if (need_upper_bound_check &&
!IsFilterCompatible(iterate_upper_bound, prefix, comparator)) {
*filter_checked = false;
return true;
} else {
*filter_checked = true;
return PrefixMayMatch(prefix, prefix_extractor, kNotValid, false,
const_ikey_ptr);
}
}
bool FullFilterBlockReader::IsFilterCompatible(
const Slice* iterate_upper_bound, const Slice& prefix,
const Comparator* comparator) {
// Try to reuse the bloom filter in the SST table if prefix_extractor in
// mutable_cf_options has changed. If range [user_key, upper_bound) all
// share the same prefix then we may still be able to use the bloom filter.
if (iterate_upper_bound != nullptr && prefix_extractor_) {
if (!prefix_extractor_->InDomain(*iterate_upper_bound)) {
return false;
}
Slice upper_bound_xform =
prefix_extractor_->Transform(*iterate_upper_bound);
// first check if user_key and upper_bound all share the same prefix
if (!comparator->Equal(prefix, upper_bound_xform)) {
// second check if user_key's prefix is the immediate predecessor of
// upper_bound and have the same length. If so, we know for sure all
// keys in the range [user_key, upper_bound) share the same prefix.
// Also need to make sure upper_bound are full length to ensure
// correctness
if (!full_length_enabled_ ||
iterate_upper_bound->size() != prefix_extractor_full_length_ ||
!comparator->IsSameLengthImmediateSuccessor(prefix,
*iterate_upper_bound)) {
return false;
}
}
return true;
} else {
return false;
}
}
} // namespace rocksdb } // namespace rocksdb

@ -108,18 +108,27 @@ class FullFilterBlockReader : public FilterBlockReader {
uint64_t block_offset = kNotValid, const bool no_io = false, uint64_t block_offset = kNotValid, const bool no_io = false,
const Slice* const const_ikey_ptr = nullptr) override; const Slice* const const_ikey_ptr = nullptr) override;
virtual size_t ApproximateMemoryUsage() const override; virtual size_t ApproximateMemoryUsage() const override;
virtual bool RangeMayExist(const Slice* iterate_upper_bound, const Slice& user_key,
const SliceTransform* prefix_extractor,
const Comparator* comparator,
const Slice* const const_ikey_ptr, bool* filter_checked,
bool need_upper_bound_check) override;
private: private:
const SliceTransform* prefix_extractor_; const SliceTransform* prefix_extractor_;
Slice contents_; Slice contents_;
std::unique_ptr<FilterBitsReader> filter_bits_reader_; std::unique_ptr<FilterBitsReader> filter_bits_reader_;
BlockContents block_contents_; BlockContents block_contents_;
std::unique_ptr<const char[]> filter_data_; std::unique_ptr<const char[]> filter_data_;
bool full_length_enabled_;
size_t prefix_extractor_full_length_;
// No copying allowed // No copying allowed
FullFilterBlockReader(const FullFilterBlockReader&); FullFilterBlockReader(const FullFilterBlockReader&);
bool MayMatch(const Slice& entry); bool MayMatch(const Slice& entry);
void operator=(const FullFilterBlockReader&); void operator=(const FullFilterBlockReader&);
bool IsFilterCompatible(const Slice* iterate_upper_bound,
const Slice& prefix, const Comparator* comparator);
}; };
} // namespace rocksdb } // namespace rocksdb

@ -692,6 +692,9 @@ class FixedOrLessPrefixTransform : public SliceTransform {
virtual bool InRange(const Slice& dst) const override { virtual bool InRange(const Slice& dst) const override {
return (dst.size() <= prefix_len_); return (dst.size() <= prefix_len_);
} }
virtual bool FullLengthEnabled(size_t* /*len*/) const override {
return false;
}
}; };
class HarnessTest : public testing::Test { class HarnessTest : public testing::Test {

@ -98,6 +98,32 @@ class BytewiseComparatorImpl : public Comparator {
} }
// *key is a run of 0xffs. Leave it alone. // *key is a run of 0xffs. Leave it alone.
} }
virtual bool IsSameLengthImmediateSuccessor(const Slice& s,
const Slice& t) const override {
if (s.size() != t.size() || s.size() == 0) {
return false;
}
size_t diff_ind = s.difference_offset(t);
// same slice
if (diff_ind >= s.size()) return false;
uint8_t byte_s = static_cast<uint8_t>(s[diff_ind]);
uint8_t byte_t = static_cast<uint8_t>(t[diff_ind]);
// first different byte must be consecutive, and remaining bytes must be
// 0xff for s and 0x00 for t
if (byte_s != uint8_t{0xff} && byte_s + 1 == byte_t) {
for (size_t i = diff_ind + 1; i < s.size(); ++i) {
byte_s = static_cast<uint8_t>(s[i]);
byte_t = static_cast<uint8_t>(t[i]);
if (byte_s != uint8_t{0xff} || byte_t != uint8_t{0x00}) {
return false;
}
}
return true;
} else {
return false;
}
}
}; };
class ReverseBytewiseComparatorImpl : public BytewiseComparatorImpl { class ReverseBytewiseComparatorImpl : public BytewiseComparatorImpl {

@ -47,6 +47,11 @@ class FixedPrefixTransform : public SliceTransform {
return (dst.size() == prefix_len_); return (dst.size() == prefix_len_);
} }
virtual bool FullLengthEnabled(size_t* len) const override {
*len = prefix_len_;
return true;
}
virtual bool SameResultWhenAppended(const Slice& prefix) const override { virtual bool SameResultWhenAppended(const Slice& prefix) const override {
return InDomain(prefix); return InDomain(prefix);
} }
@ -80,6 +85,11 @@ class CappedPrefixTransform : public SliceTransform {
return (dst.size() <= cap_len_); return (dst.size() <= cap_len_);
} }
virtual bool FullLengthEnabled(size_t* len) const override {
*len = cap_len_;
return true;
}
virtual bool SameResultWhenAppended(const Slice& prefix) const override { virtual bool SameResultWhenAppended(const Slice& prefix) const override {
return prefix.size() >= cap_len_; return prefix.size() >= cap_len_;
} }

Loading…
Cancel
Save