diff --git a/HISTORY.md b/HISTORY.md
index 6e2d3a354..58fcabb1b 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -31,6 +31,7 @@
 * Flush sets file name to "(nil)" for OnTableFileCreationCompleted() if the flush does not produce any L0. This can happen if the file is empty thus delete by RocksDB.
 ### Performance Improvements
 * Improve the speed of the MemTable Bloom filter, reducing the write overhead of enabling it by 1/3 to 1/2, with similar benefit to read performance.
+* Level iterator now invalidates itself more often in prefix seek when the level is filtered out by prefix bloom.
 
 ## 6.4.0 (7/30/2019)
 ### Default Option Change
diff --git a/db/db_test2.cc b/db/db_test2.cc
index 8d8b2171b..fac84e3fe 100644
--- a/db/db_test2.cc
+++ b/db/db_test2.cc
@@ -4029,6 +4029,40 @@ TEST_F(DBTest2, PrefixBloomReseek) {
   delete iter;
 }
 
+TEST_F(DBTest2, PrefixBloomFilteredOut) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.prefix_extractor.reset(NewCappedPrefixTransform(3));
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  bbto.whole_key_filtering = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  DestroyAndReopen(options);
+
+  // Construct two L1 files with keys:
+  // f1:[aaa1 ccc1]   f2:[ddd0]
+  ASSERT_OK(Put("aaa1", ""));
+  ASSERT_OK(Put("ccc1", ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("ddd0", ""));
+  ASSERT_OK(Flush());
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  Iterator* iter = db_->NewIterator(ReadOptions());
+
+  // The prefix of the seek key is filtered out by f1's prefix bloom filter.
+  // This is just one of several valid positions following the contract.
+  // Positioning to ccc1 or ddd0 is also valid. This is just to validate
+  // the behavior of the current implementation. If the underlying
+  // implementation changes, the test might fail here.
+  iter->Seek("bbb1");
+  ASSERT_FALSE(iter->Valid());
+
+  delete iter;
+}
+
 #ifndef ROCKSDB_LITE
 TEST_F(DBTest2, RowCacheSnapshot) {
   Options options = CurrentOptions();
diff --git a/db/version_set.cc b/db/version_set.cc
index d13d31559..e6553ddf8 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -934,7 +934,8 @@ class LevelIterator final : public InternalIterator {
   }
 
  private:
-  void SkipEmptyFileForward();
+  // Returns true if at least one invalid file is seen and skipped.
+  bool SkipEmptyFileForward();
   void SkipEmptyFileBackward();
   void SetFileIterator(InternalIterator* iter);
   void InitFileIterator(size_t new_file_index);
@@ -1042,7 +1043,32 @@ void LevelIterator::Seek(const Slice& target) {
   if (file_iter_.iter() != nullptr) {
     file_iter_.Seek(target);
   }
-  SkipEmptyFileForward();
+  if (SkipEmptyFileForward() && prefix_extractor_ != nullptr &&
+      file_iter_.iter() != nullptr && file_iter_.Valid()) {
+    // We've skipped the file we initially positioned to. In the prefix
+    // seek case, the file was likely skipped because its prefix bloom or
+    // hash filtered it out, in which case more keys may have been skipped.
+    // We then check the current key and invalidate the iterator if its
+    // prefix has already been passed.
+    // When doing a prefix iterator seek, once keys for one prefix have
+    // been exhausted, the iterator can jump to any key that is larger.
+    // Here we enforce a stricter contract than that, to make it easier for
+    // higher layers (merging and DB iterator) to reason about correctness:
+    // 1. Within the prefix, the result should be accurate.
+    // 2. If keys for the prefix are exhausted, the iterator is either
+    //    positioned at the next key after the prefix, or made invalid.
+    // A side benefit is that the iterator is invalidated earlier, so the
+    // upper-level merging iterator has fewer child iterators to merge.
+    Slice target_user_key = ExtractUserKey(target);
+    Slice file_user_key = ExtractUserKey(file_iter_.key());
+    if (prefix_extractor_->InDomain(target_user_key) &&
+        (!prefix_extractor_->InDomain(file_user_key) ||
+         user_comparator_.Compare(
+             prefix_extractor_->Transform(target_user_key),
+             prefix_extractor_->Transform(file_user_key)) != 0)) {
+      SetFileIterator(nullptr);
+    }
+  }
   CheckMayBeOutOfLowerBound();
 }
 
@@ -1096,25 +1122,28 @@ void LevelIterator::Prev() {
   SkipEmptyFileBackward();
 }
 
-void LevelIterator::SkipEmptyFileForward() {
+bool LevelIterator::SkipEmptyFileForward() {
+  bool seen_empty_file = false;
   while (file_iter_.iter() == nullptr ||
          (!file_iter_.Valid() && file_iter_.status().ok() &&
           !file_iter_.iter()->IsOutOfBound())) {
+    seen_empty_file = true;
     // Move to next file
     if (file_index_ >= flevel_->num_files - 1) {
       // Already at the last file
       SetFileIterator(nullptr);
-      return;
+      break;
     }
     if (KeyReachedUpperBound(file_smallest_key(file_index_ + 1))) {
       SetFileIterator(nullptr);
-      return;
+      break;
     }
     InitFileIterator(file_index_ + 1);
     if (file_iter_.iter() != nullptr) {
       file_iter_.SeekToFirst();
     }
   }
+  return seen_empty_file;
 }
 
 void LevelIterator::SkipEmptyFileBackward() {
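
Usage note (illustrative, not part of the patch): the sketch below shows, against the public RocksDB API, how a caller consumes a prefix-seek iterator under the contract spelled out in the Seek() comment above. Once keys for the seek prefix are exhausted, the iterator may either become invalid or land past the prefix, so the caller checks both Valid() and the key's prefix. The options mirror the new test; the database path and keys are arbitrary examples for this sketch.

// Minimal sketch of prefix-seek usage; the path "/tmp/prefix_demo" and the
// keys are illustrative only.
#include <cassert>
#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/options.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/table.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Same prefix setup as the new test: capped 3-byte prefixes and a
  // prefix-only bloom filter in block-based tables.
  options.prefix_extractor.reset(rocksdb::NewCappedPrefixTransform(3));
  rocksdb::BlockBasedTableOptions bbto;
  bbto.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, false));
  bbto.whole_key_filtering = false;
  options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbto));

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/prefix_demo", &db);
  assert(s.ok());
  s = db->Put(rocksdb::WriteOptions(), "aaa1", "");
  assert(s.ok());
  s = db->Put(rocksdb::WriteOptions(), "ccc1", "");
  assert(s.ok());
  s = db->Put(rocksdb::WriteOptions(), "ddd0", "");
  assert(s.ok());

  // Prefix-seek contract: once keys for the seek prefix are exhausted, the
  // iterator may be positioned at some later key or be invalid, so check
  // both Valid() and the key's prefix instead of assuming a resting position.
  std::unique_ptr<rocksdb::Iterator> iter(
      db->NewIterator(rocksdb::ReadOptions()));
  for (iter->Seek("bbb1"); iter->Valid() && iter->key().starts_with("bbb");
       iter->Next()) {
    // No keys with prefix "bbb" exist, so this loop body never runs.
  }
  assert(iter->status().ok());

  iter.reset();
  delete db;
  return 0;
}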