diff --git a/HISTORY.md b/HISTORY.md index 58225553c..3a0e09ff1 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -18,6 +18,7 @@ * Fix a race condition for cfd->log_number_ between manifest switch and memtable switch (PR 6249) when number of column families is greater than 1. * Fix a bug on fractional cascading index when multiple files at the same level contain the same smallest user key, and those user keys are for merge operands. In this case, Get() the exact key may miss some merge operands. * Delcare kHashSearch index type feature-incompatible with index_block_restart_interval larger than 1. +* Fix incorrect results while block-based table uses kHashSearch, together with Prev()/SeekForPrev(). ### New Features * It is now possible to enable periodic compactions for the base DB when using BlobDB. diff --git a/db/db_test2.cc b/db/db_test2.cc index 3efa879ca..7f724ac72 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -4300,6 +4300,64 @@ TEST_F(DBTest2, SameSmallestInSameLevel) { ASSERT_EQ("2,3,4,5,6,7,8", Get("key")); } + +TEST_F(DBTest2, BlockBasedTablePrefixIndexSeekForPrev) { + // create a DB with block prefix index + BlockBasedTableOptions table_options; + Options options = CurrentOptions(); + table_options.block_size = 300; + table_options.index_type = BlockBasedTableOptions::kHashSearch; + table_options.index_shortening = + BlockBasedTableOptions::IndexShorteningMode::kNoShortening; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + + Reopen(options); + + Random rnd(301); + std::string large_value = RandomString(&rnd, 500); + + ASSERT_OK(Put("a1", large_value)); + ASSERT_OK(Put("x1", large_value)); + ASSERT_OK(Put("y1", large_value)); + Flush(); + + { + std::unique_ptr iterator(db_->NewIterator(ReadOptions())); + iterator->SeekForPrev("x3"); + ASSERT_TRUE(iterator->Valid()); + ASSERT_EQ("x1", iterator->key().ToString()); + + iterator->SeekForPrev("a3"); + ASSERT_TRUE(iterator->Valid()); + ASSERT_EQ("a1", iterator->key().ToString()); + + iterator->SeekForPrev("y3"); + ASSERT_TRUE(iterator->Valid()); + ASSERT_EQ("y1", iterator->key().ToString()); + + // Query more than one non-existing prefix to cover the case both + // of empty hash bucket and hash bucket conflict. + iterator->SeekForPrev("b1"); + // Result should be not valid or "a1". + if (iterator->Valid()) { + ASSERT_EQ("a1", iterator->key().ToString()); + } + + iterator->SeekForPrev("c1"); + // Result should be not valid or "a1". + if (iterator->Valid()) { + ASSERT_EQ("a1", iterator->key().ToString()); + } + + iterator->SeekForPrev("d1"); + // Result should be not valid or "a1". + if (iterator->Valid()) { + ASSERT_EQ("a1", iterator->key().ToString()); + } + } +} + } // namespace rocksdb #ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS diff --git a/table/block_based/block.cc b/table/block_based/block.cc index 8fa3ff9b9..cfa042d6e 100644 --- a/table/block_based/block.cc +++ b/table/block_based/block.cc @@ -393,7 +393,15 @@ void IndexBlockIter::Seek(const Slice& target) { uint32_t index = 0; bool ok = false; if (prefix_index_) { - ok = PrefixSeek(target, &index); + bool prefix_may_exist = true; + ok = PrefixSeek(target, &index, &prefix_may_exist); + if (!prefix_may_exist) { + // This is to let the caller to distinguish between non-existing prefix, + // and when key is larger than the last key, which both set Valid() to + // false. + current_ = restarts_; + status_ = Status::NotFound(); + } } else if (value_delta_encoded_) { ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, comparator_); @@ -718,8 +726,12 @@ int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) { // with a key >= target bool IndexBlockIter::BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids, uint32_t left, - uint32_t right, uint32_t* index) { + uint32_t right, uint32_t* index, + bool* prefix_may_exist) { assert(left <= right); + assert(index); + assert(prefix_may_exist); + *prefix_may_exist = true; uint32_t left_bound = left; while (left <= right) { @@ -753,6 +765,7 @@ bool IndexBlockIter::BinaryBlockIndexSeek(const Slice& target, (left == left_bound || block_ids[left - 1] != block_ids[left] - 1) && CompareBlockKey(block_ids[left] - 1, target) > 0) { current_ = restarts_; + *prefix_may_exist = false; return false; } @@ -760,14 +773,45 @@ bool IndexBlockIter::BinaryBlockIndexSeek(const Slice& target, return true; } else { assert(left > right); + + // If the next block key is larger than seek key, it is possible that + // no key shares the prefix with `target`, or all keys with the same + // prefix as `target` are smaller than prefix. In the latter case, + // we are mandated to set the position the same as the total order. + // In the latter case, either: + // (1) `target` falls into the range of the next block. In this case, + // we can place the iterator to the next block, or + // (2) `target` is larger than all block keys. In this case we can + // keep the iterator invalidate without setting `prefix_may_exist` + // to false. + // We might sometimes end up with setting the total order position + // while there is no key sharing the prefix as `target`, but it + // still follows the contract. + uint32_t right_index = block_ids[right]; + assert(right_index + 1 <= num_restarts_); + if (right_index + 1 < num_restarts_) { + if (CompareBlockKey(right_index + 1, target) >= 0) { + *index = right_index + 1; + return true; + } else { + // We have to set the flag here because we are not positioning + // the iterator to the total order position. + *prefix_may_exist = false; + } + } + // Mark iterator invalid current_ = restarts_; return false; } } -bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index) { +bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index, + bool* prefix_may_exist) { + assert(index); + assert(prefix_may_exist); assert(prefix_index_); + *prefix_may_exist = true; Slice seek_key = target; if (!key_includes_seq_) { seek_key = ExtractUserKey(target); @@ -777,9 +821,12 @@ bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index) { if (num_blocks == 0) { current_ = restarts_; + *prefix_may_exist = false; return false; } else { - return BinaryBlockIndexSeek(seek_key, block_ids, 0, num_blocks - 1, index); + assert(block_ids); + return BinaryBlockIndexSeek(seek_key, block_ids, 0, num_blocks - 1, index, + prefix_may_exist); } } diff --git a/table/block_based/block.h b/table/block_based/block.h index 73c21b465..c8580e976 100644 --- a/table/block_based/block.h +++ b/table/block_based/block.h @@ -539,6 +539,13 @@ class IndexBlockIter final : public BlockIter { } } + // IndexBlockIter follows a different contract for prefix iterator + // from data iterators. + // If prefix of the seek key `target` exists in the file, it must + // return the same result as total order seek. + // If the prefix of `target` doesn't exist in the file, it can either + // return the result of total order seek, or set both of Valid() = false + // and status() = NotFound(). virtual void Seek(const Slice& target) override; virtual void SeekForPrev(const Slice&) override { @@ -595,9 +602,16 @@ class IndexBlockIter final : public BlockIter { std::unique_ptr global_seqno_state_; - bool PrefixSeek(const Slice& target, uint32_t* index); + // Set *prefix_may_exist to false if no key possibly share the same prefix + // as `target`. If not set, the result position should be the same as total + // order Seek. + bool PrefixSeek(const Slice& target, uint32_t* index, bool* prefix_may_exist); + // Set *prefix_may_exist to false if no key can possibly share the same + // prefix as `target`. If not set, the result position should be the same + // as total order seek. bool BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids, - uint32_t left, uint32_t right, uint32_t* index); + uint32_t left, uint32_t right, uint32_t* index, + bool* prefix_may_exist); inline int CompareBlockKey(uint32_t block_index, const Slice& target); inline int Compare(const Slice& a, const Slice& b) const { diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index dbed36b91..dca08d2f4 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -2901,12 +2901,22 @@ void BlockBasedTableIterator::SeekForPrev( index_iter_->Seek(target); if (!index_iter_->Valid()) { - if (!index_iter_->status().ok()) { + auto seek_status = index_iter_->status(); + // Check for IO error + if (!seek_status.IsNotFound() && !seek_status.ok()) { ResetDataIter(); return; } - index_iter_->SeekToLast(); + // With prefix index, Seek() returns NotFound if the prefix doesn't exist + if (seek_status.IsNotFound()) { + // Any key less than the target is fine for prefix seek + ResetDataIter(); + return; + } else { + index_iter_->SeekToLast(); + } + // Check for IO error if (!index_iter_->Valid()) { ResetDataIter(); return; diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 96f440ba6..f54c45fff 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -686,7 +686,8 @@ class BlockBasedTableIterator : public InternalIteratorBase { return block_iter_.value(); } Status status() const override { - if (!index_iter_->status().ok()) { + // Prefix index set status to NotFound when the prefix does not exist + if (!index_iter_->status().ok() && !index_iter_->status().IsNotFound()) { return index_iter_->status(); } else if (block_iter_points_to_real_block_) { return block_iter_.status(); diff --git a/table/table_test.cc b/table/table_test.cc index 5ab19e9f9..f620a3f5e 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -1857,7 +1857,8 @@ void TableTest::IndexTest(BlockBasedTableOptions table_options) { auto key = prefixes[i] + "9"; index_iter->Seek(InternalKey(key, 0, kTypeValue).Encode()); - ASSERT_OK(index_iter->status()); + ASSERT_TRUE(index_iter->status().ok() || index_iter->status().IsNotFound()); + ASSERT_TRUE(!index_iter->status().IsNotFound() || !index_iter->Valid()); if (i == prefixes.size() - 1) { // last key ASSERT_TRUE(!index_iter->Valid()); @@ -1884,6 +1885,19 @@ void TableTest::IndexTest(BlockBasedTableOptions table_options) { ASSERT_TRUE(BytewiseComparator()->Compare(prefix, ukey_prefix) < 0); } } + for (const auto& prefix : non_exist_prefixes) { + index_iter->SeekForPrev(InternalKey(prefix, 0, kTypeValue).Encode()); + // regular_iter->Seek(prefix); + + ASSERT_OK(index_iter->status()); + // Seek to non-existing prefixes should yield either invalid, or a + // key with prefix greater than the target. + if (index_iter->Valid()) { + Slice ukey = ExtractUserKey(index_iter->key()); + Slice ukey_prefix = options.prefix_extractor->Transform(ukey); + ASSERT_TRUE(BytewiseComparator()->Compare(prefix, ukey_prefix) > 0); + } + } c.ResetTableReader(); }