From dd29ad4223a3887c24a892119bc42492807aca86 Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Tue, 7 Jul 2020 17:25:08 -0700 Subject: [PATCH] Separate internal and user key comparators in `BlockIter` (#6944) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: Replace `BlockIter::comparator_` and `IndexBlockIter::user_comparator_wrapper_` with a concrete `UserComparatorWrapper` and `InternalKeyComparator`. The motivation for this change was the inconvenience of not knowing the concrete type of `BlockIter::comparator_`, which prevented calling specialized internal key comparison functions to optimize comparison of keys with global seqno applied. Pull Request resolved: https://github.com/facebook/rocksdb/pull/6944 Test Plan: benchmark setup -- single file DBs, in-memory, no compression. "normal_db" created by regular flush; "ingestion_db" created by ingesting a file. Both DBs have same contents. ``` $ TEST_TMPDIR=/dev/shm/normal_db/ ./db_bench -benchmarks=fillrandom,compact -write_buffer_size=10485760000 -disable_auto_compactions=true -compression_type=none -num=1000000 $ ./ldb write_extern_sst ./tmp.sst --db=/dev/shm/ingestion_db/dbbench/ --compression_type=no --hex --create_if_missing < <(./sst_dump --command=scan --output_hex --file=/dev/shm/normal_db/dbbench/000007.sst | awk 'began {print "0x" substr($1, 2, length($1) - 2), "==>", "0x" $5} ; /^Sst file format: block-based/ {began=1}') $ ./ldb ingest_extern_sst ./tmp.sst --db=/dev/shm/ingestion_db/dbbench/ ``` benchmark run command: ``` $ TEST_TMPDIR=/dev/shm/$DB/ ./db_bench -benchmarks=seekrandom -seek_nexts=$SEEK_NEXT -use_existing_db=true -cache_index_and_filter_blocks=false -num=1000000 -cache_size=0 -threads=1 -reads=200000000 -mmap_read=1 -verify_checksum=false ``` results: perf improved marginally for ingestion_db and did not change significantly for normal_db: SEEK_NEXT | DB | code | ops/sec | % change -- | -- | -- | -- | -- 0 | normal_db | master | 350880 |   0 | normal_db | PR6944 | 351040 | 0.0 0 | ingestion_db | master | 343255 |   0 | ingestion_db | PR6944 | 349424 | 1.8 10 | normal_db | master | 218711 |   10 | normal_db | PR6944 | 217892 | -0.4 10 | ingestion_db | master | 220334 |   10 | ingestion_db | PR6944 | 226437 | 2.8 Reviewed By: pdillinger Differential Revision: D21924676 Pulled By: ajkr fbshipit-source-id: ea4288a2eefa8112eb6c651a671c1de18c12e538 --- HISTORY.md | 3 + db/dbformat.cc | 21 +- db/dbformat.h | 68 +++- .../block_based/binary_search_index_reader.cc | 2 +- table/block_based/block.cc | 126 ++++---- table/block_based/block.h | 305 ++++++++++-------- table/block_based/block_based_table_reader.cc | 17 +- table/block_based/block_test.cc | 46 ++- .../block_based/data_block_hash_index_test.cc | 6 +- table/block_based/hash_index_reader.cc | 2 +- table/block_based/partitioned_filter_block.cc | 10 +- table/block_based/partitioned_index_reader.cc | 6 +- table/meta_blocks.cc | 11 +- table/table_test.cc | 97 +++--- util/user_comparator_wrapper.h | 4 + 15 files changed, 403 insertions(+), 321 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 9f128299a..0b011dc7e 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -26,6 +26,9 @@ ### Bug Fixes * Fail recovery and report once hitting a physical log record checksum mismatch, while reading MANIFEST. RocksDB should not continue processing the MANIFEST any further. +### Performance Improvements +* Eliminate key copies for internal comparisons while accessing ingested block-based tables. + ## 6.11 (6/12/2020) ### Bug Fixes * Fix consistency checking error swallowing in some cases when options.force_consistency_checks = true. diff --git a/db/dbformat.cc b/db/dbformat.cc index c72388c31..85411ed3e 100644 --- a/db/dbformat.cc +++ b/db/dbformat.cc @@ -26,12 +26,6 @@ namespace ROCKSDB_NAMESPACE { const ValueType kValueTypeForSeek = kTypeDeletionWithTimestamp; const ValueType kValueTypeForSeekForPrev = kTypeDeletion; -uint64_t PackSequenceAndType(uint64_t seq, ValueType t) { - assert(seq <= kMaxSequenceNumber); - assert(IsExtendedValueType(t)); - return (seq << 8) | t; -} - EntryType GetEntryType(ValueType value_type) { switch (value_type) { case kTypeValue: @@ -62,14 +56,6 @@ bool ParseFullKey(const Slice& internal_key, FullKey* fkey) { return true; } -void UnPackSequenceAndType(uint64_t packed, uint64_t* seq, ValueType* t) { - *seq = packed >> 8; - *t = static_cast(packed & 0xff); - - assert(*seq <= kMaxSequenceNumber); - assert(IsExtendedValueType(*t)); -} - void AppendInternalKey(std::string* result, const ParsedInternalKey& key) { result->append(key.user_key.data(), key.user_key.size()); PutFixed64(result, PackSequenceAndType(key.sequence, key.type)); @@ -111,7 +97,12 @@ std::string InternalKey::DebugString(bool hex) const { return result; } -const char* InternalKeyComparator::Name() const { return name_.c_str(); } +const char* InternalKeyComparator::Name() const { + if (name_.empty()) { + return "rocksdb.anonymous.InternalKeyComparator"; + } + return name_.c_str(); +} int InternalKeyComparator::Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const { diff --git a/db/dbformat.h b/db/dbformat.h index dac6f3d46..6911d4297 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -124,11 +124,22 @@ inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) { } // Pack a sequence number and a ValueType into a uint64_t -extern uint64_t PackSequenceAndType(uint64_t seq, ValueType t); +inline uint64_t PackSequenceAndType(uint64_t seq, ValueType t) { + assert(seq <= kMaxSequenceNumber); + assert(IsExtendedValueType(t)); + return (seq << 8) | t; +} // Given the result of PackSequenceAndType, store the sequence number in *seq // and the ValueType in *t. -extern void UnPackSequenceAndType(uint64_t packed, uint64_t* seq, ValueType* t); +inline void UnPackSequenceAndType(uint64_t packed, uint64_t* seq, + ValueType* t) { + *seq = packed >> 8; + *t = static_cast(packed & 0xff); + + assert(*seq <= kMaxSequenceNumber); + assert(IsExtendedValueType(*t)); +} EntryType GetEntryType(ValueType value_type); @@ -200,11 +211,22 @@ class InternalKeyComparator std::string name_; public: - explicit InternalKeyComparator(const Comparator* c) - : Comparator(c->timestamp_size()), - user_comparator_(c), - name_("rocksdb.InternalKeyComparator:" + - std::string(user_comparator_.Name())) {} + // `InternalKeyComparator`s constructed with the default constructor are not + // usable and will segfault on any attempt to use them for comparisons. + InternalKeyComparator() = default; + + // @param named If true, assign a name to this comparator based on the + // underlying comparator's name. This involves an allocation and copy in + // this constructor to precompute the result of `Name()`. To avoid this + // overhead, set `named` to false. In that case, `Name()` will return a + // generic name that is non-specific to the underlying comparator. + explicit InternalKeyComparator(const Comparator* c, bool named = true) + : Comparator(c->timestamp_size()), user_comparator_(c) { + if (named) { + name_ = "rocksdb.InternalKeyComparator:" + + std::string(user_comparator_.Name()); + } + } virtual ~InternalKeyComparator() {} virtual const char* Name() const override; @@ -221,6 +243,12 @@ class InternalKeyComparator int Compare(const InternalKey& a, const InternalKey& b) const; int Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const; + // In this `Compare()` overload, the sequence numbers provided in + // `a_global_seqno` and `b_global_seqno` override the sequence numbers in `a` + // and `b`, respectively. To disable sequence number override(s), provide the + // value `kDisableGlobalSequenceNumber`. + int Compare(const Slice& a, SequenceNumber a_global_seqno, const Slice& b, + SequenceNumber b_global_seqno) const; virtual const Comparator* GetRootComparator() const override { return user_comparator_.GetRootComparator(); } @@ -677,6 +705,32 @@ inline int InternalKeyComparator::CompareKeySeq(const Slice& akey, return r; } +inline int InternalKeyComparator::Compare(const Slice& a, + SequenceNumber a_global_seqno, + const Slice& b, + SequenceNumber b_global_seqno) const { + int r = user_comparator_.Compare(ExtractUserKey(a), ExtractUserKey(b)); + if (r == 0) { + uint64_t a_footer, b_footer; + if (a_global_seqno == kDisableGlobalSequenceNumber) { + a_footer = ExtractInternalKeyFooter(a); + } else { + a_footer = PackSequenceAndType(a_global_seqno, ExtractValueType(a)); + } + if (b_global_seqno == kDisableGlobalSequenceNumber) { + b_footer = ExtractInternalKeyFooter(b); + } else { + b_footer = PackSequenceAndType(b_global_seqno, ExtractValueType(b)); + } + if (a_footer > b_footer) { + r = -1; + } else if (a_footer < b_footer) { + r = +1; + } + } + return r; +} + // Wrap InternalKeyComparator as a comparator class for ParsedInternalKey. struct ParsedInternalKeyComparator { explicit ParsedInternalKeyComparator(const InternalKeyComparator* c) diff --git a/table/block_based/binary_search_index_reader.cc b/table/block_based/binary_search_index_reader.cc index 8a2b72963..12469eadb 100644 --- a/table/block_based/binary_search_index_reader.cc +++ b/table/block_based/binary_search_index_reader.cc @@ -61,7 +61,7 @@ InternalIteratorBase* BinarySearchIndexReader::NewIterator( // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. auto it = index_block.GetValue()->NewIndexIterator( - internal_comparator(), internal_comparator()->user_comparator(), + internal_comparator()->user_comparator(), rep->get_global_seqno(BlockType::kIndex), iter, kNullStats, true, index_has_first_key(), index_key_includes_seq(), index_value_is_full()); diff --git a/table/block_based/block.cc b/table/block_based/block.cc index 8bf7e11d4..e619aa2c9 100644 --- a/table/block_based/block.cc +++ b/table/block_based/block.cc @@ -127,19 +127,15 @@ struct DecodeKeyV4 { } }; -void DataBlockIter::Next() { - ParseNextDataKey(); -} +void DataBlockIter::NextImpl() { ParseNextDataKey(); } -void DataBlockIter::NextOrReport() { +void DataBlockIter::NextOrReportImpl() { ParseNextDataKey(); } -void IndexBlockIter::Next() { - ParseNextIndexKey(); -} +void IndexBlockIter::NextImpl() { ParseNextIndexKey(); } -void IndexBlockIter::Prev() { +void IndexBlockIter::PrevImpl() { assert(Valid()); // Scan backwards to a restart point before current_ const uint32_t original = current_; @@ -158,8 +154,8 @@ void IndexBlockIter::Prev() { } } -// Similar to IndexBlockIter::Prev but also caches the prev entries -void DataBlockIter::Prev() { +// Similar to IndexBlockIter::PrevImpl but also caches the prev entries +void DataBlockIter::PrevImpl() { assert(Valid()); assert(prev_entries_idx_ == -1 || @@ -173,25 +169,26 @@ void DataBlockIter::Prev() { prev_entries_[prev_entries_idx_]; const char* key_ptr = nullptr; + bool raw_key_cached; if (current_prev_entry.key_ptr != nullptr) { // The key is not delta encoded and stored in the data block key_ptr = current_prev_entry.key_ptr; - key_pinned_ = true; + raw_key_cached = false; } else { // The key is delta encoded and stored in prev_entries_keys_buff_ key_ptr = prev_entries_keys_buff_.data() + current_prev_entry.key_offset; - key_pinned_ = false; + raw_key_cached = true; } const Slice current_key(key_ptr, current_prev_entry.key_size); current_ = current_prev_entry.offset; - raw_key_.SetKey(current_key, false /* copy */); + // TODO(ajkr): the copy when `raw_key_cached` is done here for convenience, + // not necessity. It is convenient since this class treats keys as pinned + // when `raw_key_` points to an outside buffer. So we cannot allow + // `raw_key_` point into Prev cache as it is a transient outside buffer + // (i.e., keys in it are not actually pinned). + raw_key_.SetKey(current_key, raw_key_cached /* copy */); value_ = current_prev_entry.value; - key_ = applied_key_.UpdateAndGetKey(); - // This is kind of odd in that applied_key_ may say the key is pinned while - // key_pinned_ ends up being false. That'll only happen when the key resides - // in a transient caching buffer. - key_pinned_ = key_pinned_ && applied_key_.IsKeyPinned(); return; } @@ -238,7 +235,7 @@ void DataBlockIter::Prev() { prev_entries_idx_ = static_cast(prev_entries_.size()) - 1; } -void DataBlockIter::Seek(const Slice& target) { +void DataBlockIter::SeekImpl(const Slice& target) { Slice seek_key = target; PERF_TIMER_GUARD(block_seek_nanos); if (data_ == nullptr) { // Not init yet @@ -247,12 +244,12 @@ void DataBlockIter::Seek(const Slice& target) { uint32_t index = 0; bool skip_linear_scan = false; bool ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, - &skip_linear_scan, comparator_); + &skip_linear_scan); if (!ok) { return; } - FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan, comparator_); + FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); } // Optimized Seek for point lookup for an internal key `target` @@ -286,7 +283,7 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) { if (entry == kCollision) { // HashSeek not effective, falling back - Seek(target); + SeekImpl(target); return true; } @@ -330,7 +327,7 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) { // TODO(fwu): check the left and write boundary of the restart interval // to avoid linear seek a target key that is out of range. if (!ParseNextDataKey(limit) || - comparator_->Compare(applied_key_.UpdateAndGetKey(), target) >= 0) { + CompareCurrentKey(target) >= 0) { // we stop at the first potential matching user key. break; } @@ -355,18 +352,18 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) { return true; } - if (user_comparator_->Compare(raw_key_.GetUserKey(), target_user_key) != 0) { + if (ucmp_wrapper_.Compare(raw_key_.GetUserKey(), target_user_key) != 0) { // the key is not in this block and cannot be at the next block either. return false; } // Here we are conservative and only support a limited set of cases - ValueType value_type = ExtractValueType(applied_key_.UpdateAndGetKey()); + ValueType value_type = ExtractValueType(raw_key_.GetInternalKey()); if (value_type != ValueType::kTypeValue && value_type != ValueType::kTypeDeletion && value_type != ValueType::kTypeSingleDeletion && value_type != ValueType::kTypeBlobIndex) { - Seek(target); + SeekImpl(target); return true; } @@ -374,14 +371,14 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) { return true; } -void IndexBlockIter::Seek(const Slice& target) { +void IndexBlockIter::SeekImpl(const Slice& target) { TEST_SYNC_POINT("IndexBlockIter::Seek:0"); PERF_TIMER_GUARD(block_seek_nanos); if (data_ == nullptr) { // Not init yet return; } Slice seek_key = target; - if (!key_includes_seq_) { + if (raw_key_.IsUserKey()) { seek_key = ExtractUserKey(target); } status_ = Status::OK(); @@ -403,19 +400,19 @@ void IndexBlockIter::Seek(const Slice& target) { skip_linear_scan = true; } else if (value_delta_encoded_) { ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, - &skip_linear_scan, comparator_); + &skip_linear_scan); } else { ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, - &skip_linear_scan, comparator_); + &skip_linear_scan); } if (!ok) { return; } - FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan, comparator_); + FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); } -void DataBlockIter::SeekForPrev(const Slice& target) { +void DataBlockIter::SeekForPrevImpl(const Slice& target) { PERF_TIMER_GUARD(block_seek_nanos); Slice seek_key = target; if (data_ == nullptr) { // Not init yet @@ -424,24 +421,23 @@ void DataBlockIter::SeekForPrev(const Slice& target) { uint32_t index = 0; bool skip_linear_scan = false; bool ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, - &skip_linear_scan, comparator_); + &skip_linear_scan); if (!ok) { return; } - FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan, comparator_); + FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); if (!Valid()) { - SeekToLast(); + SeekToLastImpl(); } else { - while (Valid() && - comparator_->Compare(applied_key_.UpdateAndGetKey(), seek_key) > 0) { - Prev(); + while (Valid() && CompareCurrentKey(seek_key) > 0) { + PrevImpl(); } } } -void DataBlockIter::SeekToFirst() { +void DataBlockIter::SeekToFirstImpl() { if (data_ == nullptr) { // Not init yet return; } @@ -449,7 +445,7 @@ void DataBlockIter::SeekToFirst() { ParseNextDataKey(); } -void DataBlockIter::SeekToFirstOrReport() { +void DataBlockIter::SeekToFirstOrReportImpl() { if (data_ == nullptr) { // Not init yet return; } @@ -457,7 +453,7 @@ void DataBlockIter::SeekToFirstOrReport() { ParseNextDataKey(); } -void IndexBlockIter::SeekToFirst() { +void IndexBlockIter::SeekToFirstImpl() { if (data_ == nullptr) { // Not init yet return; } @@ -466,7 +462,7 @@ void IndexBlockIter::SeekToFirst() { ParseNextIndexKey(); } -void DataBlockIter::SeekToLast() { +void DataBlockIter::SeekToLastImpl() { if (data_ == nullptr) { // Not init yet return; } @@ -476,7 +472,7 @@ void DataBlockIter::SeekToLast() { } } -void IndexBlockIter::SeekToLast() { +void IndexBlockIter::SeekToLastImpl() { if (data_ == nullptr) { // Not init yet return; } @@ -526,8 +522,6 @@ bool DataBlockIter::ParseNextDataKey(const char* limit) { // This key share `shared` bytes with prev key, we need to decode it raw_key_.TrimAppend(shared, p, non_shared); } - key_ = applied_key_.UpdateAndGetKey(); - key_pinned_ = applied_key_.IsKeyPinned(); #ifndef NDEBUG if (global_seqno_ != kDisableGlobalSequenceNumber) { @@ -590,8 +584,6 @@ bool IndexBlockIter::ParseNextIndexKey() { // This key share `shared` bytes with prev key, we need to decode it raw_key_.TrimAppend(shared, p, non_shared); } - key_ = applied_key_.UpdateAndGetKey(); - key_pinned_ = applied_key_.IsKeyPinned(); value_ = Slice(p + non_shared, value_length); if (shared == 0) { while (restart_index_ + 1 < num_restarts_ && @@ -651,12 +643,12 @@ void IndexBlockIter::DecodeCurrentValue(uint32_t shared) { template void BlockIter::FindKeyAfterBinarySeek(const Slice& target, uint32_t index, - bool skip_linear_scan, - const Comparator* comp) { + bool skip_linear_scan) { // SeekToRestartPoint() only does the lookup in the restart block. We need - // to follow it up with Next() to position the iterator at the restart key. + // to follow it up with NextImpl() to position the iterator at the restart + // key. SeekToRestartPoint(index); - Next(); + NextImpl(); if (!skip_linear_scan) { // Linear search (within restart block) for first key >= target @@ -672,14 +664,14 @@ void BlockIter::FindKeyAfterBinarySeek(const Slice& target, max_offset = port::kMaxUint32; } while (true) { - Next(); + NextImpl(); if (!Valid()) { break; } if (current_ == max_offset) { - assert(comp->Compare(applied_key_.UpdateAndGetKey(), target) > 0); + assert(CompareCurrentKey(target) > 0); break; - } else if (comp->Compare(applied_key_.UpdateAndGetKey(), target) >= 0) { + } else if (CompareCurrentKey(target) >= 0) { break; } } @@ -698,8 +690,7 @@ template template bool BlockIter::BinarySeek(const Slice& target, uint32_t left, uint32_t right, uint32_t* index, - bool* skip_linear_scan, - const Comparator* comp) { + bool* skip_linear_scan) { assert(left <= right); if (restarts_ == 0) { // SST files dedicated to range tombstones are written with index blocks @@ -724,7 +715,7 @@ bool BlockIter::BinarySeek(const Slice& target, uint32_t left, } Slice mid_key(key_ptr, non_shared); raw_key_.SetKey(mid_key, false /* copy */); - int cmp = comp->Compare(applied_key_.UpdateAndGetKey(), target); + int cmp = CompareCurrentKey(target); if (cmp < 0) { // Key at "mid" is smaller than "target". Therefore all // blocks before "mid" are uninteresting. @@ -757,7 +748,7 @@ bool BlockIter::BinarySeek(const Slice& target, uint32_t left, } Slice first_key(key_ptr, non_shared); raw_key_.SetKey(first_key, false /* copy */); - int cmp = comp->Compare(applied_key_.UpdateAndGetKey(), target); + int cmp = CompareCurrentKey(target); *skip_linear_scan = cmp >= 0; } return true; @@ -780,7 +771,7 @@ int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) { } Slice block_key(key_ptr, non_shared); raw_key_.SetKey(block_key, false /* copy */); - return comparator_->Compare(applied_key_.UpdateAndGetKey(), target); + return CompareCurrentKey(target); } // Binary search in block_ids to find the first block @@ -874,7 +865,7 @@ bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index, assert(prefix_index_); *prefix_may_exist = true; Slice seek_key = target; - if (!key_includes_seq_) { + if (raw_key_.IsUserKey()) { seek_key = ExtractUserKey(target); } uint32_t* block_ids = nullptr; @@ -989,8 +980,7 @@ Block::Block(BlockContents&& contents, size_t read_amp_bytes_per_bit, } } -DataBlockIter* Block::NewDataIterator(const Comparator* cmp, - const Comparator* ucmp, +DataBlockIter* Block::NewDataIterator(const Comparator* ucmp, SequenceNumber global_seqno, DataBlockIter* iter, Statistics* stats, bool block_contents_pinned) { @@ -1010,7 +1000,7 @@ DataBlockIter* Block::NewDataIterator(const Comparator* cmp, return ret_iter; } else { ret_iter->Initialize( - cmp, ucmp, data_, restart_offset_, num_restarts_, global_seqno, + ucmp, data_, restart_offset_, num_restarts_, global_seqno, read_amp_bitmap_.get(), block_contents_pinned, data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr); if (read_amp_bitmap_) { @@ -1025,10 +1015,10 @@ DataBlockIter* Block::NewDataIterator(const Comparator* cmp, } IndexBlockIter* Block::NewIndexIterator( - const Comparator* cmp, const Comparator* ucmp, SequenceNumber global_seqno, - IndexBlockIter* iter, Statistics* /*stats*/, bool total_order_seek, - bool have_first_key, bool key_includes_seq, bool value_is_full, - bool block_contents_pinned, BlockPrefixIndex* prefix_index) { + const Comparator* ucmp, SequenceNumber global_seqno, IndexBlockIter* iter, + Statistics* /*stats*/, bool total_order_seek, bool have_first_key, + bool key_includes_seq, bool value_is_full, bool block_contents_pinned, + BlockPrefixIndex* prefix_index) { IndexBlockIter* ret_iter; if (iter != nullptr) { ret_iter = iter; @@ -1046,7 +1036,7 @@ IndexBlockIter* Block::NewIndexIterator( } else { BlockPrefixIndex* prefix_index_ptr = total_order_seek ? nullptr : prefix_index; - ret_iter->Initialize(cmp, ucmp, data_, restart_offset_, num_restarts_, + ret_iter->Initialize(ucmp, data_, restart_offset_, num_restarts_, global_seqno, prefix_index_ptr, have_first_key, key_includes_seq, value_is_full, block_contents_pinned); diff --git a/table/block_based/block.h b/table/block_based/block.h index a7378949a..0da4393a8 100644 --- a/table/block_based/block.h +++ b/table/block_based/block.h @@ -168,8 +168,8 @@ class Block { BlockBasedTableOptions::DataBlockIndexType IndexType() const; - // If comparator is InternalKeyComparator, user_comparator is its user - // comparator; they are equal otherwise. + // ucmp is a raw (i.e., not wrapped by `UserComparatorWrapper`) user key + // comparator. // // If iter is null, return new Iterator // If iter is not null, update this one and return it as Iterator* @@ -187,13 +187,15 @@ class Block { // NOTE: for the hash based lookup, if a key prefix doesn't match any key, // the iterator will simply be set as "invalid", rather than returning // the key that is just pass the target key. - DataBlockIter* NewDataIterator(const Comparator* comparator, - const Comparator* user_comparator, + DataBlockIter* NewDataIterator(const Comparator* ucmp, SequenceNumber global_seqno, DataBlockIter* iter = nullptr, Statistics* stats = nullptr, bool block_contents_pinned = false); + // ucmp is a raw (i.e., not wrapped by `UserComparatorWrapper`) user key + // comparator. + // // key_includes_seq, default true, means that the keys are in internal key // format. // value_is_full, default true, means that no delta encoding is @@ -206,8 +208,7 @@ class Block { // first_internal_key. It affects data serialization format, so the same value // have_first_key must be used when writing and reading index. // It is determined by IndexType property of the table. - IndexBlockIter* NewIndexIterator(const Comparator* comparator, - const Comparator* user_comparator, + IndexBlockIter* NewIndexIterator(const Comparator* ucmp, SequenceNumber global_seqno, IndexBlockIter* iter, Statistics* stats, bool total_order_seek, bool have_first_key, @@ -228,61 +229,36 @@ class Block { DataBlockHashIndex data_block_hash_index_; }; -// A GlobalSeqnoAppliedKey exposes a key with global sequence number applied -// if configured with `global_seqno != kDisableGlobalSequenceNumber`. It may -// hold a user key or an internal key since `format_version>=3` index blocks -// contain user keys. In case it holds user keys, it must be configured with -// `global_seqno == kDisableGlobalSequenceNumber`. -class GlobalSeqnoAppliedKey { - public: - void Initialize(IterKey* key, SequenceNumber global_seqno) { - key_ = key; - global_seqno_ = global_seqno; -#ifndef NDEBUG - init_ = true; -#endif // NDEBUG - } - - Slice UpdateAndGetKey() { - assert(init_); - if (global_seqno_ == kDisableGlobalSequenceNumber) { - return key_->GetKey(); - } - ParsedInternalKey parsed(Slice(), 0, kTypeValue); - if (!ParseInternalKey(key_->GetInternalKey(), &parsed)) { - assert(false); // error not handled in optimized builds - return Slice(); - } - parsed.sequence = global_seqno_; - scratch_.SetInternalKey(parsed); - return scratch_.GetInternalKey(); - } - - bool IsKeyPinned() const { - return global_seqno_ == kDisableGlobalSequenceNumber && key_->IsKeyPinned(); - } - - private: - const IterKey* key_; - SequenceNumber global_seqno_; - IterKey scratch_; -#ifndef NDEBUG - bool init_ = false; -#endif // NDEBUG -}; - +// A `BlockIter` iterates over the entries in a `Block`'s data buffer. The +// format of this data buffer is an uncompressed, sorted sequence of key-value +// pairs (see `Block` API for more details). +// +// Notably, the keys may either be in internal key format or user key format. +// Subclasses are responsible for configuring the key format. +// +// `BlockIter` intends to provide final overrides for all of +// `InternalIteratorBase` functions that can move the iterator. It does +// this to guarantee `UpdateKey()` is called exactly once after each key +// movement potentially visible to users. In this step, the key is prepared +// (e.g., serialized if global seqno is in effect) so it can be returned +// immediately when the user asks for it via calling `key() const`. +// +// For its subclasses, it provides protected variants of the above-mentioned +// final-overridden methods. They are named with the "Impl" suffix, e.g., +// `Seek()` logic would be implemented by subclasses in `SeekImpl()`. These +// "Impl" functions are responsible for positioning `raw_key_` but not +// invoking `UpdateKey()`. template class BlockIter : public InternalIteratorBase { public: - void InitializeBase(const Comparator* comparator, const char* data, + void InitializeBase(const Comparator* ucmp, const char* data, uint32_t restarts, uint32_t num_restarts, SequenceNumber global_seqno, bool block_contents_pinned) { assert(data_ == nullptr); // Ensure it is called only once assert(num_restarts > 0); // Ensure the param is valid - applied_key_.Initialize(&raw_key_, global_seqno); - - comparator_ = comparator; + ucmp_wrapper_ = UserComparatorWrapper(ucmp); + icmp_ = InternalKeyComparator(ucmp, false /* named */); data_ = data; restarts_ = restarts; num_restarts_ = num_restarts; @@ -309,6 +285,43 @@ class BlockIter : public InternalIteratorBase { } bool Valid() const override { return current_ < restarts_; } + + virtual void SeekToFirst() override final { + SeekToFirstImpl(); + UpdateKey(); + } + + virtual void SeekToLast() override final { + SeekToLastImpl(); + UpdateKey(); + } + + virtual void Seek(const Slice& target) override final { + SeekImpl(target); + UpdateKey(); + } + + virtual void SeekForPrev(const Slice& target) override final { + SeekForPrevImpl(target); + UpdateKey(); + } + + virtual void Next() override final { + NextImpl(); + UpdateKey(); + } + + virtual bool NextAndGetResult(IterateResult* result) override final { + // This does not need to call `UpdateKey()` as the parent class only has + // access to the `UpdateKey()`-invoking functions. + return InternalIteratorBase::NextAndGetResult(result); + } + + virtual void Prev() override final { + PrevImpl(); + UpdateKey(); + } + Status status() const override { return status_; } Slice key() const override { assert(Valid()); @@ -343,12 +356,10 @@ class BlockIter : public InternalIteratorBase { Cache::Handle* cache_handle() { return cache_handle_; } - virtual void Next() override = 0; - protected: - // Note: The type could be changed to InternalKeyComparator but we see a weird - // performance drop by that. - const Comparator* comparator_; + UserComparatorWrapper ucmp_wrapper_; + InternalKeyComparator icmp_; + const char* data_; // underlying block contents uint32_t num_restarts_; // Number of uint32_t entries in restart array @@ -359,13 +370,12 @@ class BlockIter : public InternalIteratorBase { uint32_t current_; // Raw key from block. IterKey raw_key_; - // raw_key_ with global seqno applied if necessary. Use this one for - // comparisons. - GlobalSeqnoAppliedKey applied_key_; - // Key to be exposed to users. - Slice key_; + // Buffer for key data when global seqno assignment is enabled. + IterKey key_buf_; Slice value_; Status status_; + // Key to be exposed to users. + Slice key_; bool key_pinned_; // Whether the block data is guaranteed to outlive this iterator, and // as long as the cleanup functions are transferred to another class, @@ -373,6 +383,50 @@ class BlockIter : public InternalIteratorBase { bool block_contents_pinned_; SequenceNumber global_seqno_; + virtual void SeekToFirstImpl() = 0; + virtual void SeekToLastImpl() = 0; + virtual void SeekImpl(const Slice& target) = 0; + virtual void SeekForPrevImpl(const Slice& target) = 0; + virtual void NextImpl() = 0; + virtual void PrevImpl() = 0; + + // Must be called every time a key is found that needs to be returned to user, + // and may be called when no key is found (as a no-op). Updates `key_`, + // `key_buf_`, and `key_pinned_` with info about the found key. + void UpdateKey() { + key_buf_.Clear(); + if (!Valid()) { + return; + } + if (raw_key_.IsUserKey()) { + assert(global_seqno_ == kDisableGlobalSequenceNumber); + key_ = raw_key_.GetUserKey(); + key_pinned_ = raw_key_.IsKeyPinned(); + } else if (global_seqno_ == kDisableGlobalSequenceNumber) { + key_ = raw_key_.GetInternalKey(); + key_pinned_ = raw_key_.IsKeyPinned(); + } else { + key_buf_.SetInternalKey(raw_key_.GetUserKey(), global_seqno_, + ExtractValueType(raw_key_.GetInternalKey())); + key_ = key_buf_.GetInternalKey(); + key_pinned_ = false; + } + } + + // Returns the result of `Comparator::Compare()`, where the appropriate + // comparator is used for the block contents, the LHS argument is the current + // key with global seqno applied, and the RHS argument is `other`. + int CompareCurrentKey(const Slice& other) { + if (raw_key_.IsUserKey()) { + assert(global_seqno_ == kDisableGlobalSequenceNumber); + return ucmp_wrapper_.Compare(raw_key_.GetUserKey(), other); + } else if (global_seqno_ == kDisableGlobalSequenceNumber) { + return icmp_.Compare(raw_key_.GetInternalKey(), other); + } + return icmp_.Compare(raw_key_.GetInternalKey(), global_seqno_, other, + kDisableGlobalSequenceNumber); + } + private: // Store the cache handle, if the block is cached. We need this since the // only other place the handle is stored is as an argument to the Cleanable @@ -408,37 +462,31 @@ class BlockIter : public InternalIteratorBase { protected: template inline bool BinarySeek(const Slice& target, uint32_t left, uint32_t right, - uint32_t* index, bool* is_index_key_result, - const Comparator* comp); + uint32_t* index, bool* is_index_key_result); void FindKeyAfterBinarySeek(const Slice& target, uint32_t index, - bool is_index_key_result, const Comparator* comp); + bool is_index_key_result); }; class DataBlockIter final : public BlockIter { public: DataBlockIter() : BlockIter(), read_amp_bitmap_(nullptr), last_bitmap_offset_(0) {} - DataBlockIter(const Comparator* comparator, const Comparator* user_comparator, - const char* data, uint32_t restarts, uint32_t num_restarts, - SequenceNumber global_seqno, + DataBlockIter(const Comparator* ucmp, const char* data, uint32_t restarts, + uint32_t num_restarts, SequenceNumber global_seqno, BlockReadAmpBitmap* read_amp_bitmap, bool block_contents_pinned, DataBlockHashIndex* data_block_hash_index) : DataBlockIter() { - Initialize(comparator, user_comparator, data, restarts, num_restarts, - global_seqno, read_amp_bitmap, block_contents_pinned, - data_block_hash_index); - } - void Initialize(const Comparator* comparator, - const Comparator* user_comparator, const char* data, - uint32_t restarts, uint32_t num_restarts, - SequenceNumber global_seqno, + Initialize(ucmp, data, restarts, num_restarts, global_seqno, + read_amp_bitmap, block_contents_pinned, data_block_hash_index); + } + void Initialize(const Comparator* ucmp, const char* data, uint32_t restarts, + uint32_t num_restarts, SequenceNumber global_seqno, BlockReadAmpBitmap* read_amp_bitmap, bool block_contents_pinned, DataBlockHashIndex* data_block_hash_index) { - InitializeBase(comparator, data, restarts, num_restarts, global_seqno, + InitializeBase(ucmp, data, restarts, num_restarts, global_seqno, block_contents_pinned); - user_comparator_ = user_comparator; raw_key_.SetIsUserKey(false); read_amp_bitmap_ = read_amp_bitmap; last_bitmap_offset_ = current_ + 1; @@ -456,36 +504,32 @@ class DataBlockIter final : public BlockIter { return value_; } - void Seek(const Slice& target) override; - inline bool SeekForGet(const Slice& target) { if (!data_block_hash_index_) { - Seek(target); + SeekImpl(target); + UpdateKey(); return true; } - - return SeekForGetImpl(target); + bool res = SeekForGetImpl(target); + UpdateKey(); + return res; } - void SeekForPrev(const Slice& target) override; - - void Prev() override; - - void Next() final override; - // Try to advance to the next entry in the block. If there is data corruption // or error, report it to the caller instead of aborting the process. May // incur higher CPU overhead because we need to perform check on every entry. - void NextOrReport(); - - void SeekToFirst() override; + void NextOrReport() { + NextOrReportImpl(); + UpdateKey(); + } // Try to seek to the first entry in the block. If there is data corruption // or error, report it to caller instead of aborting the process. May incur // higher CPU overhead because we need to perform check on every entry. - void SeekToFirstOrReport(); - - void SeekToLast() override; + void SeekToFirstOrReport() { + SeekToFirstOrReportImpl(); + UpdateKey(); + } void Invalidate(Status s) { InvalidateBase(s); @@ -495,6 +539,14 @@ class DataBlockIter final : public BlockIter { prev_entries_idx_ = -1; } + protected: + virtual void SeekToFirstImpl() override; + virtual void SeekToLastImpl() override; + virtual void SeekImpl(const Slice& target) override; + virtual void SeekForPrevImpl(const Slice& target) override; + virtual void NextImpl() override; + virtual void PrevImpl() override; + private: // read-amp bitmap BlockReadAmpBitmap* read_amp_bitmap_; @@ -525,12 +577,13 @@ class DataBlockIter final : public BlockIter { int32_t prev_entries_idx_ = -1; DataBlockHashIndex* data_block_hash_index_; - const Comparator* user_comparator_; template inline bool ParseNextDataKey(const char* limit = nullptr); bool SeekForGetImpl(const Slice& target); + void NextOrReportImpl(); + void SeekToFirstOrReportImpl(); }; class IndexBlockIter final : public BlockIter { @@ -541,22 +594,14 @@ class IndexBlockIter final : public BlockIter { // format. // value_is_full, default true, means that no delta encoding is // applied to values. - void Initialize(const Comparator* comparator, - const Comparator* user_comparator, const char* data, - uint32_t restarts, uint32_t num_restarts, - SequenceNumber global_seqno, BlockPrefixIndex* prefix_index, - bool have_first_key, bool key_includes_seq, - bool value_is_full, bool block_contents_pinned) { - if (!key_includes_seq) { - user_comparator_wrapper_ = std::unique_ptr( - new UserComparatorWrapper(user_comparator)); - } - InitializeBase( - key_includes_seq ? comparator : user_comparator_wrapper_.get(), data, - restarts, num_restarts, kDisableGlobalSequenceNumber, - block_contents_pinned); - key_includes_seq_ = key_includes_seq; - raw_key_.SetIsUserKey(!key_includes_seq_); + void Initialize(const Comparator* ucmp, const char* data, uint32_t restarts, + uint32_t num_restarts, SequenceNumber global_seqno, + BlockPrefixIndex* prefix_index, bool have_first_key, + bool key_includes_seq, bool value_is_full, + bool block_contents_pinned) { + InitializeBase(ucmp, data, restarts, num_restarts, + kDisableGlobalSequenceNumber, block_contents_pinned); + raw_key_.SetIsUserKey(!key_includes_seq); prefix_index_ = prefix_index; value_delta_encoded_ = !value_is_full; have_first_key_ = have_first_key; @@ -568,10 +613,8 @@ class IndexBlockIter final : public BlockIter { } Slice user_key() const override { - if (key_includes_seq_) { - return ExtractUserKey(key()); - } - return key(); + assert(Valid()); + return raw_key_.GetUserKey(); } IndexValue value() const override { @@ -588,6 +631,13 @@ class IndexBlockIter final : public BlockIter { } } + void Invalidate(Status s) { InvalidateBase(s); } + + bool IsValuePinned() const override { + return global_seqno_state_ != nullptr ? false : BlockIter::IsValuePinned(); + } + + protected: // IndexBlockIter follows a different contract for prefix iterator // from data iterators. // If prefix of the seek key `target` exists in the file, it must @@ -595,9 +645,9 @@ class IndexBlockIter final : public BlockIter { // If the prefix of `target` doesn't exist in the file, it can either // return the result of total order seek, or set both of Valid() = false // and status() = NotFound(). - void Seek(const Slice& target) override; + void SeekImpl(const Slice& target) override; - void SeekForPrev(const Slice&) override { + void SeekForPrevImpl(const Slice&) override { assert(false); current_ = restarts_; restart_index_ = num_restarts_; @@ -608,24 +658,15 @@ class IndexBlockIter final : public BlockIter { value_.clear(); } - void Prev() override; + void PrevImpl() override; - void Next() override; + void NextImpl() override; - void SeekToFirst() override; + void SeekToFirstImpl() override; - void SeekToLast() override; - - void Invalidate(Status s) { InvalidateBase(s); } - - bool IsValuePinned() const override { - return global_seqno_state_ != nullptr ? false : BlockIter::IsValuePinned(); - } + void SeekToLastImpl() override; private: - std::unique_ptr user_comparator_wrapper_; - // Key is in InternalKey format - bool key_includes_seq_; bool value_delta_encoded_; bool have_first_key_; // value includes first_internal_key BlockPrefixIndex* prefix_index_; diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index a3b9092d5..ddbec3560 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -1125,8 +1125,7 @@ Status BlockBasedTable::ReadMetaIndexBlock( *metaindex_block = std::move(metaindex); // meta block uses bytewise comparator. iter->reset(metaindex_block->get()->NewDataIterator( - BytewiseComparator(), BytewiseComparator(), - kDisableGlobalSequenceNumber)); + BytewiseComparator(), kDisableGlobalSequenceNumber)); return Status::OK(); } @@ -1381,15 +1380,14 @@ InternalIteratorBase* BlockBasedTable::NewIndexIterator( lookup_context); } - template <> DataBlockIter* BlockBasedTable::InitBlockIterator( const Rep* rep, Block* block, BlockType block_type, DataBlockIter* input_iter, bool block_contents_pinned) { - return block->NewDataIterator( - &rep->internal_comparator, rep->internal_comparator.user_comparator(), - rep->get_global_seqno(block_type), input_iter, rep->ioptions.statistics, - block_contents_pinned); + return block->NewDataIterator(rep->internal_comparator.user_comparator(), + rep->get_global_seqno(block_type), input_iter, + rep->ioptions.statistics, + block_contents_pinned); } template <> @@ -1397,14 +1395,13 @@ IndexBlockIter* BlockBasedTable::InitBlockIterator( const Rep* rep, Block* block, BlockType block_type, IndexBlockIter* input_iter, bool block_contents_pinned) { return block->NewIndexIterator( - &rep->internal_comparator, rep->internal_comparator.user_comparator(), + rep->internal_comparator.user_comparator(), rep->get_global_seqno(block_type), input_iter, rep->ioptions.statistics, /* total_order_seek */ true, rep->index_has_first_key, rep->index_key_includes_seq, rep->index_value_is_full, block_contents_pinned); } - // If contents is nullptr, this function looks up the block caches for the // data block referenced by handle, and read the block from disk if necessary. // If contents is non-null, it skips the cache lookup and disk read, since @@ -1945,7 +1942,7 @@ BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator( // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. return block->second.GetValue()->NewIndexIterator( - &rep->internal_comparator, rep->internal_comparator.user_comparator(), + rep->internal_comparator.user_comparator(), rep->get_global_seqno(BlockType::kIndex), nullptr, kNullStats, true, rep->index_has_first_key, rep->index_key_includes_seq, rep->index_value_is_full); diff --git a/table/block_based/block_test.cc b/table/block_based/block_test.cc index 9776f59e5..7521c1515 100644 --- a/table/block_based/block_test.cc +++ b/table/block_based/block_test.cc @@ -34,8 +34,9 @@ static std::string RandomString(Random *rnd, int len) { test::RandomString(rnd, len, &r); return r; } -std::string GenerateKey(int primary_key, int secondary_key, int padding_size, - Random *rnd) { + +std::string GenerateInternalKey(int primary_key, int secondary_key, + int padding_size, Random *rnd) { char buf[50]; char *p = &buf[0]; snprintf(buf, sizeof(buf), "%6d%4d", primary_key, secondary_key); @@ -43,6 +44,7 @@ std::string GenerateKey(int primary_key, int secondary_key, int padding_size, if (padding_size) { k += RandomString(rnd, padding_size); } + AppendInternalKeyFooter(&k, 0 /* seqno */, kTypeValue); return k; } @@ -61,7 +63,8 @@ void GenerateRandomKVs(std::vector *keys, for (int i = from; i < from + len; i += step) { // generating keys that shares the prefix for (int j = 0; j < keys_share_prefix; ++j) { - keys->emplace_back(GenerateKey(i, j, padding_size, &rnd)); + // `DataBlockIter` assumes it reads only internal keys. + keys->emplace_back(GenerateInternalKey(i, j, padding_size, &rnd)); // 100 bytes values values->emplace_back(RandomString(&rnd, 100)); @@ -97,8 +100,8 @@ TEST_F(BlockTest, SimpleTest) { // read contents of block sequentially int count = 0; - InternalIterator *iter = reader.NewDataIterator( - options.comparator, options.comparator, kDisableGlobalSequenceNumber); + InternalIterator *iter = + reader.NewDataIterator(options.comparator, kDisableGlobalSequenceNumber); for (iter->SeekToFirst(); iter->Valid(); count++, iter->Next()) { // read kv from block Slice k = iter->key(); @@ -111,8 +114,8 @@ TEST_F(BlockTest, SimpleTest) { delete iter; // read block contents randomly - iter = reader.NewDataIterator(options.comparator, options.comparator, - kDisableGlobalSequenceNumber); + iter = + reader.NewDataIterator(options.comparator, kDisableGlobalSequenceNumber); for (int i = 0; i < num_records; i++) { // find a random key in the lookaside array int index = rnd.Uniform(num_records); @@ -158,9 +161,8 @@ void CheckBlockContents(BlockContents contents, const int max_key, std::unique_ptr prefix_extractor( NewFixedPrefixTransform(prefix_size)); - std::unique_ptr regular_iter( - reader2.NewDataIterator(BytewiseComparator(), BytewiseComparator(), - kDisableGlobalSequenceNumber)); + std::unique_ptr regular_iter(reader2.NewDataIterator( + BytewiseComparator(), kDisableGlobalSequenceNumber)); // Seek existent keys for (size_t i = 0; i < keys.size(); i++) { @@ -177,7 +179,8 @@ void CheckBlockContents(BlockContents contents, const int max_key, // simply be set as invalid; whereas the binary search based iterator will // return the one that is closest. for (int i = 1; i < max_key - 1; i += 2) { - auto key = GenerateKey(i, 0, 0, nullptr); + // `DataBlockIter` assumes its APIs receive only internal keys. + auto key = GenerateInternalKey(i, 0, 0, nullptr); regular_iter->Seek(key); ASSERT_TRUE(regular_iter->Valid()); } @@ -382,8 +385,7 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) { // read contents of block sequentially size_t read_bytes = 0; DataBlockIter *iter = reader.NewDataIterator( - options.comparator, options.comparator, kDisableGlobalSequenceNumber, - nullptr, stats.get()); + options.comparator, kDisableGlobalSequenceNumber, nullptr, stats.get()); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { iter->value(); read_bytes += iter->TEST_CurrentEntrySize(); @@ -414,8 +416,7 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) { size_t read_bytes = 0; DataBlockIter *iter = reader.NewDataIterator( - options.comparator, options.comparator, kDisableGlobalSequenceNumber, - nullptr, stats.get()); + options.comparator, kDisableGlobalSequenceNumber, nullptr, stats.get()); for (int i = 0; i < num_records; i++) { Slice k(keys[i]); @@ -449,8 +450,7 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) { size_t read_bytes = 0; DataBlockIter *iter = reader.NewDataIterator( - options.comparator, options.comparator, kDisableGlobalSequenceNumber, - nullptr, stats.get()); + options.comparator, kDisableGlobalSequenceNumber, nullptr, stats.get()); std::unordered_set read_keys; for (int i = 0; i < num_records; i++) { int index = rnd.Uniform(num_records); @@ -574,9 +574,8 @@ TEST_P(IndexBlockTest, IndexValueEncodingTest) { Statistics *kNullStats = nullptr; // read contents of block sequentially InternalIteratorBase *iter = reader.NewIndexIterator( - options.comparator, options.comparator, kDisableGlobalSequenceNumber, - kNullIter, kNullStats, kTotalOrderSeek, includeFirstKey(), kIncludesSeq, - kValueIsFull); + options.comparator, kDisableGlobalSequenceNumber, kNullIter, kNullStats, + kTotalOrderSeek, includeFirstKey(), kIncludesSeq, kValueIsFull); iter->SeekToFirst(); for (int index = 0; index < num_records; ++index) { ASSERT_TRUE(iter->Valid()); @@ -595,10 +594,9 @@ TEST_P(IndexBlockTest, IndexValueEncodingTest) { delete iter; // read block contents randomly - iter = reader.NewIndexIterator(options.comparator, options.comparator, - kDisableGlobalSequenceNumber, kNullIter, - kNullStats, kTotalOrderSeek, includeFirstKey(), - kIncludesSeq, kValueIsFull); + iter = reader.NewIndexIterator( + options.comparator, kDisableGlobalSequenceNumber, kNullIter, kNullStats, + kTotalOrderSeek, includeFirstKey(), kIncludesSeq, kValueIsFull); for (int i = 0; i < num_records * 2; i++) { // find a random key in the lookaside array int index = rnd.Uniform(num_records); diff --git a/table/block_based/data_block_hash_index_test.cc b/table/block_based/data_block_hash_index_test.cc index 97971cd5b..409a5bdc2 100644 --- a/table/block_based/data_block_hash_index_test.cc +++ b/table/block_based/data_block_hash_index_test.cc @@ -391,7 +391,7 @@ TEST(DataBlockHashIndex, BlockTestSingleKey) { Block reader(std::move(contents)); const InternalKeyComparator icmp(BytewiseComparator()); - auto iter = reader.NewDataIterator(&icmp, icmp.user_comparator(), + auto iter = reader.NewDataIterator(icmp.user_comparator(), kDisableGlobalSequenceNumber); bool may_exist; // search in block for the key just inserted @@ -475,7 +475,7 @@ TEST(DataBlockHashIndex, BlockTestLarge) { // random seek existent keys for (int i = 0; i < num_records; i++) { - auto iter = reader.NewDataIterator(&icmp, icmp.user_comparator(), + auto iter = reader.NewDataIterator(icmp.user_comparator(), kDisableGlobalSequenceNumber); // find a random key in the lookaside array int index = rnd.Uniform(num_records); @@ -513,7 +513,7 @@ TEST(DataBlockHashIndex, BlockTestLarge) { // C true false for (int i = 0; i < num_records; i++) { - auto iter = reader.NewDataIterator(&icmp, icmp.user_comparator(), + auto iter = reader.NewDataIterator(icmp.user_comparator(), kDisableGlobalSequenceNumber); // find a random key in the lookaside array int index = rnd.Uniform(num_records); diff --git a/table/block_based/hash_index_reader.cc b/table/block_based/hash_index_reader.cc index d15cbd217..7a281edbf 100644 --- a/table/block_based/hash_index_reader.cc +++ b/table/block_based/hash_index_reader.cc @@ -133,7 +133,7 @@ InternalIteratorBase* HashIndexReader::NewIterator( // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. auto it = index_block.GetValue()->NewIndexIterator( - internal_comparator(), internal_comparator()->user_comparator(), + internal_comparator()->user_comparator(), rep->get_global_seqno(BlockType::kIndex), iter, kNullStats, total_order_seek, index_has_first_key(), index_key_includes_seq(), index_value_is_full(), false /* block_contents_pinned */, diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc index bdf9250db..dc25abbea 100644 --- a/table/block_based/partitioned_filter_block.cc +++ b/table/block_based/partitioned_filter_block.cc @@ -238,7 +238,7 @@ BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle( const InternalKeyComparator* const comparator = internal_comparator(); Statistics* kNullStats = nullptr; filter_block.GetValue()->NewIndexIterator( - comparator, comparator->user_comparator(), + comparator->user_comparator(), table()->get_rep()->get_global_seqno(BlockType::kFilter), &iter, kNullStats, true /* total_order_seek */, false /* have_first_key */, index_key_includes_seq(), index_value_is_full()); @@ -441,10 +441,10 @@ void PartitionedFilterBlockReader::CacheDependencies(const ReadOptions& ro, const InternalKeyComparator* const comparator = internal_comparator(); Statistics* kNullStats = nullptr; filter_block.GetValue()->NewIndexIterator( - comparator, comparator->user_comparator(), - rep->get_global_seqno(BlockType::kFilter), &biter, kNullStats, - true /* total_order_seek */, false /* have_first_key */, - index_key_includes_seq(), index_value_is_full()); + comparator->user_comparator(), rep->get_global_seqno(BlockType::kFilter), + &biter, kNullStats, true /* total_order_seek */, + false /* have_first_key */, index_key_includes_seq(), + index_value_is_full()); // Index partitions are assumed to be consecuitive. Prefetch them all. // Read the first block offset biter.SeekToFirst(); diff --git a/table/block_based/partitioned_index_reader.cc b/table/block_based/partitioned_index_reader.cc index f8bb9562f..e8cfd0cf6 100644 --- a/table/block_based/partitioned_index_reader.cc +++ b/table/block_based/partitioned_index_reader.cc @@ -70,7 +70,7 @@ InternalIteratorBase* PartitionIndexReader::NewIterator( new BlockBasedTable::PartitionedIndexIteratorState(table(), &partition_map_), index_block.GetValue()->NewIndexIterator( - internal_comparator(), internal_comparator()->user_comparator(), + internal_comparator()->user_comparator(), rep->get_global_seqno(BlockType::kIndex), nullptr, kNullStats, true, index_has_first_key(), index_key_includes_seq(), index_value_is_full())); @@ -82,7 +82,7 @@ InternalIteratorBase* PartitionIndexReader::NewIterator( // to set `block_contents_pinned`. std::unique_ptr> index_iter( index_block.GetValue()->NewIndexIterator( - internal_comparator(), internal_comparator()->user_comparator(), + internal_comparator()->user_comparator(), rep->get_global_seqno(BlockType::kIndex), nullptr, kNullStats, true, index_has_first_key(), index_key_includes_seq(), index_value_is_full())); @@ -126,7 +126,7 @@ void PartitionIndexReader::CacheDependencies(const ReadOptions& ro, bool pin) { // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. index_block.GetValue()->NewIndexIterator( - internal_comparator(), internal_comparator()->user_comparator(), + internal_comparator()->user_comparator(), rep->get_global_seqno(BlockType::kIndex), &biter, kNullStats, true, index_has_first_key(), index_key_includes_seq(), index_value_is_full()); // Index partitions are assumed to be consecuitive. Prefetch them all. diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 21c4c1fb0..acc9c80a8 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -231,7 +231,7 @@ Status ReadProperties(const ReadOptions& read_options, Block properties_block(std::move(block_contents)); DataBlockIter iter; - properties_block.NewDataIterator(BytewiseComparator(), BytewiseComparator(), + properties_block.NewDataIterator(BytewiseComparator(), kDisableGlobalSequenceNumber, &iter); auto new_table_properties = new TableProperties(); @@ -395,8 +395,7 @@ Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, // are to compress it. Block metaindex_block(std::move(metaindex_contents)); std::unique_ptr meta_iter(metaindex_block.NewDataIterator( - BytewiseComparator(), BytewiseComparator(), - kDisableGlobalSequenceNumber)); + BytewiseComparator(), kDisableGlobalSequenceNumber)); // -- Read property block bool found_properties_block = true; @@ -468,8 +467,7 @@ Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, std::unique_ptr meta_iter; meta_iter.reset(metaindex_block.NewDataIterator( - BytewiseComparator(), BytewiseComparator(), - kDisableGlobalSequenceNumber)); + BytewiseComparator(), kDisableGlobalSequenceNumber)); return FindMetaBlock(meta_iter.get(), meta_block_name, block_handle); } @@ -514,8 +512,7 @@ Status ReadMetaBlock(RandomAccessFileReader* file, std::unique_ptr meta_iter; meta_iter.reset(metaindex_block.NewDataIterator( - BytewiseComparator(), BytewiseComparator(), - kDisableGlobalSequenceNumber)); + BytewiseComparator(), kDisableGlobalSequenceNumber)); BlockHandle block_handle; status = FindMetaBlock(meta_iter.get(), meta_block_name, &block_handle); diff --git a/table/table_test.cc b/table/table_test.cc index b77845ce5..7bc53a584 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -211,47 +211,6 @@ class Constructor { stl_wrappers::KVMap data_; }; -class BlockConstructor: public Constructor { - public: - explicit BlockConstructor(const Comparator* cmp) - : Constructor(cmp), - comparator_(cmp), - block_(nullptr) { } - ~BlockConstructor() override { delete block_; } - Status FinishImpl(const Options& /*options*/, - const ImmutableCFOptions& /*ioptions*/, - const MutableCFOptions& /*moptions*/, - const BlockBasedTableOptions& table_options, - const InternalKeyComparator& /*internal_comparator*/, - const stl_wrappers::KVMap& kv_map) override { - delete block_; - block_ = nullptr; - BlockBuilder builder(table_options.block_restart_interval); - - for (const auto& kv : kv_map) { - builder.Add(kv.first, kv.second); - } - // Open the block - data_ = builder.Finish().ToString(); - BlockContents contents; - contents.data = data_; - block_ = new Block(std::move(contents)); - return Status::OK(); - } - InternalIterator* NewIterator( - const SliceTransform* /*prefix_extractor*/) const override { - return block_->NewDataIterator(comparator_, comparator_, - kDisableGlobalSequenceNumber); - } - - private: - const Comparator* comparator_; - std::string data_; - Block* block_; - - BlockConstructor(); -}; - // A helper class that converts internal format keys into user keys class KeyConvertingIterator : public InternalIterator { public: @@ -309,7 +268,56 @@ class KeyConvertingIterator : public InternalIterator { void operator=(const KeyConvertingIterator&); }; -class TableConstructor: public Constructor { +// `BlockConstructor` APIs always accept/return user keys. +class BlockConstructor : public Constructor { + public: + explicit BlockConstructor(const Comparator* cmp) + : Constructor(cmp), comparator_(cmp), block_(nullptr) {} + ~BlockConstructor() override { delete block_; } + Status FinishImpl(const Options& /*options*/, + const ImmutableCFOptions& /*ioptions*/, + const MutableCFOptions& /*moptions*/, + const BlockBasedTableOptions& table_options, + const InternalKeyComparator& /*internal_comparator*/, + const stl_wrappers::KVMap& kv_map) override { + delete block_; + block_ = nullptr; + BlockBuilder builder(table_options.block_restart_interval); + + for (const auto& kv : kv_map) { + // `DataBlockIter` assumes it reads only internal keys. `BlockConstructor` + // clients provide user keys, so we need to convert to internal key format + // before writing the data block. + ParsedInternalKey ikey(kv.first, kMaxSequenceNumber, kTypeValue); + std::string encoded; + AppendInternalKey(&encoded, ikey); + builder.Add(encoded, kv.second); + } + // Open the block + data_ = builder.Finish().ToString(); + BlockContents contents; + contents.data = data_; + block_ = new Block(std::move(contents)); + return Status::OK(); + } + InternalIterator* NewIterator( + const SliceTransform* /*prefix_extractor*/) const override { + // `DataBlockIter` returns the internal keys it reads. + // `KeyConvertingIterator` converts them to user keys before they are + // exposed to the `BlockConstructor` clients. + return new KeyConvertingIterator( + block_->NewDataIterator(comparator_, kDisableGlobalSequenceNumber)); + } + + private: + const Comparator* comparator_; + std::string data_; + Block* block_; + + BlockConstructor(); +}; + +class TableConstructor : public Constructor { public: explicit TableConstructor(const Comparator* cmp, bool convert_to_internal_key = false, @@ -4350,8 +4358,7 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { Block metaindex_block(std::move(metaindex_contents)); std::unique_ptr meta_iter(metaindex_block.NewDataIterator( - BytewiseComparator(), BytewiseComparator(), - kDisableGlobalSequenceNumber)); + BytewiseComparator(), kDisableGlobalSequenceNumber)); bool found_properties_block = true; ASSERT_OK(SeekToPropertiesBlock(meta_iter.get(), &found_properties_block)); ASSERT_TRUE(found_properties_block); @@ -4430,7 +4437,7 @@ TEST_P(BlockBasedTableTest, PropertiesMetaBlockLast) { // verify properties block comes last std::unique_ptr metaindex_iter{ - metaindex_block.NewDataIterator(options.comparator, options.comparator, + metaindex_block.NewDataIterator(options.comparator, kDisableGlobalSequenceNumber)}; uint64_t max_offset = 0; std::string key_at_max_offset; diff --git a/util/user_comparator_wrapper.h b/util/user_comparator_wrapper.h index e8d18237a..3b67debd0 100644 --- a/util/user_comparator_wrapper.h +++ b/util/user_comparator_wrapper.h @@ -17,6 +17,10 @@ namespace ROCKSDB_NAMESPACE { // perf_context.user_key_comparison_count. class UserComparatorWrapper final : public Comparator { public: + // `UserComparatorWrapper`s constructed with the default constructor are not + // usable and will segfault on any attempt to use them for comparisons. + UserComparatorWrapper() : user_comparator_(nullptr) {} + explicit UserComparatorWrapper(const Comparator* const user_cmp) : Comparator(user_cmp->timestamp_size()), user_comparator_(user_cmp) {}