// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. // // Decodes the blocks generated by block_builder.cc. #include "table/block_based/block.h" #include #include #include #include #include "monitoring/perf_context_imp.h" #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/comparator.h" #include "table/block_based/block_prefix_index.h" #include "table/block_based/data_block_footer.h" #include "table/format.h" #include "util/coding.h" namespace ROCKSDB_NAMESPACE { // Helper routine: decode the next block entry starting at "p", // storing the number of shared key bytes, non_shared key bytes, // and the length of the value in "*shared", "*non_shared", and // "*value_length", respectively. Will not dereference past "limit". // // If any errors are detected, returns nullptr. Otherwise, returns a // pointer to the key delta (just past the three decoded values). struct DecodeEntry { inline const char* operator()(const char* p, const char* limit, uint32_t* shared, uint32_t* non_shared, uint32_t* value_length) { // We need 2 bytes for shared and non_shared size. We also need one more // byte either for value size or the actual value in case of value delta // encoding. assert(limit - p >= 3); *shared = reinterpret_cast(p)[0]; *non_shared = reinterpret_cast(p)[1]; *value_length = reinterpret_cast(p)[2]; if ((*shared | *non_shared | *value_length) < 128) { // Fast path: all three values are encoded in one byte each p += 3; } else { if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr; if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr; if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) { return nullptr; } } // Using an assert in place of "return null" since we should not pay the // cost of checking for corruption on every single key decoding assert(!(static_cast(limit - p) < (*non_shared + *value_length))); return p; } }; // Helper routine: similar to DecodeEntry but does not have assertions. // Instead, returns nullptr so that caller can detect and report failure. struct CheckAndDecodeEntry { inline const char* operator()(const char* p, const char* limit, uint32_t* shared, uint32_t* non_shared, uint32_t* value_length) { // We need 2 bytes for shared and non_shared size. We also need one more // byte either for value size or the actual value in case of value delta // encoding. if (limit - p < 3) { return nullptr; } *shared = reinterpret_cast(p)[0]; *non_shared = reinterpret_cast(p)[1]; *value_length = reinterpret_cast(p)[2]; if ((*shared | *non_shared | *value_length) < 128) { // Fast path: all three values are encoded in one byte each p += 3; } else { if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr; if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr; if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) { return nullptr; } } if (static_cast(limit - p) < (*non_shared + *value_length)) { return nullptr; } return p; } }; struct DecodeKey { inline const char* operator()(const char* p, const char* limit, uint32_t* shared, uint32_t* non_shared) { uint32_t value_length; return DecodeEntry()(p, limit, shared, non_shared, &value_length); } }; // In format_version 4, which is used by index blocks, the value size is not // encoded before the entry, as the value is known to be the handle with the // known size. struct DecodeKeyV4 { inline const char* operator()(const char* p, const char* limit, uint32_t* shared, uint32_t* non_shared) { // We need 2 bytes for shared and non_shared size. We also need one more // byte either for value size or the actual value in case of value delta // encoding. if (limit - p < 3) return nullptr; *shared = reinterpret_cast(p)[0]; *non_shared = reinterpret_cast(p)[1]; if ((*shared | *non_shared) < 128) { // Fast path: all three values are encoded in one byte each p += 2; } else { if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr; if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr; } return p; } }; struct DecodeEntryV4 { inline const char* operator()(const char* p, const char* limit, uint32_t* shared, uint32_t* non_shared, uint32_t* value_length) { assert(value_length); *value_length = 0; return DecodeKeyV4()(p, limit, shared, non_shared); } }; void DataBlockIter::NextImpl() { #ifndef NDEBUG if (TEST_Corrupt_Callback("DataBlockIter::NextImpl")) return; #endif bool is_shared = false; ParseNextDataKey(&is_shared); ++cur_entry_idx_; } void MetaBlockIter::NextImpl() { bool is_shared = false; ParseNextKey(&is_shared); ++cur_entry_idx_; } void IndexBlockIter::NextImpl() { ParseNextIndexKey(); ++cur_entry_idx_; } void IndexBlockIter::PrevImpl() { assert(Valid()); // Scan backwards to a restart point before current_ const uint32_t original = current_; while (GetRestartPoint(restart_index_) >= original) { if (restart_index_ == 0) { // No more entries current_ = restarts_; restart_index_ = num_restarts_; return; } restart_index_--; } SeekToRestartPoint(restart_index_); // Loop until end of current entry hits the start of original entry while (ParseNextIndexKey() && NextEntryOffset() < original) { } --cur_entry_idx_; } void MetaBlockIter::PrevImpl() { assert(Valid()); // Scan backwards to a restart point before current_ const uint32_t original = current_; while (GetRestartPoint(restart_index_) >= original) { if (restart_index_ == 0) { // No more entries current_ = restarts_; restart_index_ = num_restarts_; return; } restart_index_--; } SeekToRestartPoint(restart_index_); bool is_shared = false; // Loop until end of current entry hits the start of original entry while (ParseNextKey(&is_shared) && NextEntryOffset() < original) { } --cur_entry_idx_; } // Similar to IndexBlockIter::PrevImpl but also caches the prev entries void DataBlockIter::PrevImpl() { assert(Valid()); assert(prev_entries_idx_ == -1 || static_cast(prev_entries_idx_) < prev_entries_.size()); --cur_entry_idx_; // Check if we can use cached prev_entries_ if (prev_entries_idx_ > 0 && prev_entries_[prev_entries_idx_].offset == current_) { // Read cached CachedPrevEntry prev_entries_idx_--; const CachedPrevEntry& current_prev_entry = prev_entries_[prev_entries_idx_]; const char* key_ptr = nullptr; bool raw_key_cached; if (current_prev_entry.key_ptr != nullptr) { // The key is not delta encoded and stored in the data block key_ptr = current_prev_entry.key_ptr; raw_key_cached = false; } else { // The key is delta encoded and stored in prev_entries_keys_buff_ key_ptr = prev_entries_keys_buff_.data() + current_prev_entry.key_offset; raw_key_cached = true; } const Slice current_key(key_ptr, current_prev_entry.key_size); current_ = current_prev_entry.offset; // TODO(ajkr): the copy when `raw_key_cached` is done here for convenience, // not necessity. It is convenient since this class treats keys as pinned // when `raw_key_` points to an outside buffer. So we cannot allow // `raw_key_` point into Prev cache as it is a transient outside buffer // (i.e., keys in it are not actually pinned). raw_key_.SetKey(current_key, raw_key_cached /* copy */); value_ = current_prev_entry.value; return; } // Clear prev entries cache prev_entries_idx_ = -1; prev_entries_.clear(); prev_entries_keys_buff_.clear(); // Scan backwards to a restart point before current_ const uint32_t original = current_; while (GetRestartPoint(restart_index_) >= original) { if (restart_index_ == 0) { // No more entries current_ = restarts_; restart_index_ = num_restarts_; return; } restart_index_--; } SeekToRestartPoint(restart_index_); do { bool is_shared = false; if (!ParseNextDataKey(&is_shared)) { break; } Slice current_key = raw_key_.GetKey(); if (raw_key_.IsKeyPinned()) { // The key is not delta encoded prev_entries_.emplace_back(current_, current_key.data(), 0, current_key.size(), value()); } else { // The key is delta encoded, cache decoded key in buffer size_t new_key_offset = prev_entries_keys_buff_.size(); prev_entries_keys_buff_.append(current_key.data(), current_key.size()); prev_entries_.emplace_back(current_, nullptr, new_key_offset, current_key.size(), value()); } // Loop until end of current entry hits the start of original entry } while (NextEntryOffset() < original); prev_entries_idx_ = static_cast(prev_entries_.size()) - 1; } void DataBlockIter::SeekImpl(const Slice& target) { Slice seek_key = target; PERF_TIMER_GUARD(block_seek_nanos); if (data_ == nullptr) { // Not init yet return; } uint32_t index = 0; bool skip_linear_scan = false; bool ok = BinarySeek(seek_key, &index, &skip_linear_scan); if (!ok) { return; } FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); } void MetaBlockIter::SeekImpl(const Slice& target) { Slice seek_key = target; PERF_TIMER_GUARD(block_seek_nanos); if (data_ == nullptr) { // Not init yet return; } uint32_t index = 0; bool skip_linear_scan = false; bool ok = BinarySeek(seek_key, &index, &skip_linear_scan); if (!ok) { return; } FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); } // Optimized Seek for point lookup for an internal key `target` // target = "seek_user_key @ type | seqno". // // For any type other than kTypeValue, kTypeDeletion, kTypeSingleDeletion, // kTypeBlobIndex, kTypeWideColumnEntity or kTypeMerge, this function behaves // identically to Seek(). // // For any type in kTypeValue, kTypeDeletion, kTypeSingleDeletion, // kTypeBlobIndex, kTypeWideColumnEntity, or kTypeMerge: // // If the return value is FALSE, iter location is undefined, and it means: // 1) there is no key in this block falling into the range: // ["seek_user_key @ type | seqno", "seek_user_key @ kTypeDeletion | 0"], // inclusive; AND // 2) the last key of this block has a greater user_key from seek_user_key // // If the return value is TRUE, iter location has two possibilities: // 1) If iter is valid, it is set to a location as if set by SeekImpl(target). // In this case, it points to the first key with a larger user_key or a // matching user_key with a seqno no greater than the seeking seqno. // 2) If the iter is invalid, it means that either all the user_key is less // than the seek_user_key, or the block ends with a matching user_key but // with a smaller [ type | seqno ] (i.e. a larger seqno, or the same seqno // but larger type). bool DataBlockIter::SeekForGetImpl(const Slice& target) { Slice target_user_key = ExtractUserKey(target); uint32_t map_offset = restarts_ + num_restarts_ * sizeof(uint32_t); uint8_t entry = data_block_hash_index_->Lookup(data_, map_offset, target_user_key); if (entry == kCollision) { // HashSeek not effective, falling back SeekImpl(target); return true; } if (entry == kNoEntry) { // Even if we cannot find the user_key in this block, the result may // exist in the next block. Consider this example: // // Block N: [aab@100, ... , app@120] // boundary key: axy@50 (we make minimal assumption about a boundary key) // Block N+1: [axy@10, ... ] // // If seek_key = axy@60, the search will start from Block N. // Even if the user_key is not found in the hash map, the caller still // have to continue searching the next block. // // In this case, we pretend the key is in the last restart interval. // The while-loop below will search the last restart interval for the // key. It will stop at the first key that is larger than the seek_key, // or to the end of the block if no one is larger. entry = static_cast(num_restarts_ - 1); } uint32_t restart_index = entry; // check if the key is in the restart_interval assert(restart_index < num_restarts_); SeekToRestartPoint(restart_index); current_ = GetRestartPoint(restart_index); cur_entry_idx_ = static_cast(restart_index * block_restart_interval_) - 1; uint32_t limit = restarts_; if (restart_index + 1 < num_restarts_) { limit = GetRestartPoint(restart_index + 1); } while (current_ < limit) { ++cur_entry_idx_; bool shared; // Here we only linear seek the target key inside the restart interval. // If a key does not exist inside a restart interval, we avoid // further searching the block content across restart interval boundary. // // TODO(fwu): check the left and right boundary of the restart interval // to avoid linear seek a target key that is out of range. if (!ParseNextDataKey(&shared) || CompareCurrentKey(target) >= 0) { // we stop at the first potential matching user key. break; } // If the loop exits due to CompareCurrentKey(target) >= 0, then current key // exists, and its checksum verification will be done in UpdateKey() called // in SeekForGet(). // TODO(cbi): If this loop exits with current_ == restart_, per key-value // checksum will not be verified in UpdateKey() since Valid() // will return false. } if (current_ == restarts_) { // Search reaches to the end of the block. There are three possibilities: // 1) there is only one user_key match in the block (otherwise collision). // the matching user_key resides in the last restart interval, and it // is the last key of the restart interval and of the block as well. // ParseNextKey() skipped it as its [ type | seqno ] is smaller. // // 2) The seek_key is not found in the HashIndex Lookup(), i.e. kNoEntry, // AND all existing user_keys in the restart interval are smaller than // seek_user_key. // // 3) The seek_key is a false positive and happens to be hashed to the // last restart interval, AND all existing user_keys in the restart // interval are smaller than seek_user_key. // // The result may exist in the next block each case, so we return true. return true; } if (icmp_->user_comparator()->Compare(raw_key_.GetUserKey(), target_user_key) != 0) { // the key is not in this block and cannot be at the next block either. return false; } // Here we are conservative and only support a limited set of cases ValueType value_type = ExtractValueType(raw_key_.GetInternalKey()); if (value_type != ValueType::kTypeValue && value_type != ValueType::kTypeDeletion && value_type != ValueType::kTypeMerge && value_type != ValueType::kTypeSingleDeletion && value_type != ValueType::kTypeBlobIndex && value_type != ValueType::kTypeWideColumnEntity) { SeekImpl(target); } // Result found, and the iter is correctly set. return true; } void IndexBlockIter::SeekImpl(const Slice& target) { #ifndef NDEBUG if (TEST_Corrupt_Callback("IndexBlockIter::SeekImpl")) return; #endif TEST_SYNC_POINT("IndexBlockIter::Seek:0"); PERF_TIMER_GUARD(block_seek_nanos); if (data_ == nullptr) { // Not init yet return; } Slice seek_key = target; if (raw_key_.IsUserKey()) { seek_key = ExtractUserKey(target); } status_ = Status::OK(); uint32_t index = 0; bool skip_linear_scan = false; bool ok = false; if (prefix_index_) { bool prefix_may_exist = true; ok = PrefixSeek(target, &index, &prefix_may_exist); if (!prefix_may_exist) { // This is to let the caller to distinguish between non-existing prefix, // and when key is larger than the last key, which both set Valid() to // false. current_ = restarts_; status_ = Status::NotFound(); } // restart interval must be one when hash search is enabled so the binary // search simply lands at the right place. skip_linear_scan = true; } else if (value_delta_encoded_) { ok = BinarySeek(seek_key, &index, &skip_linear_scan); } else { ok = BinarySeek(seek_key, &index, &skip_linear_scan); } if (!ok) { return; } FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); } void DataBlockIter::SeekForPrevImpl(const Slice& target) { PERF_TIMER_GUARD(block_seek_nanos); Slice seek_key = target; if (data_ == nullptr) { // Not init yet return; } uint32_t index = 0; bool skip_linear_scan = false; bool ok = BinarySeek(seek_key, &index, &skip_linear_scan); if (!ok) { return; } FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); if (!Valid()) { if (status_.ok()) { SeekToLastImpl(); } } else { while (Valid() && CompareCurrentKey(seek_key) > 0) { PrevImpl(); } } } void MetaBlockIter::SeekForPrevImpl(const Slice& target) { PERF_TIMER_GUARD(block_seek_nanos); Slice seek_key = target; if (data_ == nullptr) { // Not init yet return; } uint32_t index = 0; bool skip_linear_scan = false; bool ok = BinarySeek(seek_key, &index, &skip_linear_scan); if (!ok) { return; } FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); if (!Valid()) { if (status_.ok()) { SeekToLastImpl(); } } else { while (Valid() && CompareCurrentKey(seek_key) > 0) { PrevImpl(); } } } void DataBlockIter::SeekToFirstImpl() { if (data_ == nullptr) { // Not init yet return; } SeekToRestartPoint(0); bool is_shared = false; ParseNextDataKey(&is_shared); cur_entry_idx_ = 0; } void MetaBlockIter::SeekToFirstImpl() { if (data_ == nullptr) { // Not init yet return; } SeekToRestartPoint(0); bool is_shared = false; ParseNextKey(&is_shared); cur_entry_idx_ = 0; } void IndexBlockIter::SeekToFirstImpl() { #ifndef NDEBUG if (TEST_Corrupt_Callback("IndexBlockIter::SeekToFirstImpl")) return; #endif if (data_ == nullptr) { // Not init yet return; } status_ = Status::OK(); SeekToRestartPoint(0); ParseNextIndexKey(); cur_entry_idx_ = 0; } void DataBlockIter::SeekToLastImpl() { if (data_ == nullptr) { // Not init yet return; } SeekToRestartPoint(num_restarts_ - 1); bool is_shared = false; cur_entry_idx_ = (num_restarts_ - 1) * block_restart_interval_; while (ParseNextDataKey(&is_shared) && NextEntryOffset() < restarts_) { // Keep skipping ++cur_entry_idx_; } } void MetaBlockIter::SeekToLastImpl() { if (data_ == nullptr) { // Not init yet return; } SeekToRestartPoint(num_restarts_ - 1); bool is_shared = false; assert(num_restarts_ >= 1); cur_entry_idx_ = static_cast((num_restarts_ - 1) * block_restart_interval_); while (ParseNextKey(&is_shared) && NextEntryOffset() < restarts_) { // Will probably never reach here since restart_interval is always 1 ++cur_entry_idx_; } } void IndexBlockIter::SeekToLastImpl() { if (data_ == nullptr) { // Not init yet return; } status_ = Status::OK(); SeekToRestartPoint(num_restarts_ - 1); cur_entry_idx_ = (num_restarts_ - 1) * block_restart_interval_; while (ParseNextIndexKey() && NextEntryOffset() < restarts_) { ++cur_entry_idx_; } } template template bool BlockIter::ParseNextKey(bool* is_shared) { current_ = NextEntryOffset(); const char* p = data_ + current_; const char* limit = data_ + restarts_; // Restarts come right after data if (p >= limit) { // No more entries to return. Mark as invalid. current_ = restarts_; restart_index_ = num_restarts_; return false; } // Decode next entry uint32_t shared, non_shared, value_length; p = DecodeEntryFunc()(p, limit, &shared, &non_shared, &value_length); if (p == nullptr || raw_key_.Size() < shared) { CorruptionError(); return false; } else { if (shared == 0) { *is_shared = false; // If this key doesn't share any bytes with prev key, and no min timestamp // needs to be padded to the key, then we don't need to decode it and // can use its address in the block directly (no copy). UpdateRawKeyAndMaybePadMinTimestamp(Slice(p, non_shared)); } else { // This key share `shared` bytes with prev key, we need to decode it *is_shared = true; // If user-defined timestamp is stripped from user key before keys are // delta encoded, the decoded key consisting of the shared and non shared // bytes do not have user-defined timestamp yet. We need to pad min // timestamp to it. if (pad_min_timestamp_) { raw_key_.TrimAppendWithTimestamp(shared, p, non_shared, ts_sz_); } else { raw_key_.TrimAppend(shared, p, non_shared); } } value_ = Slice(p + non_shared, value_length); if (shared == 0) { while (restart_index_ + 1 < num_restarts_ && GetRestartPoint(restart_index_ + 1) < current_) { ++restart_index_; } } // else we are in the middle of a restart interval and the restart_index_ // thus has not changed return true; } } bool DataBlockIter::ParseNextDataKey(bool* is_shared) { if (ParseNextKey(is_shared)) { #ifndef NDEBUG if (global_seqno_ != kDisableGlobalSequenceNumber) { // If we are reading a file with a global sequence number we should // expect that all encoded sequence numbers are zeros and any value // type is kTypeValue, kTypeMerge, kTypeDeletion, // kTypeDeletionWithTimestamp, or kTypeRangeDeletion. uint64_t packed = ExtractInternalKeyFooter(raw_key_.GetKey()); SequenceNumber seqno; ValueType value_type; UnPackSequenceAndType(packed, &seqno, &value_type); assert(value_type == ValueType::kTypeValue || value_type == ValueType::kTypeMerge || value_type == ValueType::kTypeDeletion || value_type == ValueType::kTypeDeletionWithTimestamp || value_type == ValueType::kTypeRangeDeletion); assert(seqno == 0); } #endif // NDEBUG return true; } else { return false; } } bool IndexBlockIter::ParseNextIndexKey() { bool is_shared = false; bool ok = (value_delta_encoded_) ? ParseNextKey(&is_shared) : ParseNextKey(&is_shared); if (ok) { if (value_delta_encoded_ || global_seqno_state_ != nullptr || pad_min_timestamp_) { DecodeCurrentValue(is_shared); } } return ok; } // The format: // restart_point 0: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) // restart_point 1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) // ... // restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) // where, k is key, v is value, and its encoding is in parentheses. // The format of each key is (shared_size, non_shared_size, shared, non_shared) // The format of each value, i.e., block handle, is (offset, size) whenever the // is_shared is false, which included the first entry in each restart point. // Otherwise, the format is delta-size = the size of current block - the size o // last block. void IndexBlockIter::DecodeCurrentValue(bool is_shared) { Slice v(value_.data(), data_ + restarts_ - value_.data()); // Delta encoding is used if `shared` != 0. Status decode_s __attribute__((__unused__)) = decoded_value_.DecodeFrom( &v, have_first_key_, (value_delta_encoded_ && is_shared) ? &decoded_value_.handle : nullptr); assert(decode_s.ok()); value_ = Slice(value_.data(), v.data() - value_.data()); if (global_seqno_state_ != nullptr) { // Overwrite sequence number the same way as in DataBlockIter. IterKey& first_internal_key = global_seqno_state_->first_internal_key; first_internal_key.SetInternalKey(decoded_value_.first_internal_key, /* copy */ true); assert(GetInternalKeySeqno(first_internal_key.GetInternalKey()) == 0); ValueType value_type = ExtractValueType(first_internal_key.GetKey()); assert(value_type == ValueType::kTypeValue || value_type == ValueType::kTypeMerge || value_type == ValueType::kTypeDeletion || value_type == ValueType::kTypeRangeDeletion); first_internal_key.UpdateInternalKey(global_seqno_state_->global_seqno, value_type); decoded_value_.first_internal_key = first_internal_key.GetKey(); } if (pad_min_timestamp_ && !decoded_value_.first_internal_key.empty()) { first_internal_key_with_ts_.clear(); PadInternalKeyWithMinTimestamp(&first_internal_key_with_ts_, decoded_value_.first_internal_key, ts_sz_); decoded_value_.first_internal_key = first_internal_key_with_ts_; } } template void BlockIter::FindKeyAfterBinarySeek(const Slice& target, uint32_t index, bool skip_linear_scan) { // SeekToRestartPoint() only does the lookup in the restart block. We need // to follow it up with NextImpl() to position the iterator at the restart // key. SeekToRestartPoint(index); cur_entry_idx_ = static_cast(index * block_restart_interval_) - 1; NextImpl(); if (!skip_linear_scan) { // Linear search (within restart block) for first key >= target uint32_t max_offset; if (index + 1 < num_restarts_) { // We are in a non-last restart interval. Since `BinarySeek()` guarantees // the next restart key is strictly greater than `target`, we can // terminate upon reaching it without any additional key comparison. max_offset = GetRestartPoint(index + 1); } else { // We are in the last restart interval. The while-loop will terminate by // `Valid()` returning false upon advancing past the block's last key. max_offset = std::numeric_limits::max(); } while (true) { NextImpl(); if (!Valid()) { // TODO(cbi): per key-value checksum will not be verified in UpdateKey() // since Valid() will returns false. break; } if (current_ == max_offset) { assert(CompareCurrentKey(target) > 0); break; } else if (CompareCurrentKey(target) >= 0) { break; } } } } // Binary searches in restart array to find the starting restart point for the // linear scan, and stores it in `*index`. Assumes restart array does not // contain duplicate keys. It is guaranteed that the restart key at `*index + 1` // is strictly greater than `target` or does not exist (this can be used to // elide a comparison when linear scan reaches all the way to the next restart // key). Furthermore, `*skip_linear_scan` is set to indicate whether the // `*index`th restart key is the final result so that key does not need to be // compared again later. template template bool BlockIter::BinarySeek(const Slice& target, uint32_t* index, bool* skip_linear_scan) { if (restarts_ == 0) { // SST files dedicated to range tombstones are written with index blocks // that have no keys while also having `num_restarts_ == 1`. This would // cause a problem for `BinarySeek()` as it'd try to access the first key // which does not exist. We identify such blocks by the offset at which // their restarts are stored, and return false to prevent any attempted // key accesses. return false; } *skip_linear_scan = false; // Loop invariants: // - Restart key at index `left` is less than or equal to the target key. The // sentinel index `-1` is considered to have a key that is less than all // keys. // - Any restart keys after index `right` are strictly greater than the target // key. int64_t left = -1, right = num_restarts_ - 1; while (left != right) { // The `mid` is computed by rounding up so it lands in (`left`, `right`]. int64_t mid = left + (right - left + 1) / 2; uint32_t region_offset = GetRestartPoint(static_cast(mid)); uint32_t shared, non_shared; const char* key_ptr = DecodeKeyFunc()( data_ + region_offset, data_ + restarts_, &shared, &non_shared); if (key_ptr == nullptr || (shared != 0)) { CorruptionError(); return false; } Slice mid_key(key_ptr, non_shared); UpdateRawKeyAndMaybePadMinTimestamp(mid_key); int cmp = CompareCurrentKey(target); if (cmp < 0) { // Key at "mid" is smaller than "target". Therefore all // blocks before "mid" are uninteresting. left = mid; } else if (cmp > 0) { // Key at "mid" is >= "target". Therefore all blocks at or // after "mid" are uninteresting. right = mid - 1; } else { *skip_linear_scan = true; left = right = mid; } } if (left == -1) { // All keys in the block were strictly greater than `target`. So the very // first key in the block is the final seek result. *skip_linear_scan = true; *index = 0; } else { *index = static_cast(left); } return true; } // Compare target key and the block key of the block of `block_index`. // Return -1 if error. int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) { uint32_t region_offset = GetRestartPoint(block_index); uint32_t shared, non_shared; const char* key_ptr = value_delta_encoded_ ? DecodeKeyV4()(data_ + region_offset, data_ + restarts_, &shared, &non_shared) : DecodeKey()(data_ + region_offset, data_ + restarts_, &shared, &non_shared); if (key_ptr == nullptr || (shared != 0)) { CorruptionError(); return 1; // Return target is smaller } Slice block_key(key_ptr, non_shared); raw_key_.SetKey(block_key, false /* copy */); return CompareCurrentKey(target); } // Binary search in block_ids to find the first block // with a key >= target bool IndexBlockIter::BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids, uint32_t left, uint32_t right, uint32_t* index, bool* prefix_may_exist) { assert(left <= right); assert(index); assert(prefix_may_exist); *prefix_may_exist = true; uint32_t left_bound = left; while (left <= right) { uint32_t mid = (right + left) / 2; int cmp = CompareBlockKey(block_ids[mid], target); if (!status_.ok()) { return false; } if (cmp < 0) { // Key at "target" is larger than "mid". Therefore all // blocks before or at "mid" are uninteresting. left = mid + 1; } else { // Key at "target" is <= "mid". Therefore all blocks // after "mid" are uninteresting. // If there is only one block left, we found it. if (left == right) break; right = mid; } } if (left == right) { // In one of the two following cases: // (1) left is the first one of block_ids // (2) there is a gap of blocks between block of `left` and `left-1`. // we can further distinguish the case of key in the block or key not // existing, by comparing the target key and the key of the previous // block to the left of the block found. if (block_ids[left] > 0 && (left == left_bound || block_ids[left - 1] != block_ids[left] - 1) && CompareBlockKey(block_ids[left] - 1, target) > 0) { current_ = restarts_; *prefix_may_exist = false; return false; } *index = block_ids[left]; return true; } else { assert(left > right); // If the next block key is larger than seek key, it is possible that // no key shares the prefix with `target`, or all keys with the same // prefix as `target` are smaller than prefix. In the latter case, // we are mandated to set the position the same as the total order. // In the latter case, either: // (1) `target` falls into the range of the next block. In this case, // we can place the iterator to the next block, or // (2) `target` is larger than all block keys. In this case we can // keep the iterator invalidate without setting `prefix_may_exist` // to false. // We might sometimes end up with setting the total order position // while there is no key sharing the prefix as `target`, but it // still follows the contract. uint32_t right_index = block_ids[right]; assert(right_index + 1 <= num_restarts_); if (right_index + 1 < num_restarts_) { if (CompareBlockKey(right_index + 1, target) >= 0) { *index = right_index + 1; return true; } else { // We have to set the flag here because we are not positioning // the iterator to the total order position. *prefix_may_exist = false; } } // Mark iterator invalid current_ = restarts_; return false; } } bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index, bool* prefix_may_exist) { assert(index); assert(prefix_may_exist); assert(prefix_index_); *prefix_may_exist = true; Slice seek_key = target; if (raw_key_.IsUserKey()) { seek_key = ExtractUserKey(target); } uint32_t* block_ids = nullptr; uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids); if (num_blocks == 0) { current_ = restarts_; *prefix_may_exist = false; return false; } else { assert(block_ids); return BinaryBlockIndexSeek(seek_key, block_ids, 0, num_blocks - 1, index, prefix_may_exist); } } uint32_t Block::NumRestarts() const { assert(size_ >= 2 * sizeof(uint32_t)); uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t)); uint32_t num_restarts = block_footer; if (size_ > kMaxBlockSizeSupportedByHashIndex) { // In BlockBuilder, we have ensured a block with HashIndex is less than // kMaxBlockSizeSupportedByHashIndex (64KiB). // // Therefore, if we encounter a block with a size > 64KiB, the block // cannot have HashIndex. So the footer will directly interpreted as // num_restarts. // // Such check is for backward compatibility. We can ensure legacy block // with a vary large num_restarts i.e. >= 0x80000000 can be interpreted // correctly as no HashIndex even if the MSB of num_restarts is set. return num_restarts; } BlockBasedTableOptions::DataBlockIndexType index_type; UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts); return num_restarts; } BlockBasedTableOptions::DataBlockIndexType Block::IndexType() const { assert(size_ >= 2 * sizeof(uint32_t)); if (size_ > kMaxBlockSizeSupportedByHashIndex) { // The check is for the same reason as that in NumRestarts() return BlockBasedTableOptions::kDataBlockBinarySearch; } uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t)); uint32_t num_restarts = block_footer; BlockBasedTableOptions::DataBlockIndexType index_type; UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts); return index_type; } Block::~Block() { // This sync point can be re-enabled if RocksDB can control the // initialization order of any/all static options created by the user. // TEST_SYNC_POINT("Block::~Block"); delete[] kv_checksum_; } Block::Block(BlockContents&& contents, size_t read_amp_bytes_per_bit, Statistics* statistics) : contents_(std::move(contents)), data_(contents_.data.data()), size_(contents_.data.size()), restart_offset_(0), num_restarts_(0) { TEST_SYNC_POINT("Block::Block:0"); if (size_ < sizeof(uint32_t)) { size_ = 0; // Error marker } else { // Should only decode restart points for uncompressed blocks num_restarts_ = NumRestarts(); switch (IndexType()) { case BlockBasedTableOptions::kDataBlockBinarySearch: restart_offset_ = static_cast(size_) - (1 + num_restarts_) * sizeof(uint32_t); if (restart_offset_ > size_ - sizeof(uint32_t)) { // The size is too small for NumRestarts() and therefore // restart_offset_ wrapped around. size_ = 0; } break; case BlockBasedTableOptions::kDataBlockBinaryAndHash: if (size_ < sizeof(uint32_t) /* block footer */ + sizeof(uint16_t) /* NUM_BUCK */) { size_ = 0; break; } uint16_t map_offset; data_block_hash_index_.Initialize( contents.data.data(), static_cast(contents.data.size() - sizeof(uint32_t)), /*chop off NUM_RESTARTS*/ &map_offset); restart_offset_ = map_offset - num_restarts_ * sizeof(uint32_t); if (restart_offset_ > map_offset) { // map_offset is too small for NumRestarts() and // therefore restart_offset_ wrapped around. size_ = 0; break; } break; default: size_ = 0; // Error marker } } if (read_amp_bytes_per_bit != 0 && statistics && size_ != 0) { read_amp_bitmap_.reset(new BlockReadAmpBitmap( restart_offset_, read_amp_bytes_per_bit, statistics)); } } void Block::InitializeDataBlockProtectionInfo(uint8_t protection_bytes_per_key, const Comparator* raw_ucmp) { protection_bytes_per_key_ = 0; if (protection_bytes_per_key > 0 && num_restarts_ > 0) { // NewDataIterator() is called with protection_bytes_per_key_ = 0. // This is intended since checksum is not constructed yet. // // We do not know global_seqno yet, so checksum computation and // verification all assume global_seqno = 0. // TODO(yuzhangyu): handle the implication of padding timestamp for kv // protection. std::unique_ptr iter{NewDataIterator( raw_ucmp, kDisableGlobalSequenceNumber, nullptr /* iter */, nullptr /* stats */, true /* block_contents_pinned */, true /* user_defined_timestamps_persisted */)}; if (iter->status().ok()) { block_restart_interval_ = iter->GetRestartInterval(); } uint32_t num_keys = 0; if (iter->status().ok()) { num_keys = iter->NumberOfKeys(block_restart_interval_); } if (iter->status().ok()) { checksum_size_ = num_keys * protection_bytes_per_key; kv_checksum_ = new char[(size_t)checksum_size_]; size_t i = 0; iter->SeekToFirst(); while (iter->Valid()) { GenerateKVChecksum(kv_checksum_ + i, protection_bytes_per_key, iter->key(), iter->value()); iter->Next(); i += protection_bytes_per_key; } assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key); } if (!iter->status().ok()) { size_ = 0; // Error marker return; } protection_bytes_per_key_ = protection_bytes_per_key; } } void Block::InitializeIndexBlockProtectionInfo(uint8_t protection_bytes_per_key, const Comparator* raw_ucmp, bool value_is_full, bool index_has_first_key) { protection_bytes_per_key_ = 0; if (num_restarts_ > 0 && protection_bytes_per_key > 0) { // Note that `global_seqno` and `key_includes_seq` are hardcoded here. They // do not impact how the index block is parsed. During checksum // construction/verification, we use the entire key buffer from // raw_key_.GetKey() returned by iter->key() as the `key` part of key-value // checksum, and the content of this buffer do not change for different // values of `global_seqno` or `key_includes_seq`. // TODO(yuzhangyu): handle the implication of padding timestamp for kv // protection. std::unique_ptr iter{NewIndexIterator( raw_ucmp, kDisableGlobalSequenceNumber /* global_seqno */, nullptr, nullptr /* Statistics */, true /* total_order_seek */, index_has_first_key /* have_first_key */, false /* key_includes_seq */, value_is_full, true /* block_contents_pinned */, true /* user_defined_timestamps_persisted*/, nullptr /* prefix_index */)}; if (iter->status().ok()) { block_restart_interval_ = iter->GetRestartInterval(); } uint32_t num_keys = 0; if (iter->status().ok()) { num_keys = iter->NumberOfKeys(block_restart_interval_); } if (iter->status().ok()) { checksum_size_ = num_keys * protection_bytes_per_key; kv_checksum_ = new char[(size_t)checksum_size_]; iter->SeekToFirst(); size_t i = 0; while (iter->Valid()) { GenerateKVChecksum(kv_checksum_ + i, protection_bytes_per_key, iter->key(), iter->raw_value()); iter->Next(); i += protection_bytes_per_key; } assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key); } if (!iter->status().ok()) { size_ = 0; // Error marker return; } protection_bytes_per_key_ = protection_bytes_per_key; } } void Block::InitializeMetaIndexBlockProtectionInfo( uint8_t protection_bytes_per_key) { protection_bytes_per_key_ = 0; if (num_restarts_ > 0 && protection_bytes_per_key > 0) { std::unique_ptr iter{ NewMetaIterator(true /* block_contents_pinned */)}; if (iter->status().ok()) { block_restart_interval_ = iter->GetRestartInterval(); } uint32_t num_keys = 0; if (iter->status().ok()) { num_keys = iter->NumberOfKeys(block_restart_interval_); } if (iter->status().ok()) { checksum_size_ = num_keys * protection_bytes_per_key; kv_checksum_ = new char[(size_t)checksum_size_]; iter->SeekToFirst(); size_t i = 0; while (iter->Valid()) { GenerateKVChecksum(kv_checksum_ + i, protection_bytes_per_key, iter->key(), iter->value()); iter->Next(); i += protection_bytes_per_key; } assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key); } if (!iter->status().ok()) { size_ = 0; // Error marker return; } protection_bytes_per_key_ = protection_bytes_per_key; } } MetaBlockIter* Block::NewMetaIterator(bool block_contents_pinned) { MetaBlockIter* iter = new MetaBlockIter(); if (size_ < 2 * sizeof(uint32_t)) { iter->Invalidate(Status::Corruption("bad block contents")); return iter; } else if (num_restarts_ == 0) { // Empty block. iter->Invalidate(Status::OK()); } else { iter->Initialize(data_, restart_offset_, num_restarts_, block_contents_pinned, protection_bytes_per_key_, kv_checksum_, block_restart_interval_); } return iter; } DataBlockIter* Block::NewDataIterator(const Comparator* raw_ucmp, SequenceNumber global_seqno, DataBlockIter* iter, Statistics* stats, bool block_contents_pinned, bool user_defined_timestamps_persisted) { DataBlockIter* ret_iter; if (iter != nullptr) { ret_iter = iter; } else { ret_iter = new DataBlockIter; } if (size_ < 2 * sizeof(uint32_t)) { ret_iter->Invalidate(Status::Corruption("bad block contents")); return ret_iter; } if (num_restarts_ == 0) { // Empty block. ret_iter->Invalidate(Status::OK()); return ret_iter; } else { ret_iter->Initialize( raw_ucmp, data_, restart_offset_, num_restarts_, global_seqno, read_amp_bitmap_.get(), block_contents_pinned, user_defined_timestamps_persisted, data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr, protection_bytes_per_key_, kv_checksum_, block_restart_interval_); if (read_amp_bitmap_) { if (read_amp_bitmap_->GetStatistics() != stats) { // DB changed the Statistics pointer, we need to notify read_amp_bitmap_ read_amp_bitmap_->SetStatistics(stats); } } } return ret_iter; } IndexBlockIter* Block::NewIndexIterator( const Comparator* raw_ucmp, SequenceNumber global_seqno, IndexBlockIter* iter, Statistics* /*stats*/, bool total_order_seek, bool have_first_key, bool key_includes_seq, bool value_is_full, bool block_contents_pinned, bool user_defined_timestamps_persisted, BlockPrefixIndex* prefix_index) { IndexBlockIter* ret_iter; if (iter != nullptr) { ret_iter = iter; } else { ret_iter = new IndexBlockIter; } if (size_ < 2 * sizeof(uint32_t)) { ret_iter->Invalidate(Status::Corruption("bad block contents")); return ret_iter; } if (num_restarts_ == 0) { // Empty block. ret_iter->Invalidate(Status::OK()); return ret_iter; } else { BlockPrefixIndex* prefix_index_ptr = total_order_seek ? nullptr : prefix_index; ret_iter->Initialize( raw_ucmp, data_, restart_offset_, num_restarts_, global_seqno, prefix_index_ptr, have_first_key, key_includes_seq, value_is_full, block_contents_pinned, user_defined_timestamps_persisted, protection_bytes_per_key_, kv_checksum_, block_restart_interval_); } return ret_iter; } size_t Block::ApproximateMemoryUsage() const { size_t usage = usable_size(); #ifdef ROCKSDB_MALLOC_USABLE_SIZE usage += malloc_usable_size((void*)this); #else usage += sizeof(*this); #endif // ROCKSDB_MALLOC_USABLE_SIZE if (read_amp_bitmap_) { usage += read_amp_bitmap_->ApproximateMemoryUsage(); } usage += checksum_size_; return usage; } } // namespace ROCKSDB_NAMESPACE