// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. // // Decodes the blocks generated by block_builder.cc. #include "table/block.h" #include #include #include #include #include "monitoring/perf_context_imp.h" #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/comparator.h" #include "table/block_prefix_index.h" #include "table/format.h" #include "util/coding.h" #include "util/logging.h" namespace rocksdb { // Helper routine: decode the next block entry starting at "p", // storing the number of shared key bytes, non_shared key bytes, // and the length of the value in "*shared", "*non_shared", and // "*value_length", respectively. Will not derefence past "limit". // // If any errors are detected, returns nullptr. Otherwise, returns a // pointer to the key delta (just past the three decoded values). struct DecodeEntry { inline const char* operator()(const char* p, const char* limit, uint32_t* shared, uint32_t* non_shared, uint32_t* value_length) { // We need 2 bytes for shared and non_shared size. We also need one more // byte either for value size or the actual value in case of value delta // encoding. assert(limit - p >= 3); *shared = reinterpret_cast(p)[0]; *non_shared = reinterpret_cast(p)[1]; *value_length = reinterpret_cast(p)[2]; if ((*shared | *non_shared | *value_length) < 128) { // Fast path: all three values are encoded in one byte each p += 3; } else { if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr; if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr; if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) { return nullptr; } } // Using an assert in place of "return null" since we should not pay the // cost of checking for corruption on every single key decoding assert(!(static_cast(limit - p) < (*non_shared + *value_length))); return p; } }; struct DecodeKey { inline const char* operator()(const char* p, const char* limit, uint32_t* shared, uint32_t* non_shared) { uint32_t value_length; return DecodeEntry()(p, limit, shared, non_shared, &value_length); } }; // In format_version 4, which is used by index blocks, the value size is not // encoded before the entry, as the value is known to be the handle with the // known size. struct DecodeKeyV4 { inline const char* operator()(const char* p, const char* limit, uint32_t* shared, uint32_t* non_shared) { // We need 2 bytes for shared and non_shared size. We also need one more // byte either for value size or the actual value in case of value delta // encoding. if (limit - p < 3) return nullptr; *shared = reinterpret_cast(p)[0]; *non_shared = reinterpret_cast(p)[1]; if ((*shared | *non_shared) < 128) { // Fast path: all three values are encoded in one byte each p += 2; } else { if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr; if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr; } return p; } }; void DataBlockIter::Next() { assert(Valid()); ParseNextDataKey(); } void IndexBlockIter::Next() { assert(Valid()); ParseNextIndexKey(); } void IndexBlockIter::Prev() { assert(Valid()); // Scan backwards to a restart point before current_ const uint32_t original = current_; while (GetRestartPoint(restart_index_) >= original) { if (restart_index_ == 0) { // No more entries current_ = restarts_; restart_index_ = num_restarts_; return; } restart_index_--; } SeekToRestartPoint(restart_index_); do { if (!ParseNextIndexKey()) { break; } // Loop until end of current entry hits the start of original entry } while (NextEntryOffset() < original); } // Similar to IndexBlockIter::Prev but also caches the prev entries void DataBlockIter::Prev() { assert(Valid()); assert(prev_entries_idx_ == -1 || static_cast(prev_entries_idx_) < prev_entries_.size()); // Check if we can use cached prev_entries_ if (prev_entries_idx_ > 0 && prev_entries_[prev_entries_idx_].offset == current_) { // Read cached CachedPrevEntry prev_entries_idx_--; const CachedPrevEntry& current_prev_entry = prev_entries_[prev_entries_idx_]; const char* key_ptr = nullptr; if (current_prev_entry.key_ptr != nullptr) { // The key is not delta encoded and stored in the data block key_ptr = current_prev_entry.key_ptr; key_pinned_ = true; } else { // The key is delta encoded and stored in prev_entries_keys_buff_ key_ptr = prev_entries_keys_buff_.data() + current_prev_entry.key_offset; key_pinned_ = false; } const Slice current_key(key_ptr, current_prev_entry.key_size); current_ = current_prev_entry.offset; key_.SetKey(current_key, false /* copy */); value_ = current_prev_entry.value; return; } // Clear prev entries cache prev_entries_idx_ = -1; prev_entries_.clear(); prev_entries_keys_buff_.clear(); // Scan backwards to a restart point before current_ const uint32_t original = current_; while (GetRestartPoint(restart_index_) >= original) { if (restart_index_ == 0) { // No more entries current_ = restarts_; restart_index_ = num_restarts_; return; } restart_index_--; } SeekToRestartPoint(restart_index_); do { if (!ParseNextDataKey()) { break; } Slice current_key = key(); if (key_.IsKeyPinned()) { // The key is not delta encoded prev_entries_.emplace_back(current_, current_key.data(), 0, current_key.size(), value()); } else { // The key is delta encoded, cache decoded key in buffer size_t new_key_offset = prev_entries_keys_buff_.size(); prev_entries_keys_buff_.append(current_key.data(), current_key.size()); prev_entries_.emplace_back(current_, nullptr, new_key_offset, current_key.size(), value()); } // Loop until end of current entry hits the start of original entry } while (NextEntryOffset() < original); prev_entries_idx_ = static_cast(prev_entries_.size()) - 1; } void DataBlockIter::Seek(const Slice& target) { Slice seek_key = target; PERF_TIMER_GUARD(block_seek_nanos); if (data_ == nullptr) { // Not init yet return; } uint32_t index = 0; bool ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, comparator_); if (!ok) { return; } SeekToRestartPoint(index); // Linear search (within restart block) for first key >= target while (true) { if (!ParseNextDataKey() || Compare(key_, seek_key) >= 0) { return; } } } void IndexBlockIter::Seek(const Slice& target) { Slice seek_key = target; if (!key_includes_seq_) { seek_key = ExtractUserKey(target); } PERF_TIMER_GUARD(block_seek_nanos); if (data_ == nullptr) { // Not init yet return; } uint32_t index = 0; bool ok = false; if (prefix_index_) { ok = PrefixSeek(target, &index); } else if (value_delta_encoded_) { ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, active_comparator_); } else { ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, active_comparator_); } if (!ok) { return; } SeekToRestartPoint(index); // Linear search (within restart block) for first key >= target while (true) { if (!ParseNextIndexKey() || Compare(key_, seek_key) >= 0) { return; } } } void DataBlockIter::SeekForPrev(const Slice& target) { PERF_TIMER_GUARD(block_seek_nanos); Slice seek_key = target; if (data_ == nullptr) { // Not init yet return; } uint32_t index = 0; bool ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, comparator_); if (!ok) { return; } SeekToRestartPoint(index); // Linear search (within restart block) for first key >= seek_key while (ParseNextDataKey() && Compare(key_, seek_key) < 0) { } if (!Valid()) { SeekToLast(); } else { while (Valid() && Compare(key_, seek_key) > 0) { Prev(); } } } void DataBlockIter::SeekToFirst() { if (data_ == nullptr) { // Not init yet return; } SeekToRestartPoint(0); ParseNextDataKey(); } void IndexBlockIter::SeekToFirst() { if (data_ == nullptr) { // Not init yet return; } SeekToRestartPoint(0); ParseNextIndexKey(); } void DataBlockIter::SeekToLast() { if (data_ == nullptr) { // Not init yet return; } SeekToRestartPoint(num_restarts_ - 1); while (ParseNextDataKey() && NextEntryOffset() < restarts_) { // Keep skipping } } void IndexBlockIter::SeekToLast() { if (data_ == nullptr) { // Not init yet return; } SeekToRestartPoint(num_restarts_ - 1); while (ParseNextIndexKey() && NextEntryOffset() < restarts_) { // Keep skipping } } template void BlockIter::CorruptionError() { current_ = restarts_; restart_index_ = num_restarts_; status_ = Status::Corruption("bad entry in block"); key_.Clear(); value_.clear(); } bool DataBlockIter::ParseNextDataKey() { current_ = NextEntryOffset(); const char* p = data_ + current_; const char* limit = data_ + restarts_; // Restarts come right after data if (p >= limit) { // No more entries to return. Mark as invalid. current_ = restarts_; restart_index_ = num_restarts_; return false; } // Decode next entry uint32_t shared, non_shared, value_length; p = DecodeEntry()(p, limit, &shared, &non_shared, &value_length); if (p == nullptr || key_.Size() < shared) { CorruptionError(); return false; } else { if (shared == 0) { // If this key dont share any bytes with prev key then we dont need // to decode it and can use it's address in the block directly. key_.SetKey(Slice(p, non_shared), false /* copy */); key_pinned_ = true; } else { // This key share `shared` bytes with prev key, we need to decode it key_.TrimAppend(shared, p, non_shared); key_pinned_ = false; } if (global_seqno_ != kDisableGlobalSequenceNumber) { // If we are reading a file with a global sequence number we should // expect that all encoded sequence numbers are zeros and any value // type is kTypeValue, kTypeMerge, kTypeDeletion, or kTypeRangeDeletion. assert(GetInternalKeySeqno(key_.GetInternalKey()) == 0); ValueType value_type = ExtractValueType(key_.GetKey()); assert(value_type == ValueType::kTypeValue || value_type == ValueType::kTypeMerge || value_type == ValueType::kTypeDeletion || value_type == ValueType::kTypeRangeDeletion); if (key_pinned_) { // TODO(tec): Investigate updating the seqno in the loaded block // directly instead of doing a copy and update. // We cannot use the key address in the block directly because // we have a global_seqno_ that will overwrite the encoded one. key_.OwnKey(); key_pinned_ = false; } key_.UpdateInternalKey(global_seqno_, value_type); } value_ = Slice(p + non_shared, value_length); if (shared == 0) { while (restart_index_ + 1 < num_restarts_ && GetRestartPoint(restart_index_ + 1) < current_) { ++restart_index_; } } // else we are in the middle of a restart interval and the restart_index_ // thus has not changed return true; } } bool IndexBlockIter::ParseNextIndexKey() { current_ = NextEntryOffset(); const char* p = data_ + current_; const char* limit = data_ + restarts_; // Restarts come right after data if (p >= limit) { // No more entries to return. Mark as invalid. current_ = restarts_; restart_index_ = num_restarts_; return false; } // Decode next entry uint32_t shared, non_shared, value_length; if (value_delta_encoded_) { p = DecodeKeyV4()(p, limit, &shared, &non_shared); value_length = 0; } else { p = DecodeEntry()(p, limit, &shared, &non_shared, &value_length); } if (p == nullptr || key_.Size() < shared) { CorruptionError(); return false; } if (shared == 0) { // If this key dont share any bytes with prev key then we dont need // to decode it and can use it's address in the block directly. key_.SetKey(Slice(p, non_shared), false /* copy */); key_pinned_ = true; } else { // This key share `shared` bytes with prev key, we need to decode it key_.TrimAppend(shared, p, non_shared); key_pinned_ = false; } value_ = Slice(p + non_shared, value_length); if (shared == 0) { while (restart_index_ + 1 < num_restarts_ && GetRestartPoint(restart_index_ + 1) < current_) { ++restart_index_; } } // else we are in the middle of a restart interval and the restart_index_ // thus has not changed if (value_delta_encoded_) { assert(value_length == 0); DecodeCurrentValue(shared); } return true; } // The format: // restart_point 0: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) // restart_point 1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) // ... // restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) // where, k is key, v is value, and its encoding is in parenthesis. // The format of each key is (shared_size, non_shared_size, shared, non_shared) // The format of each value, i.e., block hanlde, is (offset, size) whenever the // shared_size is 0, which included the first entry in each restart point. // Otherwise the format is delta-size = block handle size - size of last block // handle. void IndexBlockIter::DecodeCurrentValue(uint32_t shared) { assert(value_delta_encoded_); const char* limit = data_ + restarts_; if (shared == 0) { uint64_t o, s; const char* newp = GetVarint64Ptr(value_.data(), limit, &o); assert(newp); newp = GetVarint64Ptr(newp, limit, &s); assert(newp); decoded_value_ = BlockHandle(o, s); value_ = Slice(value_.data(), newp - value_.data()); } else { uint64_t next_value_base = decoded_value_.offset() + decoded_value_.size() + kBlockTrailerSize; int64_t delta; const char* newp = GetVarsignedint64Ptr(value_.data(), limit, &delta); decoded_value_ = BlockHandle(next_value_base, decoded_value_.size() + delta); value_ = Slice(value_.data(), newp - value_.data()); } } // Binary search in restart array to find the first restart point that // is either the last restart point with a key less than target, // which means the key of next restart point is larger than target, or // the first restart point with a key = target template template bool BlockIter::BinarySeek(const Slice& target, uint32_t left, uint32_t right, uint32_t* index, const Comparator* comp) { assert(left <= right); while (left < right) { uint32_t mid = (left + right + 1) / 2; uint32_t region_offset = GetRestartPoint(mid); uint32_t shared, non_shared; const char* key_ptr = DecodeKeyFunc()( data_ + region_offset, data_ + restarts_, &shared, &non_shared); if (key_ptr == nullptr || (shared != 0)) { CorruptionError(); return false; } Slice mid_key(key_ptr, non_shared); int cmp = comp->Compare(mid_key, target); if (cmp < 0) { // Key at "mid" is smaller than "target". Therefore all // blocks before "mid" are uninteresting. left = mid; } else if (cmp > 0) { // Key at "mid" is >= "target". Therefore all blocks at or // after "mid" are uninteresting. right = mid - 1; } else { left = right = mid; } } *index = left; return true; } // Compare target key and the block key of the block of `block_index`. // Return -1 if error. int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) { uint32_t region_offset = GetRestartPoint(block_index); uint32_t shared, non_shared; const char* key_ptr = value_delta_encoded_ ? DecodeKeyV4()(data_ + region_offset, data_ + restarts_, &shared, &non_shared) : DecodeKey()(data_ + region_offset, data_ + restarts_, &shared, &non_shared); if (key_ptr == nullptr || (shared != 0)) { CorruptionError(); return 1; // Return target is smaller } Slice block_key(key_ptr, non_shared); return Compare(block_key, target); } // Binary search in block_ids to find the first block // with a key >= target bool IndexBlockIter::BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids, uint32_t left, uint32_t right, uint32_t* index) { assert(left <= right); uint32_t left_bound = left; while (left <= right) { uint32_t mid = (right + left) / 2; int cmp = CompareBlockKey(block_ids[mid], target); if (!status_.ok()) { return false; } if (cmp < 0) { // Key at "target" is larger than "mid". Therefore all // blocks before or at "mid" are uninteresting. left = mid + 1; } else { // Key at "target" is <= "mid". Therefore all blocks // after "mid" are uninteresting. // If there is only one block left, we found it. if (left == right) break; right = mid; } } if (left == right) { // In one of the two following cases: // (1) left is the first one of block_ids // (2) there is a gap of blocks between block of `left` and `left-1`. // we can further distinguish the case of key in the block or key not // existing, by comparing the target key and the key of the previous // block to the left of the block found. if (block_ids[left] > 0 && (left == left_bound || block_ids[left - 1] != block_ids[left] - 1) && CompareBlockKey(block_ids[left] - 1, target) > 0) { current_ = restarts_; return false; } *index = block_ids[left]; return true; } else { assert(left > right); // Mark iterator invalid current_ = restarts_; return false; } } bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index) { assert(prefix_index_); Slice seek_key = target; if (!key_includes_seq_) { seek_key = ExtractUserKey(target); } uint32_t* block_ids = nullptr; uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids); if (num_blocks == 0) { current_ = restarts_; return false; } else { return BinaryBlockIndexSeek(seek_key, block_ids, 0, num_blocks - 1, index); } } uint32_t Block::NumRestarts() const { assert(size_ >= 2*sizeof(uint32_t)); return DecodeFixed32(data_ + size_ - sizeof(uint32_t)); } Block::Block(BlockContents&& contents, SequenceNumber _global_seqno, size_t read_amp_bytes_per_bit, Statistics* statistics) : contents_(std::move(contents)), data_(contents_.data.data()), size_(contents_.data.size()), restart_offset_(0), num_restarts_(0), global_seqno_(_global_seqno) { if (size_ < sizeof(uint32_t)) { size_ = 0; // Error marker } else { // Should only decode restart points for uncompressed blocks if (compression_type() == kNoCompression) { num_restarts_ = NumRestarts(); restart_offset_ = static_cast(size_) - (1 + num_restarts_) * sizeof(uint32_t); if (restart_offset_ > size_ - sizeof(uint32_t)) { // The size is too small for NumRestarts() and therefore // restart_offset_ wrapped around. size_ = 0; } } } if (read_amp_bytes_per_bit != 0 && statistics && size_ != 0) { read_amp_bitmap_.reset(new BlockReadAmpBitmap( restart_offset_, read_amp_bytes_per_bit, statistics)); } } template <> DataBlockIter* Block::NewIterator(const Comparator* cmp, const Comparator* ucmp, DataBlockIter* iter, Statistics* stats, bool /*total_order_seek*/, bool /*key_includes_seq*/, bool /*value_is_full*/, BlockPrefixIndex* /*prefix_index*/) { DataBlockIter* ret_iter; if (iter != nullptr) { ret_iter = iter; } else { ret_iter = new DataBlockIter; } if (size_ < 2 * sizeof(uint32_t)) { ret_iter->Invalidate(Status::Corruption("bad block contents")); return ret_iter; } if (num_restarts_ == 0) { // Empty block. ret_iter->Invalidate(Status::OK()); return ret_iter; } else { ret_iter->Initialize(cmp, ucmp, data_, restart_offset_, num_restarts_, global_seqno_, read_amp_bitmap_.get(), cachable()); if (read_amp_bitmap_) { if (read_amp_bitmap_->GetStatistics() != stats) { // DB changed the Statistics pointer, we need to notify read_amp_bitmap_ read_amp_bitmap_->SetStatistics(stats); } } } return ret_iter; } template <> IndexBlockIter* Block::NewIterator(const Comparator* cmp, const Comparator* ucmp, IndexBlockIter* iter, Statistics* /*stats*/, bool total_order_seek, bool key_includes_seq, bool value_is_full, BlockPrefixIndex* prefix_index) { IndexBlockIter* ret_iter; if (iter != nullptr) { ret_iter = iter; } else { ret_iter = new IndexBlockIter; } if (size_ < 2 * sizeof(uint32_t)) { ret_iter->Invalidate(Status::Corruption("bad block contents")); return ret_iter; } if (num_restarts_ == 0) { // Empty block. ret_iter->Invalidate(Status::OK()); return ret_iter; } else { BlockPrefixIndex* prefix_index_ptr = total_order_seek ? nullptr : prefix_index; ret_iter->Initialize(cmp, ucmp, data_, restart_offset_, num_restarts_, prefix_index_ptr, key_includes_seq, value_is_full, cachable()); } return ret_iter; } size_t Block::ApproximateMemoryUsage() const { size_t usage = usable_size(); #ifdef ROCKSDB_MALLOC_USABLE_SIZE usage += malloc_usable_size((void*)this); #else usage += sizeof(*this); #endif // ROCKSDB_MALLOC_USABLE_SIZE if (read_amp_bitmap_) { usage += read_amp_bitmap_->ApproximateMemoryUsage(); } return usage; } } // namespace rocksdb