remove malloc when create data and index iterator in Get

Summary:
  Define Block::Iter to be an independent class to be used by block_based_table_reader
  When creating data and index iterator, update an existing iterator rather than new one
  Thus malloc and free could be reduced

Benchmark,
Base:
commit 76286ee67e
commands:
--db=/dev/shm/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --write_buffer_size=134217728 --max_write_buffer_number=2 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --verify_checksum=false --max_background_compactions=4 --use_plain_table=0 --memtablerep=prefix_hash --open_files=-1 --mmap_read=1 --mmap_write=0 --bloom_bits=10 --bloom_locality=1 --memtable_bloom_bits=500000 --compression_type=lz4 --num=2621440 --use_hash_search=1 --block_size=1024 --block_restart_interval=1 --use_existing_db=1 --threads=1 --benchmarks=readrandom —disable_auto_compactions=1

malloc: 3.30% -> 1.42%
free: 3.59%->1.61%

Test Plan:
  make all check
  run db_stress
  valgrind ./db_test ./table_test

Reviewers: ljin, yhchiang, dhruba, igor, sdong

Reviewed By: sdong

Subscribers: leveldb

Differential Revision: https://reviews.facebook.net/D20655
main
Feng Zhu 11 years ago
parent 76286ee67e
commit 8f09d53fd1
  1. 475
      table/block.cc
  2. 128
      table/block.h
  3. 94
      table/block_based_table_reader.cc
  4. 10
      table/block_based_table_reader.h

@ -17,44 +17,14 @@
#include <vector> #include <vector>
#include "rocksdb/comparator.h" #include "rocksdb/comparator.h"
#include "table/format.h"
#include "table/block_hash_index.h" #include "table/block_hash_index.h"
#include "table/block_prefix_index.h" #include "table/block_prefix_index.h"
#include "table/format.h"
#include "util/coding.h" #include "util/coding.h"
#include "util/logging.h" #include "util/logging.h"
#include "db/dbformat.h"
namespace rocksdb { namespace rocksdb {
uint32_t Block::NumRestarts() const {
assert(size_ >= 2*sizeof(uint32_t));
return DecodeFixed32(data_ + size_ - sizeof(uint32_t));
}
Block::Block(const BlockContents& contents)
: data_(contents.data.data()),
size_(contents.data.size()),
owned_(contents.heap_allocated),
cachable_(contents.cachable),
compression_type_(contents.compression_type) {
if (size_ < sizeof(uint32_t)) {
size_ = 0; // Error marker
} else {
restart_offset_ = size_ - (1 + NumRestarts()) * sizeof(uint32_t);
if (restart_offset_ > size_ - sizeof(uint32_t)) {
// The size is too small for NumRestarts() and therefore
// restart_offset_ wrapped around.
size_ = 0;
}
}
}
Block::~Block() {
if (owned_) {
delete[] data_;
}
}
// Helper routine: decode the next block entry starting at "p", // Helper routine: decode the next block entry starting at "p",
// storing the number of shared key bytes, non_shared key bytes, // storing the number of shared key bytes, non_shared key bytes,
// and the length of the value in "*shared", "*non_shared", and // and the length of the value in "*shared", "*non_shared", and
@ -85,142 +55,85 @@ static inline const char* DecodeEntry(const char* p, const char* limit,
return p; return p;
} }
class Block::Iter : public Iterator { void BlockIter::Next() {
private: assert(Valid());
const Comparator* const comparator_; ParseNextKey();
const char* const data_; // underlying block contents }
uint32_t const restarts_; // Offset of restart array (list of fixed32)
uint32_t const num_restarts_; // Number of uint32_t entries in restart array
// current_ is offset in data_ of current entry. >= restarts_ if !Valid
uint32_t current_;
uint32_t restart_index_; // Index of restart block in which current_ falls
IterKey key_;
Slice value_;
Status status_;
BlockHashIndex* hash_index_;
BlockPrefixIndex* prefix_index_;
inline int Compare(const Slice& a, const Slice& b) const {
return comparator_->Compare(a, b);
}
// Return the offset in data_ just past the end of the current entry.
inline uint32_t NextEntryOffset() const {
return (value_.data() + value_.size()) - data_;
}
uint32_t GetRestartPoint(uint32_t index) {
assert(index < num_restarts_);
return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t));
}
void SeekToRestartPoint(uint32_t index) { void BlockIter::Prev() {
key_.Clear(); assert(Valid());
restart_index_ = index;
// current_ will be fixed by ParseNextKey();
// ParseNextKey() starts at the end of value_, so set value_ accordingly // Scan backwards to a restart point before current_
uint32_t offset = GetRestartPoint(index); const uint32_t original = current_;
value_ = Slice(data_ + offset, 0); while (GetRestartPoint(restart_index_) >= original) {
if (restart_index_ == 0) {
// No more entries
current_ = restarts_;
restart_index_ = num_restarts_;
return;
}
restart_index_--;
} }
public: SeekToRestartPoint(restart_index_);
Iter(const Comparator* comparator, const char* data, uint32_t restarts, do {
uint32_t num_restarts, BlockHashIndex* hash_index, // Loop until end of current entry hits the start of original entry
BlockPrefixIndex* prefix_index) } while (ParseNextKey() && NextEntryOffset() < original);
: comparator_(comparator), }
data_(data),
restarts_(restarts),
num_restarts_(num_restarts),
current_(restarts_),
restart_index_(num_restarts_),
hash_index_(hash_index),
prefix_index_(prefix_index) {
assert(num_restarts_ > 0);
}
virtual bool Valid() const { return current_ < restarts_; } void BlockIter::Seek(const Slice& target) {
virtual Status status() const { return status_; } if (data_ == nullptr) { // Not init yet
virtual Slice key() const { return;
assert(Valid());
return key_.GetKey();
} }
virtual Slice value() const { uint32_t index = 0;
assert(Valid()); bool ok = false;
return value_; if (prefix_index_) {
} ok = PrefixSeek(target, &index);
} else {
virtual void Next() { ok = hash_index_ ? HashSeek(target, &index)
assert(Valid()); : BinarySeek(target, 0, num_restarts_ - 1, &index);
ParseNextKey();
} }
virtual void Prev() { if (!ok) {
assert(Valid()); return;
// Scan backwards to a restart point before current_
const uint32_t original = current_;
while (GetRestartPoint(restart_index_) >= original) {
if (restart_index_ == 0) {
// No more entries
current_ = restarts_;
restart_index_ = num_restarts_;
return;
}
restart_index_--;
}
SeekToRestartPoint(restart_index_);
do {
// Loop until end of current entry hits the start of original entry
} while (ParseNextKey() && NextEntryOffset() < original);
} }
SeekToRestartPoint(index);
// Linear search (within restart block) for first key >= target
virtual void Seek(const Slice& target) { while (true) {
uint32_t index = 0; if (!ParseNextKey() || Compare(key_.GetKey(), target) >= 0) {
bool ok = false;
if (prefix_index_) {
ok = PrefixSeek(target, &index);
} else {
ok = hash_index_ ? HashSeek(target, &index)
: BinarySeek(target, 0, num_restarts_ - 1, &index);
}
if (!ok) {
return; return;
} }
SeekToRestartPoint(index);
// Linear search (within restart block) for first key >= target
while (true) {
if (!ParseNextKey() || Compare(key_.GetKey(), target) >= 0) {
return;
}
}
}
virtual void SeekToFirst() {
SeekToRestartPoint(0);
ParseNextKey();
} }
}
virtual void SeekToLast() { void BlockIter::SeekToFirst() {
SeekToRestartPoint(num_restarts_ - 1); if (data_ == nullptr) { // Not init yet
while (ParseNextKey() && NextEntryOffset() < restarts_) { return;
// Keep skipping
}
} }
SeekToRestartPoint(0);
ParseNextKey();
}
private: void BlockIter::SeekToLast() {
void CorruptionError() { if (data_ == nullptr) { // Not init yet
current_ = restarts_; return;
restart_index_ = num_restarts_; }
status_ = Status::Corruption("bad entry in block"); SeekToRestartPoint(num_restarts_ - 1);
key_.Clear(); while (ParseNextKey() && NextEntryOffset() < restarts_) {
value_.clear(); // Keep skipping
} }
}
void BlockIter::CorruptionError() {
current_ = restarts_;
restart_index_ = num_restarts_;
status_ = Status::Corruption("bad entry in block");
key_.Clear();
value_.clear();
}
bool ParseNextKey() { bool BlockIter::ParseNextKey() {
current_ = NextEntryOffset(); current_ = NextEntryOffset();
const char* p = data_ + current_; const char* p = data_ + current_;
const char* limit = data_ + restarts_; // Restarts come right after data const char* limit = data_ + restarts_; // Restarts come right after data
@ -248,150 +161,194 @@ class Block::Iter : public Iterator {
} }
} }
// Binary search in restart array to find the first restart point // Binary search in restart array to find the first restart point
// with a key >= target (TODO: this comment is inaccurate) // with a key >= target (TODO: this comment is inaccurate)
bool BinarySeek(const Slice& target, uint32_t left, uint32_t right, bool BlockIter::BinarySeek(const Slice& target, uint32_t left, uint32_t right,
uint32_t* index) { uint32_t* index) {
assert(left <= right); assert(left <= right);
while (left < right) {
uint32_t mid = (left + right + 1) / 2;
uint32_t region_offset = GetRestartPoint(mid);
uint32_t shared, non_shared, value_length;
const char* key_ptr =
DecodeEntry(data_ + region_offset, data_ + restarts_, &shared,
&non_shared, &value_length);
if (key_ptr == nullptr || (shared != 0)) {
CorruptionError();
return false;
}
Slice mid_key(key_ptr, non_shared);
int cmp = Compare(mid_key, target);
if (cmp < 0) {
// Key at "mid" is smaller than "target". Therefore all
// blocks before "mid" are uninteresting.
left = mid;
} else if (cmp > 0) {
// Key at "mid" is >= "target". Therefore all blocks at or
// after "mid" are uninteresting.
right = mid - 1;
} else {
left = right = mid;
}
}
*index = left; while (left < right) {
return true; uint32_t mid = (left + right + 1) / 2;
} uint32_t region_offset = GetRestartPoint(mid);
// Compare target key and the block key of the block of `block_index`.
// Return -1 if error.
int CompareBlockKey(uint32_t block_index, const Slice& target) {
uint32_t region_offset = GetRestartPoint(block_index);
uint32_t shared, non_shared, value_length; uint32_t shared, non_shared, value_length;
const char* key_ptr = DecodeEntry(data_ + region_offset, data_ + restarts_, const char* key_ptr =
&shared, &non_shared, &value_length); DecodeEntry(data_ + region_offset, data_ + restarts_, &shared,
&non_shared, &value_length);
if (key_ptr == nullptr || (shared != 0)) { if (key_ptr == nullptr || (shared != 0)) {
CorruptionError(); CorruptionError();
return 1; // Return target is smaller return false;
}
Slice mid_key(key_ptr, non_shared);
int cmp = Compare(mid_key, target);
if (cmp < 0) {
// Key at "mid" is smaller than "target". Therefore all
// blocks before "mid" are uninteresting.
left = mid;
} else if (cmp > 0) {
// Key at "mid" is >= "target". Therefore all blocks at or
// after "mid" are uninteresting.
right = mid - 1;
} else {
left = right = mid;
} }
Slice block_key(key_ptr, non_shared);
return Compare(block_key, target);
} }
// Binary search in block_ids to find the first block *index = left;
// with a key >= target return true;
bool BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids, }
uint32_t left, uint32_t right,
uint32_t* index) {
assert(left <= right);
uint32_t left_bound = left;
while (left <= right) { // Compare target key and the block key of the block of `block_index`.
uint32_t mid = (left + right) / 2; // Return -1 if error.
int BlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) {
uint32_t region_offset = GetRestartPoint(block_index);
uint32_t shared, non_shared, value_length;
const char* key_ptr = DecodeEntry(data_ + region_offset, data_ + restarts_,
&shared, &non_shared, &value_length);
if (key_ptr == nullptr || (shared != 0)) {
CorruptionError();
return 1; // Return target is smaller
}
Slice block_key(key_ptr, non_shared);
return Compare(block_key, target);
}
int cmp = CompareBlockKey(block_ids[mid], target); // Binary search in block_ids to find the first block
if (!status_.ok()) { // with a key >= target
return false; bool BlockIter::BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids,
} uint32_t left, uint32_t right,
if (cmp < 0) { uint32_t* index) {
// Key at "target" is larger than "mid". Therefore all assert(left <= right);
// blocks before or at "mid" are uninteresting. uint32_t left_bound = left;
left = mid + 1;
} else {
// Key at "target" is <= "mid". Therefore all blocks
// after "mid" are uninteresting.
// If there is only one block left, we found it.
if (left == right) break;
right = mid;
}
}
if (left == right) { while (left <= right) {
// In one of the two following cases: uint32_t mid = (left + right) / 2;
// (1) left is the first one of block_ids
// (2) there is a gap of blocks between block of `left` and `left-1`.
// we can further distinguish the case of key in the block or key not
// existing, by comparing the target key and the key of the previous
// block to the left of the block found.
if (block_ids[left] > 0 &&
(left == left_bound || block_ids[left - 1] != block_ids[left] - 1) &&
CompareBlockKey(block_ids[left] - 1, target) > 0) {
current_ = restarts_;
return false;
}
*index = block_ids[left]; int cmp = CompareBlockKey(block_ids[mid], target);
return true; if (!status_.ok()) {
} else {
assert(left > right);
// Mark iterator invalid
current_ = restarts_;
return false; return false;
} }
if (cmp < 0) {
// Key at "target" is larger than "mid". Therefore all
// blocks before or at "mid" are uninteresting.
left = mid + 1;
} else {
// Key at "target" is <= "mid". Therefore all blocks
// after "mid" are uninteresting.
// If there is only one block left, we found it.
if (left == right) break;
right = mid;
}
} }
bool HashSeek(const Slice& target, uint32_t* index) { if (left == right) {
assert(hash_index_); // In one of the two following cases:
auto restart_index = hash_index_->GetRestartIndex(target); // (1) left is the first one of block_ids
if (restart_index == nullptr) { // (2) there is a gap of blocks between block of `left` and `left-1`.
// we can further distinguish the case of key in the block or key not
// existing, by comparing the target key and the key of the previous
// block to the left of the block found.
if (block_ids[left] > 0 &&
(left == left_bound || block_ids[left - 1] != block_ids[left] - 1) &&
CompareBlockKey(block_ids[left] - 1, target) > 0) {
current_ = restarts_; current_ = restarts_;
return false; return false;
} }
// the elements in restart_array[index : index + num_blocks] *index = block_ids[left];
// are all with same prefix. We'll do binary search in that small range. return true;
auto left = restart_index->first_index; } else {
auto right = restart_index->first_index + restart_index->num_blocks - 1; assert(left > right);
return BinarySeek(target, left, right, index); // Mark iterator invalid
current_ = restarts_;
return false;
}
}
bool BlockIter::HashSeek(const Slice& target, uint32_t* index) {
assert(hash_index_);
auto restart_index = hash_index_->GetRestartIndex(target);
if (restart_index == nullptr) {
current_ = restarts_;
return false;
} }
bool PrefixSeek(const Slice& target, uint32_t* index) { // the elements in restart_array[index : index + num_blocks]
assert(prefix_index_); // are all with same prefix. We'll do binary search in that small range.
uint32_t* block_ids = nullptr; auto left = restart_index->first_index;
uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids); auto right = restart_index->first_index + restart_index->num_blocks - 1;
return BinarySeek(target, left, right, index);
}
bool BlockIter::PrefixSeek(const Slice& target, uint32_t* index) {
assert(prefix_index_);
uint32_t* block_ids = nullptr;
uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids);
if (num_blocks == 0) { if (num_blocks == 0) {
current_ = restarts_; current_ = restarts_;
return false; return false;
} else { } else {
return BinaryBlockIndexSeek(target, block_ids, 0, num_blocks - 1, index); return BinaryBlockIndexSeek(target, block_ids, 0, num_blocks - 1, index);
}
}
uint32_t Block::NumRestarts() const {
assert(size_ >= 2*sizeof(uint32_t));
return DecodeFixed32(data_ + size_ - sizeof(uint32_t));
}
Block::Block(const BlockContents& contents)
: data_(contents.data.data()),
size_(contents.data.size()),
owned_(contents.heap_allocated),
cachable_(contents.cachable),
compression_type_(contents.compression_type) {
if (size_ < sizeof(uint32_t)) {
size_ = 0; // Error marker
} else {
restart_offset_ = size_ - (1 + NumRestarts()) * sizeof(uint32_t);
if (restart_offset_ > size_ - sizeof(uint32_t)) {
// The size is too small for NumRestarts() and therefore
// restart_offset_ wrapped around.
size_ = 0;
} }
} }
}; }
Block::~Block() {
if (owned_) {
delete[] data_;
}
}
Iterator* Block::NewIterator(const Comparator* cmp) { Iterator* Block::NewIterator(const Comparator* cmp, BlockIter* iter) {
if (size_ < 2*sizeof(uint32_t)) { if (size_ < 2*sizeof(uint32_t)) {
return NewErrorIterator(Status::Corruption("bad block contents")); if (iter != nullptr) {
iter->SetStatus(Status::Corruption("bad block contents"));
return iter;
} else {
return NewErrorIterator(Status::Corruption("bad block contents"));
}
} }
const uint32_t num_restarts = NumRestarts(); const uint32_t num_restarts = NumRestarts();
if (num_restarts == 0) { if (num_restarts == 0) {
return NewEmptyIterator(); if (iter != nullptr) {
iter->SetStatus(Status::OK());
return iter;
} else {
return NewEmptyIterator();
}
} else { } else {
return new Iter(cmp, data_, restart_offset_, num_restarts, if (iter != nullptr) {
iter->Initialize(cmp, data_, restart_offset_, num_restarts,
hash_index_.get(), prefix_index_.get()); hash_index_.get(), prefix_index_.get());
} else {
iter = new BlockIter(cmp, data_, restart_offset_, num_restarts,
hash_index_.get(), prefix_index_.get());
}
} }
return iter;
} }
void Block::SetBlockHashIndex(BlockHashIndex* hash_index) { void Block::SetBlockHashIndex(BlockHashIndex* hash_index) {

@ -13,11 +13,13 @@
#include "rocksdb/iterator.h" #include "rocksdb/iterator.h"
#include "rocksdb/options.h" #include "rocksdb/options.h"
#include "db/dbformat.h"
namespace rocksdb { namespace rocksdb {
struct BlockContents; struct BlockContents;
class Comparator; class Comparator;
class BlockIter;
class BlockHashIndex; class BlockHashIndex;
class BlockPrefixIndex; class BlockPrefixIndex;
@ -40,7 +42,11 @@ class Block {
// NOTE: for the hash based lookup, if a key prefix doesn't match any key, // NOTE: for the hash based lookup, if a key prefix doesn't match any key,
// the iterator will simply be set as "invalid", rather than returning // the iterator will simply be set as "invalid", rather than returning
// the key that is just pass the target key. // the key that is just pass the target key.
Iterator* NewIterator(const Comparator* comparator); //
// If iter is null, return new Iterator
// If iter is not null, update this one and return it as Iterator*
Iterator* NewIterator(const Comparator* comparator,
BlockIter* iter = nullptr);
void SetBlockHashIndex(BlockHashIndex* hash_index); void SetBlockHashIndex(BlockHashIndex* hash_index);
void SetBlockPrefixIndex(BlockPrefixIndex* prefix_index); void SetBlockPrefixIndex(BlockPrefixIndex* prefix_index);
@ -57,8 +63,126 @@ class Block {
// No copying allowed // No copying allowed
Block(const Block&); Block(const Block&);
void operator=(const Block&); void operator=(const Block&);
};
class BlockIter : public Iterator {
public:
BlockIter()
: comparator_(nullptr),
data_(nullptr),
restarts_(0),
num_restarts_(0),
current_(0),
restart_index_(0),
status_(Status::OK()),
hash_index_(nullptr),
prefix_index_(nullptr) {}
BlockIter(const Comparator* comparator, const char* data, uint32_t restarts,
uint32_t num_restarts, BlockHashIndex* hash_index,
BlockPrefixIndex* prefix_index)
: BlockIter() {
Initialize(comparator, data, restarts, num_restarts,
hash_index, prefix_index);
}
void Initialize(const Comparator* comparator, const char* data,
uint32_t restarts, uint32_t num_restarts, BlockHashIndex* hash_index,
BlockPrefixIndex* prefix_index) {
assert(data_ == nullptr); // Ensure it is called only once
assert(num_restarts > 0); // Ensure the param is valid
comparator_ = comparator;
data_ = data;
restarts_ = restarts;
num_restarts_ = num_restarts;
current_ = restarts_;
restart_index_ = num_restarts_;
hash_index_ = hash_index;
prefix_index_ = prefix_index;
}
void SetStatus(Status s) {
status_ = s;
}
virtual bool Valid() const override { return current_ < restarts_; }
virtual Status status() const override { return status_; }
virtual Slice key() const override {
assert(Valid());
return key_.GetKey();
}
virtual Slice value() const override {
assert(Valid());
return value_;
}
virtual void Next() override;
virtual void Prev() override;
virtual void Seek(const Slice& target) override;
virtual void SeekToFirst() override;
virtual void SeekToLast() override;
private:
const Comparator* comparator_;
const char* data_; // underlying block contents
uint32_t restarts_; // Offset of restart array (list of fixed32)
uint32_t num_restarts_; // Number of uint32_t entries in restart array
// current_ is offset in data_ of current entry. >= restarts_ if !Valid
uint32_t current_;
uint32_t restart_index_; // Index of restart block in which current_ falls
IterKey key_;
Slice value_;
Status status_;
BlockHashIndex* hash_index_;
BlockPrefixIndex* prefix_index_;
inline int Compare(const Slice& a, const Slice& b) const {
return comparator_->Compare(a, b);
}
// Return the offset in data_ just past the end of the current entry.
inline uint32_t NextEntryOffset() const {
return (value_.data() + value_.size()) - data_;
}
uint32_t GetRestartPoint(uint32_t index) {
assert(index < num_restarts_);
return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t));
}
void SeekToRestartPoint(uint32_t index) {
key_.Clear();
restart_index_ = index;
// current_ will be fixed by ParseNextKey();
// ParseNextKey() starts at the end of value_, so set value_ accordingly
uint32_t offset = GetRestartPoint(index);
value_ = Slice(data_ + offset, 0);
}
void CorruptionError();
bool ParseNextKey();
bool BinarySeek(const Slice& target, uint32_t left, uint32_t right,
uint32_t* index);
int CompareBlockKey(uint32_t block_index, const Slice& target);
bool BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids,
uint32_t left, uint32_t right,
uint32_t* index);
bool HashSeek(const Slice& target, uint32_t* index);
bool PrefixSeek(const Slice& target, uint32_t* index);
class Iter;
}; };
} // namespace rocksdb } // namespace rocksdb

@ -135,7 +135,9 @@ class BlockBasedTable::IndexReader {
virtual ~IndexReader() {} virtual ~IndexReader() {}
// Create an iterator for index access. // Create an iterator for index access.
virtual Iterator* NewIterator() = 0; // An iter is passed in, if it is not null, update this one and return it
// If it is null, create a new Iterator
virtual Iterator* NewIterator(BlockIter* iter = nullptr) = 0;
// The size of the index. // The size of the index.
virtual size_t size() const = 0; virtual size_t size() const = 0;
@ -168,8 +170,8 @@ class BinarySearchIndexReader : public IndexReader {
return s; return s;
} }
virtual Iterator* NewIterator() override { virtual Iterator* NewIterator(BlockIter* iter = nullptr) override {
return index_block_->NewIterator(comparator_); return index_block_->NewIterator(comparator_, iter);
} }
virtual size_t size() const override { return index_block_->size(); } virtual size_t size() const override { return index_block_->size(); }
@ -284,8 +286,8 @@ class HashIndexReader : public IndexReader {
return Status::OK(); return Status::OK();
} }
virtual Iterator* NewIterator() override { virtual Iterator* NewIterator(BlockIter* iter = nullptr) override {
return index_block_->NewIterator(comparator_); return index_block_->NewIterator(comparator_, iter);
} }
virtual size_t size() const override { return index_block_->size(); } virtual size_t size() const override { return index_block_->size(); }
@ -779,10 +781,11 @@ BlockBasedTable::CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
return { filter, cache_handle }; return { filter, cache_handle };
} }
Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options) { Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options,
BlockIter* input_iter) {
// index reader has already been pre-populated. // index reader has already been pre-populated.
if (rep_->index_reader) { if (rep_->index_reader) {
return rep_->index_reader->NewIterator(); return rep_->index_reader->NewIterator(input_iter);
} }
bool no_io = read_options.read_tier == kBlockCacheTier; bool no_io = read_options.read_tier == kBlockCacheTier;
@ -796,7 +799,12 @@ Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options) {
BLOCK_CACHE_INDEX_HIT, statistics); BLOCK_CACHE_INDEX_HIT, statistics);
if (cache_handle == nullptr && no_io) { if (cache_handle == nullptr && no_io) {
return NewErrorIterator(Status::Incomplete("no blocking io")); if (input_iter != nullptr) {
input_iter->SetStatus(Status::Incomplete("no blocking io"));
return input_iter;
} else {
return NewErrorIterator(Status::Incomplete("no blocking io"));
}
} }
IndexReader* index_reader = nullptr; IndexReader* index_reader = nullptr;
@ -811,7 +819,12 @@ Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options) {
if (!s.ok()) { if (!s.ok()) {
// make sure if something goes wrong, index_reader shall remain intact. // make sure if something goes wrong, index_reader shall remain intact.
assert(index_reader == nullptr); assert(index_reader == nullptr);
return NewErrorIterator(s); if (input_iter != nullptr) {
input_iter->SetStatus(s);
return input_iter;
} else {
return NewErrorIterator(s);
}
} }
cache_handle = block_cache->Insert(key, index_reader, index_reader->size(), cache_handle = block_cache->Insert(key, index_reader, index_reader->size(),
@ -820,7 +833,8 @@ Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options) {
} }
assert(cache_handle); assert(cache_handle);
auto iter = index_reader->NewIterator(); Iterator* iter;
iter = index_reader->NewIterator(input_iter);
iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, cache_handle); iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, cache_handle);
return iter; return iter;
@ -828,8 +842,11 @@ Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options) {
// Convert an index iterator value (i.e., an encoded BlockHandle) // Convert an index iterator value (i.e., an encoded BlockHandle)
// into an iterator over the contents of the corresponding block. // into an iterator over the contents of the corresponding block.
// If input_iter is null, new a iterator
// If input_iter is not null, update this iter and return it
Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep, Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep,
const ReadOptions& ro, const Slice& index_value) { const ReadOptions& ro, const Slice& index_value,
BlockIter* input_iter) {
const bool no_io = (ro.read_tier == kBlockCacheTier); const bool no_io = (ro.read_tier == kBlockCacheTier);
Cache* block_cache = rep->options.block_cache.get(); Cache* block_cache = rep->options.block_cache.get();
Cache* block_cache_compressed = rep->options. Cache* block_cache_compressed = rep->options.
@ -843,7 +860,12 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep,
Status s = handle.DecodeFrom(&input); Status s = handle.DecodeFrom(&input);
if (!s.ok()) { if (!s.ok()) {
return NewErrorIterator(s); if (input_iter != nullptr) {
input_iter->SetStatus(s);
return input_iter;
} else {
return NewErrorIterator(s);
}
} }
// If either block cache is enabled, we'll try to read from it. // If either block cache is enabled, we'll try to read from it.
@ -889,7 +911,12 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep,
if (block.value == nullptr) { if (block.value == nullptr) {
if (no_io) { if (no_io) {
// Could not read from block_cache and can't do IO // Could not read from block_cache and can't do IO
return NewErrorIterator(Status::Incomplete("no blocking io")); if (input_iter != nullptr) {
input_iter->SetStatus(Status::Incomplete("no blocking io"));
return input_iter;
} else {
return NewErrorIterator(Status::Incomplete("no blocking io"));
}
} }
s = ReadBlockFromFile(rep->file.get(), rep->footer, ro, handle, s = ReadBlockFromFile(rep->file.get(), rep->footer, ro, handle,
&block.value, rep->options.env); &block.value, rep->options.env);
@ -897,15 +924,20 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep,
Iterator* iter; Iterator* iter;
if (block.value != nullptr) { if (block.value != nullptr) {
iter = block.value->NewIterator(&rep->internal_comparator); iter = block.value->NewIterator(&rep->internal_comparator, input_iter);
if (block.cache_handle != nullptr) { if (block.cache_handle != nullptr) {
iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, iter->RegisterCleanup(&ReleaseCachedEntry, block_cache,
block.cache_handle); block.cache_handle);
} else { } else {
iter->RegisterCleanup(&DeleteHeldResource<Block>, block.value, nullptr); iter->RegisterCleanup(&DeleteHeldResource<Block>, block.value, nullptr);
} }
} else { } else {
iter = NewErrorIterator(s); if (input_iter != nullptr) {
input_iter->SetStatus(s);
iter = input_iter;
} else {
iter = NewErrorIterator(s);
}
} }
return iter; return iter;
} }
@ -1023,12 +1055,14 @@ Status BlockBasedTable::Get(
const Slice& v), const Slice& v),
void (*mark_key_may_exist_handler)(void* handle_context)) { void (*mark_key_may_exist_handler)(void* handle_context)) {
Status s; Status s;
Iterator* iiter = NewIndexIterator(read_options); BlockIter iiter;
NewIndexIterator(read_options, &iiter);
auto filter_entry = GetFilter(read_options.read_tier == kBlockCacheTier); auto filter_entry = GetFilter(read_options.read_tier == kBlockCacheTier);
FilterBlockReader* filter = filter_entry.value; FilterBlockReader* filter = filter_entry.value;
bool done = false; bool done = false;
for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) { for (iiter.Seek(key); iiter.Valid() && !done; iiter.Next()) {
Slice handle_value = iiter->value(); Slice handle_value = iiter.value();
BlockHandle handle; BlockHandle handle;
bool may_not_exist_in_filter = bool may_not_exist_in_filter =
@ -1043,39 +1077,43 @@ Status BlockBasedTable::Get(
RecordTick(rep_->options.statistics.get(), BLOOM_FILTER_USEFUL); RecordTick(rep_->options.statistics.get(), BLOOM_FILTER_USEFUL);
break; break;
} else { } else {
unique_ptr<Iterator> block_iter( BlockIter biter;
NewDataBlockIterator(rep_, read_options, iiter->value())); NewDataBlockIterator(rep_, read_options, iiter.value(), &biter);
if (read_options.read_tier && block_iter->status().IsIncomplete()) { if (read_options.read_tier && biter.status().IsIncomplete()) {
// couldn't get block from block_cache // couldn't get block from block_cache
// Update Saver.state to Found because we are only looking for whether // Update Saver.state to Found because we are only looking for whether
// we can guarantee the key is not there when "no_io" is set // we can guarantee the key is not there when "no_io" is set
(*mark_key_may_exist_handler)(handle_context); (*mark_key_may_exist_handler)(handle_context);
break; break;
} }
if (!biter.status().ok()) {
s = biter.status();
break;
}
// Call the *saver function on each entry/block until it returns false // Call the *saver function on each entry/block until it returns false
for (block_iter->Seek(key); block_iter->Valid(); block_iter->Next()) { for (biter.Seek(key); biter.Valid(); biter.Next()) {
ParsedInternalKey parsed_key; ParsedInternalKey parsed_key;
if (!ParseInternalKey(block_iter->key(), &parsed_key)) { if (!ParseInternalKey(biter.key(), &parsed_key)) {
s = Status::Corruption(Slice()); s = Status::Corruption(Slice());
} }
if (!(*result_handler)(handle_context, parsed_key, if (!(*result_handler)(handle_context, parsed_key,
block_iter->value())) { biter.value())) {
done = true; done = true;
break; break;
} }
} }
s = block_iter->status(); s = biter.status();
} }
} }
filter_entry.Release(rep_->options.block_cache.get()); filter_entry.Release(rep_->options.block_cache.get());
if (s.ok()) { if (s.ok()) {
s = iiter->status(); s = iiter.status();
} }
delete iiter;
return s; return s;
} }

@ -23,6 +23,7 @@
namespace rocksdb { namespace rocksdb {
class Block; class Block;
class BlockIter;
class BlockHandle; class BlockHandle;
class Cache; class Cache;
class FilterBlockReader; class FilterBlockReader;
@ -111,8 +112,10 @@ class BlockBasedTable : public TableReader {
bool compaction_optimized_; bool compaction_optimized_;
class BlockEntryIteratorState; class BlockEntryIteratorState;
// input_iter: if it is not null, update this one and return it as Iterator
static Iterator* NewDataBlockIterator(Rep* rep, const ReadOptions& ro, static Iterator* NewDataBlockIterator(Rep* rep, const ReadOptions& ro,
const Slice& index_value); const Slice& index_value,
BlockIter* input_iter = nullptr);
// For the following two functions: // For the following two functions:
// if `no_io == true`, we will not try to read filter/index from sst file // if `no_io == true`, we will not try to read filter/index from sst file
@ -120,6 +123,8 @@ class BlockBasedTable : public TableReader {
CachableEntry<FilterBlockReader> GetFilter(bool no_io = false) const; CachableEntry<FilterBlockReader> GetFilter(bool no_io = false) const;
// Get the iterator from the index reader. // Get the iterator from the index reader.
// If input_iter is not set, return new Iterator
// If input_iter is set, update it and return it as Iterator
// //
// Note: ErrorIterator with Status::Incomplete shall be returned if all the // Note: ErrorIterator with Status::Incomplete shall be returned if all the
// following conditions are met: // following conditions are met:
@ -127,7 +132,8 @@ class BlockBasedTable : public TableReader {
// 2. index is not present in block cache. // 2. index is not present in block cache.
// 3. We disallowed any io to be performed, that is, read_options == // 3. We disallowed any io to be performed, that is, read_options ==
// kBlockCacheTier // kBlockCacheTier
Iterator* NewIndexIterator(const ReadOptions& read_options); Iterator* NewIndexIterator(const ReadOptions& read_options,
BlockIter* input_iter = nullptr);
// Read block cache from block caches (if set): block_cache and // Read block cache from block caches (if set): block_cache and
// block_cache_compressed. // block_cache_compressed.

Loading…
Cancel
Save