@@ -37,6 +37,7 @@
 #include "table/get_context.h"
 #include "table/internal_iterator.h"
 #include "table/meta_blocks.h"
+#include "table/partitioned_filter_block.h"
 #include "table/persistent_cache_helper.h"
 #include "table/sst_file_writer_collectors.h"
 #include "table/two_level_iterator.h"
@@ -149,7 +150,7 @@ Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key,
 }  // namespace

 // Index that allows binary search lookup in a two-level index structure.
-class PartitionIndexReader : public IndexReader {
+class PartitionIndexReader : public IndexReader, public Cleanable {
  public:
   // Read the partition index from the file and create an instance for
   // `PartitionIndexReader`.
@@ -159,7 +160,8 @@ class PartitionIndexReader : public IndexReader {
                        const Footer& footer, const BlockHandle& index_handle,
                        const ImmutableCFOptions& ioptions,
                        const Comparator* comparator, IndexReader** index_reader,
-                       const PersistentCacheOptions& cache_options) {
+                       const PersistentCacheOptions& cache_options,
+                       const int level) {
     std::unique_ptr<Block> index_block;
     auto s = ReadBlockFromFile(
         file, footer, ReadOptions(), index_handle, &index_block, ioptions,
@@ -167,8 +169,9 @@ class PartitionIndexReader : public IndexReader {
         kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */);

     if (s.ok()) {
-      *index_reader = new PartitionIndexReader(
-          table, comparator, std::move(index_block), ioptions.statistics);
+      *index_reader =
+          new PartitionIndexReader(table, comparator, std::move(index_block),
+                                   ioptions.statistics, level);
     }

     return s;
@@ -177,10 +180,25 @@ class PartitionIndexReader : public IndexReader {
   // return a two-level iterator: first level is on the partition index
   virtual InternalIterator* NewIterator(BlockIter* iter = nullptr,
                                         bool dont_care = true) override {
+    // Filters are already checked before seeking the index
+    const bool skip_filters = true;
+    const bool is_index = true;
+    Cleanable* block_cache_cleaner = nullptr;
+    const bool pin_cached_indexes =
+        level_ == 0 &&
+        table_->rep_->table_options.pin_l0_filter_and_index_blocks_in_cache;
+    if (pin_cached_indexes) {
+      // Keep partition indexes into the cache as long as the partition index
+      // reader object is alive
+      block_cache_cleaner = this;
+    }
     return NewTwoLevelIterator(
-        new BlockBasedTable::BlockEntryIteratorState(table_, ReadOptions(),
-                                                     false),
-        index_block_->NewIterator(comparator_, iter, true));
+        new BlockBasedTable::BlockEntryIteratorState(
+            table_, ReadOptions(), skip_filters, is_index, block_cache_cleaner),
+        index_block_->NewIterator(comparator_, nullptr, true));
+    // TODO(myabandeh): Update TwoLevelIterator to be able to make use of
+    // on-stack
+    // BlockIter while the state is on heap
   }

   virtual size_t size() const override { return index_block_->size(); }
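
Note: the pinning above leans on RocksDB's Cleanable contract. An iterator that pins a partition block registers a cleanup (the cache-handle release) on itself, and NewSecondaryIterator() further down forwards those cleanups to this reader via DelegateCleanupsTo(), so pinned index partitions stay in the block cache until the PartitionIndexReader is destroyed. A self-contained toy model of that contract (illustrative only; the real rocksdb::Cleanable stores raw function-pointer/argument triples rather than std::function):

    #include <functional>
    #include <utility>
    #include <vector>

    // Toy model of the Cleanable idiom: cleanups registered on an object run
    // when it is destroyed, unless they were first handed off to a
    // longer-lived owner via DelegateCleanupsTo().
    class ToyCleanable {
     public:
      ~ToyCleanable() {
        for (auto& cleanup : cleanups_) {
          cleanup();  // e.g. releasing a block-cache handle
        }
      }
      void RegisterCleanup(std::function<void()> cleanup) {
        cleanups_.push_back(std::move(cleanup));
      }
      // Transfers every pending cleanup, so whatever they guard (here: block
      // cache handles) now lives as long as `other` does.
      void DelegateCleanupsTo(ToyCleanable* other) {
        for (auto& cleanup : cleanups_) {
          other->RegisterCleanup(std::move(cleanup));
        }
        cleanups_.clear();
      }

     private:
      std::vector<std::function<void()>> cleanups_;
    };
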
@@ -195,14 +213,17 @@ class PartitionIndexReader : public IndexReader {

  private:
   PartitionIndexReader(BlockBasedTable* table, const Comparator* comparator,
-                       std::unique_ptr<Block>&& index_block, Statistics* stats)
+                       std::unique_ptr<Block>&& index_block, Statistics* stats,
+                       const int level)
       : IndexReader(comparator, stats),
         table_(table),
-        index_block_(std::move(index_block)) {
+        index_block_(std::move(index_block)),
+        level_(level) {
     assert(index_block_ != nullptr);
   }
   BlockBasedTable* table_;
   std::unique_ptr<Block> index_block_;
+  int level_;
 };

 // Index that allows binary search lookup for the first key of each block.
@@ -555,14 +576,28 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
   // Find filter handle and filter type
   if (rep->filter_policy) {
-    for (auto prefix : {kFullFilterBlockPrefix, kFilterBlockPrefix}) {
+    for (auto filter_type :
+         {Rep::FilterType::kFullFilter, Rep::FilterType::kPartitionedFilter,
+          Rep::FilterType::kBlockFilter}) {
+      std::string prefix;
+      switch (filter_type) {
+        case Rep::FilterType::kFullFilter:
+          prefix = kFullFilterBlockPrefix;
+          break;
+        case Rep::FilterType::kPartitionedFilter:
+          prefix = kPartitionedFilterBlockPrefix;
+          break;
+        case Rep::FilterType::kBlockFilter:
+          prefix = kFilterBlockPrefix;
+          break;
+        default:
+          assert(0);
+      }
       std::string filter_block_key = prefix;
       filter_block_key.append(rep->filter_policy->Name());
       if (FindMetaBlock(meta_iter.get(), filter_block_key, &rep->filter_handle)
               .ok()) {
-        rep->filter_type = (prefix == kFullFilterBlockPrefix)
-                               ? Rep::FilterType::kFullFilter
-                               : Rep::FilterType::kBlockFilter;
+        rep->filter_type = filter_type;
         break;
       }
     }
   }
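
Note: each filter flavor is probed under its own metaindex key, built as the flavor's block prefix with the filter policy's Name() appended. Assuming the built-in Bloom policy (whose Name() is "rocksdb.BuiltinBloomFilter") and the usual values of the prefix constants, the keys tried above would be:

    "fullfilter.rocksdb.BuiltinBloomFilter"         // kFullFilterBlockPrefix
    "partitionedfilter.rocksdb.BuiltinBloomFilter"  // kPartitionedFilterBlockPrefix
    "filter.rocksdb.BuiltinBloomFilter"             // kFilterBlockPrefix

The first key found in the metaindex decides rep->filter_type, so a full filter wins over a partitioned one, which wins over the per-block kind.
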
@@ -697,6 +732,7 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
       if (rep->table_options.pin_l0_filter_and_index_blocks_in_cache &&
           level == 0) {
         rep->filter_entry = filter_entry;
+        rep->filter_entry.value->SetLevel(level);
       } else {
         filter_entry.Release(table_options.block_cache.get());
       }
@@ -707,14 +743,19 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
     // pre-load these blocks, which will kept in member variables in Rep
     // and with a same life-time as this table object.
     IndexReader* index_reader = nullptr;
-    s = new_table->CreateIndexReader(&index_reader, meta_iter.get());
+    s = new_table->CreateIndexReader(&index_reader, meta_iter.get(), level);

     if (s.ok()) {
       rep->index_reader.reset(index_reader);

       // Set filter block
       if (rep->filter_policy) {
-        rep->filter.reset(new_table->ReadFilter(rep));
+        const bool is_a_filter_partition = true;
+        rep->filter.reset(
+            new_table->ReadFilter(rep->filter_handle, !is_a_filter_partition));
+        if (rep->filter.get()) {
+          rep->filter->SetLevel(level);
+        }
       }
     } else {
       delete index_reader;
@@ -798,7 +839,8 @@ Status BlockBasedTable::GetDataBlockFromCache(
     Cache* block_cache, Cache* block_cache_compressed,
     const ImmutableCFOptions& ioptions, const ReadOptions& read_options,
     BlockBasedTable::CachableEntry<Block>* block, uint32_t format_version,
-    const Slice& compression_dict, size_t read_amp_bytes_per_bit) {
+    const Slice& compression_dict, size_t read_amp_bytes_per_bit,
+    bool is_index) {
   Status s;
   Block* compressed_block = nullptr;
   Cache::Handle* block_cache_compressed_handle = nullptr;
@@ -806,9 +848,10 @@

   // Lookup uncompressed cache first
   if (block_cache != nullptr) {
-    block->cache_handle =
-        GetEntryFromCache(block_cache, block_cache_key, BLOCK_CACHE_DATA_MISS,
-                          BLOCK_CACHE_DATA_HIT, statistics);
+    block->cache_handle = GetEntryFromCache(
+        block_cache, block_cache_key,
+        is_index ? BLOCK_CACHE_INDEX_MISS : BLOCK_CACHE_DATA_MISS,
+        is_index ? BLOCK_CACHE_INDEX_HIT : BLOCK_CACHE_DATA_HIT, statistics);
     if (block->cache_handle != nullptr) {
       block->value =
           reinterpret_cast<Block*>(block_cache->Value(block->cache_handle));
@@ -860,9 +903,15 @@
           &DeleteCachedEntry<Block>, &(block->cache_handle));
       if (s.ok()) {
         RecordTick(statistics, BLOCK_CACHE_ADD);
-        RecordTick(statistics, BLOCK_CACHE_DATA_ADD);
-        RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT,
-                   block->value->usable_size());
+        if (is_index) {
+          RecordTick(statistics, BLOCK_CACHE_INDEX_ADD);
+          RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT,
+                     block->value->usable_size());
+        } else {
+          RecordTick(statistics, BLOCK_CACHE_DATA_ADD);
+          RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT,
+                     block->value->usable_size());
+        }
         RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE,
                    block->value->usable_size());
       } else {
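
Note: with is_index threaded down to GetEntryFromCache(), index partitions are now accounted under the INDEX block-cache tickers rather than the DATA ones. A sketch of reading the split from a Statistics object (assumes statistics were enabled in Options; ticker names as used above):

    #include <cinttypes>
    #include <cstdint>
    #include <cstdio>

    #include "rocksdb/statistics.h"

    void LogIndexBlockCacheTraffic(rocksdb::Statistics* stats) {
      const uint64_t hit = stats->getTickerCount(rocksdb::BLOCK_CACHE_INDEX_HIT);
      const uint64_t miss = stats->getTickerCount(rocksdb::BLOCK_CACHE_INDEX_MISS);
      const uint64_t add = stats->getTickerCount(rocksdb::BLOCK_CACHE_INDEX_ADD);
      std::fprintf(stderr,
                   "index blocks: hit=%" PRIu64 " miss=%" PRIu64 " add=%" PRIu64 "\n",
                   hit, miss, add);
    }
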
@@ -883,7 +932,8 @@ Status BlockBasedTable::PutDataBlockToCache(
     Cache* block_cache, Cache* block_cache_compressed,
     const ReadOptions& read_options, const ImmutableCFOptions& ioptions,
     CachableEntry<Block>* block, Block* raw_block, uint32_t format_version,
-    const Slice& compression_dict, size_t read_amp_bytes_per_bit) {
+    const Slice& compression_dict, size_t read_amp_bytes_per_bit, bool is_index,
+    Cache::Priority priority) {
   assert(raw_block->compression_type() == kNoCompression ||
          block_cache_compressed != nullptr);

@@ -929,15 +979,21 @@
   // insert into uncompressed block cache
   assert((block->value->compression_type() == kNoCompression));
   if (block_cache != nullptr && block->value->cachable()) {
-    s = block_cache->Insert(block_cache_key, block->value,
-                            block->value->usable_size(),
-                            &DeleteCachedEntry<Block>, &(block->cache_handle));
+    s = block_cache->Insert(
+        block_cache_key, block->value, block->value->usable_size(),
+        &DeleteCachedEntry<Block>, &(block->cache_handle), priority);
     if (s.ok()) {
       assert(block->cache_handle != nullptr);
       RecordTick(statistics, BLOCK_CACHE_ADD);
-      RecordTick(statistics, BLOCK_CACHE_DATA_ADD);
-      RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT,
-                 block->value->usable_size());
+      if (is_index) {
+        RecordTick(statistics, BLOCK_CACHE_INDEX_ADD);
+        RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT,
+                   block->value->usable_size());
+      } else {
+        RecordTick(statistics, BLOCK_CACHE_DATA_ADD);
+        RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT,
+                   block->value->usable_size());
+      }
       RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE,
                  block->value->usable_size());
       assert(reinterpret_cast<Block*>(
@@ -952,7 +1008,9 @@ Status BlockBasedTable::PutDataBlockToCache(
   return s;
 }

-FilterBlockReader* BlockBasedTable::ReadFilter(Rep* rep) const {
+FilterBlockReader* BlockBasedTable::ReadFilter(
+    const BlockHandle& filter_handle, const bool is_a_filter_partition) const {
+  auto& rep = rep_;
   // TODO: We might want to unify with ReadBlockFromFile() if we start
   // requiring checksum verification in Table::Open.
   if (rep->filter_type == Rep::FilterType::kNoFilter) {
@@ -960,7 +1018,7 @@ FilterBlockReader* BlockBasedTable::ReadFilter(Rep* rep) const {
   }
   BlockContents block;
   if (!ReadBlockContents(rep->file.get(), rep->footer, ReadOptions(),
-                         rep->filter_handle, &block, rep->ioptions,
+                         filter_handle, &block, rep->ioptions,
                          false /* decompress */, Slice() /*compression dict*/,
                          rep->persistent_cache_options)
            .ok()) {
@@ -970,35 +1028,60 @@ FilterBlockReader* BlockBasedTable::ReadFilter(Rep* rep) const {

   assert(rep->filter_policy);

-  if (rep->filter_type == Rep::FilterType::kBlockFilter) {
-    return new BlockBasedFilterBlockReader(
-        rep->prefix_filtering ? rep->ioptions.prefix_extractor : nullptr,
-        rep->table_options, rep->whole_key_filtering, std::move(block),
-        rep->ioptions.statistics);
-  } else if (rep->filter_type == Rep::FilterType::kFullFilter) {
-    auto filter_bits_reader =
-        rep->filter_policy->GetFilterBitsReader(block.data);
-    if (filter_bits_reader != nullptr) {
-      return new FullFilterBlockReader(
-          rep->prefix_filtering ? rep->ioptions.prefix_extractor : nullptr,
-          rep->whole_key_filtering, std::move(block), filter_bits_reader,
-          rep->ioptions.statistics);
-    }
-  }
-
-  // filter_type is either kNoFilter (exited the function at the first if),
-  // kBlockFilter or kFullFilter. there is no way for the execution to come here
-  assert(false);
-  return nullptr;
-}
+  auto filter_type = rep->filter_type;
+  if (rep->filter_type == Rep::FilterType::kPartitionedFilter &&
+      is_a_filter_partition) {
+    filter_type = Rep::FilterType::kFullFilter;
+  }
+
+  switch (filter_type) {
+    case Rep::FilterType::kPartitionedFilter: {
+      return new PartitionedFilterBlockReader(
+          rep->prefix_filtering ? rep->ioptions.prefix_extractor : nullptr,
+          rep->whole_key_filtering, std::move(block), nullptr,
+          rep->ioptions.statistics, rep->internal_comparator, this);
+    }
+
+    case Rep::FilterType::kBlockFilter:
+      return new BlockBasedFilterBlockReader(
+          rep->prefix_filtering ? rep->ioptions.prefix_extractor : nullptr,
+          rep->table_options, rep->whole_key_filtering, std::move(block),
+          rep->ioptions.statistics);
+
+    case Rep::FilterType::kFullFilter: {
+      auto filter_bits_reader =
+          rep->filter_policy->GetFilterBitsReader(block.data);
+      assert(filter_bits_reader != nullptr);
+      return new FullFilterBlockReader(
+          rep->prefix_filtering ? rep->ioptions.prefix_extractor : nullptr,
+          rep->whole_key_filtering, std::move(block), filter_bits_reader,
+          rep->ioptions.statistics);
+    }
+
+    default:
+      // filter_type is either kNoFilter (exited the function at the first if),
+      // or it must be covered in this switch block
+      assert(false);
+      return nullptr;
+  }
+}

 BlockBasedTable::CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
     bool no_io) const {
+  const BlockHandle& filter_blk_handle = rep_->filter_handle;
+  const bool is_a_filter_partition = true;
+  return GetFilter(filter_blk_handle, !is_a_filter_partition, no_io);
+}
+
+BlockBasedTable::CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
+    const BlockHandle& filter_blk_handle, const bool is_a_filter_partition,
+    bool no_io) const {
   // If cache_index_and_filter_blocks is false, filter should be pre-populated.
   // We will return rep_->filter anyway. rep_->filter can be nullptr if filter
   // read fails at Open() time. We don't want to reload again since it will
   // most probably fail again.
-  if (!rep_->table_options.cache_index_and_filter_blocks) {
+  if (!is_a_filter_partition &&
+      !rep_->table_options.cache_index_and_filter_blocks) {
     return {rep_->filter.get(), nullptr /* cache handle */};
   }

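
Note: is_a_filter_partition is what keeps the recursion here bounded. Only the top-level block of a partitioned filter (an index over the partitions) gets a PartitionedFilterBlockReader; each individual partition is laid out exactly like a full filter, so it is read with the full-filter reader. Distilled into a self-contained sketch (local enum standing in for Rep::FilterType, for illustration only):

    enum class FilterType { kNoFilter, kFullFilter, kBlockFilter, kPartitionedFilter };

    // A partition of a partitioned filter is formatted as a full filter, so
    // the partitioned case only applies when reading the top-level block.
    FilterType EffectiveFilterType(FilterType declared, bool is_a_filter_partition) {
      return (declared == FilterType::kPartitionedFilter && is_a_filter_partition)
                 ? FilterType::kFullFilter
                 : declared;
    }
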
|
|
|
|
@ -1008,8 +1091,7 @@ BlockBasedTable::CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter( |
|
|
|
|
return {nullptr /* filter */, nullptr /* cache handle */}; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// we have a pinned filter block
|
|
|
|
|
if (rep_->filter_entry.IsSet()) { |
|
|
|
|
if (!is_a_filter_partition && rep_->filter_entry.IsSet()) { |
|
|
|
|
return rep_->filter_entry; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
@@ -1018,8 +1100,7 @@
   // Fetching from the cache
   char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
   auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
-                         rep_->footer.metaindex_handle(),
-                         cache_key);
+                         filter_blk_handle, cache_key);

   Statistics* statistics = rep_->ioptions.statistics;
   auto cache_handle =
@@ -1034,7 +1115,7 @@ BlockBasedTable::CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
       // Do not invoke any io.
       return CachableEntry<FilterBlockReader>();
     } else {
-      filter = ReadFilter(rep_);
+      filter = ReadFilter(filter_blk_handle, is_a_filter_partition);
       if (filter != nullptr) {
         assert(filter->size() > 0);
         Status s = block_cache->Insert(
@@ -1074,7 +1155,7 @@ InternalIterator* BlockBasedTable::NewIndexIterator(

   PERF_TIMER_GUARD(read_index_block_nanos);

-  bool no_io = read_options.read_tier == kBlockCacheTier;
+  const bool no_io = read_options.read_tier == kBlockCacheTier;
   Cache* block_cache = rep_->table_options.block_cache.get();
   char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
   auto key =
@@ -1153,29 +1234,36 @@ InternalIterator* BlockBasedTable::NewIndexIterator(
   return iter;
 }

+InternalIterator* BlockBasedTable::NewDataBlockIterator(
+    Rep* rep, const ReadOptions& ro, const Slice& index_value,
+    BlockIter* input_iter, bool is_index) {
+  BlockHandle handle;
+  Slice input = index_value;
+  // We intentionally allow extra stuff in index_value so that we
+  // can add more features in the future.
+  Status s = handle.DecodeFrom(&input);
+  return NewDataBlockIterator(rep, ro, handle, input_iter, is_index, s);
+}
+
 // Convert an index iterator value (i.e., an encoded BlockHandle)
 // into an iterator over the contents of the corresponding block.
 // If input_iter is null, new a iterator
 // If input_iter is not null, update this iter and return it
 InternalIterator* BlockBasedTable::NewDataBlockIterator(
-    Rep* rep, const ReadOptions& ro, const Slice& index_value,
-    BlockIter* input_iter) {
+    Rep* rep, const ReadOptions& ro, const BlockHandle& handle,
+    BlockIter* input_iter, bool is_index, Status s) {
   PERF_TIMER_GUARD(new_table_block_iter_nanos);

   const bool no_io = (ro.read_tier == kBlockCacheTier);
   Cache* block_cache = rep->table_options.block_cache.get();
   CachableEntry<Block> block;
-  BlockHandle handle;
-  Slice input = index_value;
-  // We intentionally allow extra stuff in index_value so that we
-  // can add more features in the future.
-  Status s = handle.DecodeFrom(&input);
   Slice compression_dict;
   if (s.ok()) {
     if (rep->compression_dict_block) {
       compression_dict = rep->compression_dict_block->data;
     }
-    s = MaybeLoadDataBlockToCache(rep, ro, handle, compression_dict, &block);
+    s = MaybeLoadDataBlockToCache(rep, ro, handle, compression_dict, &block,
+                                  is_index);
   }

   // Didn't get any data from block caches.
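
Note: the index_value decoded above is an encoded BlockHandle — two varint64s, offset then size, in the LevelDB/RocksDB wire format — which is why the decoder can "allow extra stuff" after the handle. A self-contained toy decoder (not the real BlockHandle or GetVarint64, but the same encoding):

    #include <cstdint>

    // Varint64: 7 payload bits per byte, high bit set on all bytes but the last.
    static bool ToyGetVarint64(const char** p, const char* limit, uint64_t* value) {
      uint64_t result = 0;
      for (uint32_t shift = 0; shift <= 63 && *p < limit; shift += 7) {
        const uint64_t byte = static_cast<unsigned char>(*(*p)++);
        result |= (byte & 0x7f) << shift;
        if ((byte & 0x80) == 0) {
          *value = result;
          return true;
        }
      }
      return false;  // truncated or malformed
    }

    // A BlockHandle is (offset, size); any bytes after those two fields are
    // simply left in the input, mirroring BlockHandle::DecodeFrom's tolerance.
    struct ToyBlockHandle {
      uint64_t offset = 0;
      uint64_t size = 0;
      bool DecodeFrom(const char** p, const char* limit) {
        return ToyGetVarint64(p, limit, &offset) && ToyGetVarint64(p, limit, &size);
      }
    };
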
@@ -1224,7 +1312,7 @@ InternalIterator* BlockBasedTable::NewDataBlockIterator(

 Status BlockBasedTable::MaybeLoadDataBlockToCache(
     Rep* rep, const ReadOptions& ro, const BlockHandle& handle,
-    Slice compression_dict, CachableEntry<Block>* block_entry) {
+    Slice compression_dict, CachableEntry<Block>* block_entry, bool is_index) {
   const bool no_io = (ro.read_tier == kBlockCacheTier);
   Cache* block_cache = rep->table_options.block_cache.get();
   Cache* block_cache_compressed =
@@ -1254,7 +1342,7 @@ Status BlockBasedTable::MaybeLoadDataBlockToCache(
     s = GetDataBlockFromCache(
         key, ckey, block_cache, block_cache_compressed, rep->ioptions, ro,
         block_entry, rep->table_options.format_version, compression_dict,
-        rep->table_options.read_amp_bytes_per_bit);
+        rep->table_options.read_amp_bytes_per_bit, is_index);

     if (block_entry->value == nullptr && !no_io && ro.fill_cache) {
       std::unique_ptr<Block> raw_block;
@@ -1271,7 +1359,13 @@ Status BlockBasedTable::MaybeLoadDataBlockToCache(
         s = PutDataBlockToCache(
             key, ckey, block_cache, block_cache_compressed, ro, rep->ioptions,
             block_entry, raw_block.release(), rep->table_options.format_version,
-            compression_dict, rep->table_options.read_amp_bytes_per_bit);
+            compression_dict, rep->table_options.read_amp_bytes_per_bit,
+            is_index,
+            is_index &&
+                    rep->table_options
+                        .cache_index_and_filter_blocks_with_high_priority
+                ? Cache::Priority::HIGH
+                : Cache::Priority::LOW);
       }
     }
   }
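
Note: the Cache::Priority chosen here is driven entirely by table options. A configuration sketch wiring up the options this code path consults (values illustrative, not a recommendation):

    #include "rocksdb/filter_policy.h"
    #include "rocksdb/options.h"
    #include "rocksdb/table.h"

    rocksdb::Options MakePartitionedIndexOptions() {
      rocksdb::BlockBasedTableOptions t;
      t.index_type = rocksdb::BlockBasedTableOptions::kTwoLevelIndexSearch;
      t.cache_index_and_filter_blocks = true;
      // Index/filter blocks inserted by this path go in at Priority::HIGH:
      t.cache_index_and_filter_blocks_with_high_priority = true;
      // Pin L0 index/filter blocks for the lifetime of their readers:
      t.pin_l0_filter_and_index_blocks_in_cache = true;
      t.filter_policy.reset(
          rocksdb::NewBloomFilterPolicy(10, false /* full (not block) filter */));

      rocksdb::Options options;
      options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(t));
      return options;
    }
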
@@ -1279,17 +1373,39 @@ Status BlockBasedTable::MaybeLoadDataBlockToCache(
 }

 BlockBasedTable::BlockEntryIteratorState::BlockEntryIteratorState(
-    BlockBasedTable* table, const ReadOptions& read_options, bool skip_filters)
+    BlockBasedTable* table, const ReadOptions& read_options, bool skip_filters,
+    bool is_index, Cleanable* block_cache_cleaner)
     : TwoLevelIteratorState(table->rep_->ioptions.prefix_extractor != nullptr),
       table_(table),
       read_options_(read_options),
-      skip_filters_(skip_filters) {}
+      skip_filters_(skip_filters),
+      is_index_(is_index),
+      block_cache_cleaner_(block_cache_cleaner) {}

 InternalIterator*
 BlockBasedTable::BlockEntryIteratorState::NewSecondaryIterator(
     const Slice& index_value) {
   // Return a block iterator on the index partition
-  return NewDataBlockIterator(table_->rep_, read_options_, index_value);
+  BlockHandle handle;
+  Slice input = index_value;
+  Status s = handle.DecodeFrom(&input);
+  auto iter = NewDataBlockIterator(table_->rep_, read_options_, handle, nullptr,
+                                   is_index_, s);
+  if (block_cache_cleaner_) {
+    uint64_t offset = handle.offset();
+    {
+      ReadLock rl(&cleaner_mu);
+      if (cleaner_set.find(offset) != cleaner_set.end()) {
+        // already have a reference to the block cache objects
+        return iter;
+      }
+    }
+    WriteLock wl(&cleaner_mu);
+    cleaner_set.insert(offset);
+    // Keep the data in cache until the cleaner cleans up
+    iter->DelegateCleanupsTo(block_cache_cleaner_);
+  }
+  return iter;
 }

 bool BlockBasedTable::BlockEntryIteratorState::PrefixMayMatch(
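
Note: the probe in NewSecondaryIterator() is a shared-lock fast path followed by an exclusive-lock insert, not one critical section. Between dropping the read lock and taking the write lock another thread can claim the same offset; std::set::insert is idempotent, so the worst case is that two iterators over the same partition both delegate their (separate) cache handles, each of which is still released exactly once by the reader. The pattern in isolation (lock names per util/mutexlock.h; the member declarations here are assumed for illustration, not quoted from the header):

    #include <cstdint>
    #include <set>

    #include "port/port.h"       // port::RWMutex
    #include "util/mutexlock.h"  // ReadLock / WriteLock RAII helpers

    std::set<uint64_t> cleaner_set;     // offsets already delegated
    rocksdb::port::RWMutex cleaner_mu;  // guards cleaner_set

    // Returns true iff this call is the first to claim `offset`.
    bool FirstSightingOfPartition(uint64_t offset) {
      {
        rocksdb::ReadLock rl(&cleaner_mu);
        if (cleaner_set.find(offset) != cleaner_set.end()) {
          return false;  // common case: cheap shared-lock probe
        }
      }
      rocksdb::WriteLock wl(&cleaner_mu);
      return cleaner_set.insert(offset).second;  // idempotent under races
    }
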
@@ -1325,25 +1441,29 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key) {
     return true;
   }
   auto prefix = rep_->ioptions.prefix_extractor->Transform(user_key);
-  InternalKey internal_key_prefix(prefix, kMaxSequenceNumber, kTypeValue);
-  auto internal_prefix = internal_key_prefix.Encode();

   bool may_match = true;
   Status s;

-  // To prevent any io operation in this method, we set `read_tier` to make
-  // sure we always read index or filter only when they have already been
-  // loaded to memory.
-  ReadOptions no_io_read_options;
-  no_io_read_options.read_tier = kBlockCacheTier;
-
-  // First, try check with full filter
-  auto filter_entry = GetFilter(true /* no io */);
-  FilterBlockReader* filter = filter_entry.value;
-  if (filter != nullptr) {
-    if (!filter->IsBlockBased()) {
-      may_match = filter->PrefixMayMatch(prefix);
-    } else {
+  // First, try check with full filter
+  const bool no_io = true;
+  auto filter_entry = GetFilter(no_io);
+  FilterBlockReader* filter = filter_entry.value;
+  if (filter != nullptr) {
+    if (!filter->IsBlockBased()) {
+      const Slice* const const_ikey_ptr = &internal_key;
+      may_match =
+          filter->PrefixMayMatch(prefix, kNotValid, no_io, const_ikey_ptr);
+    } else {
+      InternalKey internal_key_prefix(prefix, kMaxSequenceNumber, kTypeValue);
+      auto internal_prefix = internal_key_prefix.Encode();
+
+      // To prevent any io operation in this method, we set `read_tier` to make
+      // sure we always read index or filter only when they have already been
+      // loaded to memory.
+      ReadOptions no_io_read_options;
+      no_io_read_options.read_tier = kBlockCacheTier;
+
       // Then, try find it within each block
       unique_ptr<InternalIterator> iiter(NewIndexIterator(no_io_read_options));
       iiter->Seek(internal_prefix);
@@ -1438,20 +1558,23 @@ InternalIterator* BlockBasedTable::NewRangeTombstoneIterator(

 bool BlockBasedTable::FullFilterKeyMayMatch(const ReadOptions& read_options,
                                             FilterBlockReader* filter,
-                                            const Slice& internal_key) const {
+                                            const Slice& internal_key,
+                                            const bool no_io) const {
   if (filter == nullptr || filter->IsBlockBased()) {
     return true;
   }
   Slice user_key = ExtractUserKey(internal_key);
+  const Slice* const const_ikey_ptr = &internal_key;
   if (filter->whole_key_filtering()) {
-    return filter->KeyMayMatch(user_key);
+    return filter->KeyMayMatch(user_key, kNotValid, no_io, const_ikey_ptr);
   }
   if (!read_options.total_order_seek && rep_->ioptions.prefix_extractor &&
       rep_->table_properties->prefix_extractor_name.compare(
           rep_->ioptions.prefix_extractor->Name()) == 0 &&
       rep_->ioptions.prefix_extractor->InDomain(user_key) &&
       !filter->PrefixMayMatch(
-          rep_->ioptions.prefix_extractor->Transform(user_key))) {
+          rep_->ioptions.prefix_extractor->Transform(user_key), kNotValid,
+          false, const_ikey_ptr)) {
     return false;
   }
   return true;
@@ -1460,6 +1583,7 @@ bool BlockBasedTable::FullFilterKeyMayMatch(const ReadOptions& read_options,
 Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
                             GetContext* get_context, bool skip_filters) {
   Status s;
+  const bool no_io = read_options.read_tier == kBlockCacheTier;
   CachableEntry<FilterBlockReader> filter_entry;
   if (!skip_filters) {
     filter_entry = GetFilter(read_options.read_tier == kBlockCacheTier);
@@ -1468,14 +1592,14 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,

   // First check the full filter
   // If full filter not useful, Then go into each block
-  if (!FullFilterKeyMayMatch(read_options, filter, key)) {
+  if (!FullFilterKeyMayMatch(read_options, filter, key, no_io)) {
     RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL);
   } else {
     BlockIter iiter_on_stack;
     auto iiter = NewIndexIterator(read_options, &iiter_on_stack);
     std::unique_ptr<InternalIterator> iiter_unique_ptr;
     if (iiter != &iiter_on_stack) {
-      iiter_unique_ptr = std::unique_ptr<InternalIterator>(iiter);
+      iiter_unique_ptr.reset(iiter);
     }

     bool done = false;
@@ -1486,7 +1610,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
       bool not_exist_in_filter =
           filter != nullptr && filter->IsBlockBased() == true &&
           handle.DecodeFrom(&handle_value).ok() &&
-          !filter->KeyMayMatch(ExtractUserKey(key), handle.offset());
+          !filter->KeyMayMatch(ExtractUserKey(key), handle.offset(), no_io);

       if (not_exist_in_filter) {
         // Not found
@@ -1525,6 +1649,10 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
         }
         s = biter.status();
       }
+      if (done) {
+        // Avoid the extra Next which is expensive in two-level indexes
+        break;
+      }
     }
     if (s.ok()) {
       s = iiter->status();
@@ -1631,7 +1759,8 @@ bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
 //  4. internal_comparator
 //  5. index_type
 Status BlockBasedTable::CreateIndexReader(
-    IndexReader** index_reader, InternalIterator* preloaded_meta_index_iter) {
+    IndexReader** index_reader, InternalIterator* preloaded_meta_index_iter,
+    int level) {
   // Some old version of block-based tables don't have index type present in
   // table properties. If that's the case we can safely use the kBinarySearch.
   auto index_type_on_file = BlockBasedTableOptions::kBinarySearch;
@@ -1660,7 +1789,7 @@ Status BlockBasedTable::CreateIndexReader(
     case BlockBasedTableOptions::kTwoLevelIndexSearch: {
       return PartitionIndexReader::Create(
           this, file, footer, footer.index_handle(), rep_->ioptions, comparator,
-          index_reader, rep_->persistent_cache_options);
+          index_reader, rep_->persistent_cache_options, level);
     }
     case BlockBasedTableOptions::kBinarySearch: {
       return BinarySearchIndexReader::Create(