Extend pin_l0 to filter partitions

Summary:
This is the continuation of https://github.com/facebook/rocksdb/pull/2661 for filter partitions. When pin_l0 is set (along with cache_index_and_filter_blocks), the filter partitions are loaded into the block cache at table-open time and pinned there.
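
For context, a minimal sketch (not part of this patch) of how the feature is enabled from user code. The option names are the public BlockBasedTableOptions fields this patch builds on; the cache size is arbitrary:

    #include "rocksdb/cache.h"
    #include "rocksdb/filter_policy.h"
    #include "rocksdb/options.h"
    #include "rocksdb/table.h"

    rocksdb::Options PinnedPartitionedFilterOptions() {
      rocksdb::BlockBasedTableOptions bbto;
      bbto.block_cache = rocksdb::NewLRUCache(128 << 20);  // arbitrary 128MB
      // Full (non-block-based) bloom filter, required for partitioning.
      bbto.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, false));
      // Partitioned filters go with the two-level index format.
      bbto.index_type = rocksdb::BlockBasedTableOptions::kTwoLevelIndexSearch;
      bbto.partition_filters = true;
      // The cache_xxx prerequisite mentioned above.
      bbto.cache_index_and_filter_blocks = true;
      // pin_l0: keep L0 index/filter blocks, now including filter
      // partitions, pinned in the block cache.
      bbto.pin_l0_filter_and_index_blocks_in_cache = true;
      rocksdb::Options options;
      options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbto));
      return options;
    }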
Closes https://github.com/facebook/rocksdb/pull/2766

Differential Revision: D5671098

Pulled By: maysamyabandeh

fbshipit-source-id: 174f24018f1d7f1129621e7380287b65b67d2115
Author: Maysam Yabandeh (committed by Facebook Github Bot)
parent 39ef900551
commit 1dfcdb15f9
6 files changed:
  table/block_based_table_reader.cc      |  68
  table/block_based_table_reader.h       |  15
  table/filter_block.h                   |   5
  table/partitioned_filter_block.cc      | 130
  table/partitioned_filter_block.h       |  13
  table/partitioned_filter_block_test.cc |   9

table/block_based_table_reader.cc

@@ -291,7 +291,8 @@ class PartitionIndexReader : public IndexReader, public Cleanable {
   }
   BlockBasedTable* table_;
   std::unique_ptr<Block> index_block_;
-  std::map<uint64_t, BlockBasedTable::CachableEntry<Block>> partition_map_;
+  std::unordered_map<uint64_t, BlockBasedTable::CachableEntry<Block>>
+      partition_map_;
 };

 // Index that allows binary search lookup for the first key of each block.
@@ -797,14 +798,13 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
                      rep->ioptions.info_log);
   }

+  const bool pin =
+      rep->table_options.pin_l0_filter_and_index_blocks_in_cache && level == 0;
   // pre-fetching of blocks is turned on
   // Will use block cache for index/filter blocks access
   // Always prefetch index and filter for level 0
   if (table_options.cache_index_and_filter_blocks) {
     if (prefetch_index_and_filter_in_cache || level == 0) {
-      const bool pin =
-          rep->table_options.pin_l0_filter_and_index_blocks_in_cache &&
-          level == 0;
       assert(table_options.block_cache != nullptr);
       // Hack: Call NewIndexIterator() to implicitly add index to the
       // block_cache
@@ -823,15 +823,15 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
       if (s.ok()) {
         // Hack: Call GetFilter() to implicitly add filter to the block_cache
         auto filter_entry = new_table->GetFilter();
+        if (filter_entry.value != nullptr) {
+          filter_entry.value->CacheDependencies(pin);
+        }
         // if pin_l0_filter_and_index_blocks_in_cache is true, and this is
         // a level0 file, then save it in rep_->filter_entry; it will be
         // released in the destructor only, hence it will be pinned in the
         // cache while this reader is alive
         if (pin) {
           rep->filter_entry = filter_entry;
-          if (rep->filter_entry.value != nullptr) {
-            rep->filter_entry.value->SetLevel(level);
-          }
         } else {
           filter_entry.Release(table_options.block_cache.get());
         }
@@ -844,17 +844,25 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
     IndexReader* index_reader = nullptr;
     s = new_table->CreateIndexReader(prefetch_buffer.get(), &index_reader,
                                      meta_iter.get(), level);
     if (s.ok()) {
       rep->index_reader.reset(index_reader);
+      // The partitions of a partitioned index are always stored in the cache.
+      // They hence follow the configuration for pin and prefetch regardless of
+      // the value of cache_index_and_filter_blocks
+      if (prefetch_index_and_filter_in_cache || level == 0) {
+        rep->index_reader->CacheDependencies(pin);
+      }

       // Set filter block
       if (rep->filter_policy) {
         const bool is_a_filter_partition = true;
-        rep->filter.reset(new_table->ReadFilter(
-            prefetch_buffer.get(), rep->filter_handle, !is_a_filter_partition));
-        if (rep->filter.get()) {
-          rep->filter->SetLevel(level);
-        }
+        auto filter = new_table->ReadFilter(
+            prefetch_buffer.get(), rep->filter_handle, !is_a_filter_partition);
+        rep->filter.reset(filter);
+        // Refer to the comment above about partitioned indexes always being
+        // cached
+        if (filter && (prefetch_index_and_filter_in_cache || level == 0)) {
+          filter->CacheDependencies(pin);
+        }
       }
     } else {
@@ -1171,15 +1179,16 @@ FilterBlockReader* BlockBasedTable::ReadFilter(
 }

 BlockBasedTable::CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
-    bool no_io) const {
+    FilePrefetchBuffer* prefetch_buffer, bool no_io) const {
   const BlockHandle& filter_blk_handle = rep_->filter_handle;
   const bool is_a_filter_partition = true;
-  return GetFilter(filter_blk_handle, !is_a_filter_partition, no_io);
+  return GetFilter(prefetch_buffer, filter_blk_handle, !is_a_filter_partition,
+                   no_io);
 }

 BlockBasedTable::CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
-    const BlockHandle& filter_blk_handle, const bool is_a_filter_partition,
-    bool no_io) const {
+    FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_blk_handle,
+    const bool is_a_filter_partition, bool no_io) const {
   // If cache_index_and_filter_blocks is false, filter should be pre-populated.
   // We will return rep_->filter anyway. rep_->filter can be nullptr if filter
   // read fails at Open() time. We don't want to reload again since it will
@@ -1219,8 +1228,8 @@ BlockBasedTable::CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
     // Do not invoke any io.
     return CachableEntry<FilterBlockReader>();
   } else {
-    filter = ReadFilter(nullptr /* prefetch_buffer */, filter_blk_handle,
-                        is_a_filter_partition);
+    filter =
+        ReadFilter(prefetch_buffer, filter_blk_handle, is_a_filter_partition);
     if (filter != nullptr) {
       assert(filter->size() > 0);
       Status s = block_cache->Insert(
@@ -1482,7 +1491,7 @@ Status BlockBasedTable::MaybeLoadDataBlockToCache(
 BlockBasedTable::BlockEntryIteratorState::BlockEntryIteratorState(
     BlockBasedTable* table, const ReadOptions& read_options,
     const InternalKeyComparator* icomparator, bool skip_filters, bool is_index,
-    std::map<uint64_t, CachableEntry<Block>>* block_map)
+    std::unordered_map<uint64_t, CachableEntry<Block>>* block_map)
     : TwoLevelIteratorState(table->rep_->ioptions.prefix_extractor != nullptr),
       table_(table),
       read_options_(read_options),
@@ -1501,9 +1510,19 @@ BlockBasedTable::BlockEntryIteratorState::NewSecondaryIterator(
   auto rep = table_->rep_;
   if (block_map_) {
     auto block = block_map_->find(handle.offset());
-    assert(block != block_map_->end());
-    return block->second.value->NewIterator(&rep->internal_comparator, nullptr,
-                                            true, rep->ioptions.statistics);
+    // This is a possible scenario since block cache might not have had space
+    // for the partition
+    if (block != block_map_->end()) {
+      PERF_COUNTER_ADD(block_cache_hit_count, 1);
+      RecordTick(rep->ioptions.statistics, BLOCK_CACHE_INDEX_HIT);
+      RecordTick(rep->ioptions.statistics, BLOCK_CACHE_HIT);
+      Cache* block_cache = rep->table_options.block_cache.get();
+      assert(block_cache);
+      RecordTick(rep->ioptions.statistics, BLOCK_CACHE_BYTES_READ,
+                 block_cache->GetUsage(block->second.cache_handle));
+      return block->second.value->NewIterator(
+          &rep->internal_comparator, nullptr, true, rep->ioptions.statistics);
+    }
   }
   return NewDataBlockIterator(rep, read_options_, handle, nullptr, is_index_,
                               s);
@@ -1700,7 +1719,8 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
   const bool no_io = read_options.read_tier == kBlockCacheTier;
   CachableEntry<FilterBlockReader> filter_entry;
   if (!skip_filters) {
-    filter_entry = GetFilter(read_options.read_tier == kBlockCacheTier);
+    filter_entry = GetFilter(/*prefetch_buffer*/ nullptr,
+                             read_options.read_tier == kBlockCacheTier);
   }
   FilterBlockReader* filter = filter_entry.value;

table/block_based_table_reader.h

@@ -183,6 +183,9 @@ class BlockBasedTable : public TableReader {
   virtual void CacheDependencies(bool /* unused */) {}

+  // Prefetch all the blocks referenced by this index to the buffer
+  void PrefetchBlocks(FilePrefetchBuffer* buf);
+
  protected:
   const InternalKeyComparator* icomparator_;
@@ -210,6 +213,7 @@ class BlockBasedTable : public TableReader {
   explicit BlockBasedTable(Rep* rep) : rep_(rep) {}

  private:
+  friend class MockedBlockBasedTable;
   // input_iter: if it is not null, update this one and return it as Iterator
   static InternalIterator* NewDataBlockIterator(Rep* rep, const ReadOptions& ro,
                                                 const Slice& index_value,
@@ -239,10 +243,11 @@ class BlockBasedTable : public TableReader {
   // For the following two functions:
   // if `no_io == true`, we will not try to read filter/index from sst file
   // were they not present in cache yet.
-  CachableEntry<FilterBlockReader> GetFilter(bool no_io = false) const;
+  CachableEntry<FilterBlockReader> GetFilter(
+      FilePrefetchBuffer* prefetch_buffer = nullptr, bool no_io = false) const;
   virtual CachableEntry<FilterBlockReader> GetFilter(
-      const BlockHandle& filter_blk_handle, const bool is_a_filter_partition,
-      bool no_io) const;
+      FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_blk_handle,
+      const bool is_a_filter_partition, bool no_io) const;

   // Get the iterator from the index reader.
   // If input_iter is not set, return new Iterator
@@ -352,7 +357,7 @@ class BlockBasedTable::BlockEntryIteratorState : public TwoLevelIteratorState {
       BlockBasedTable* table, const ReadOptions& read_options,
       const InternalKeyComparator* icomparator, bool skip_filters,
       bool is_index = false,
-      std::map<uint64_t, CachableEntry<Block>>* block_map = nullptr);
+      std::unordered_map<uint64_t, CachableEntry<Block>>* block_map = nullptr);
   InternalIterator* NewSecondaryIterator(const Slice& index_value) override;
   bool PrefixMayMatch(const Slice& internal_key) override;
   bool KeyReachedUpperBound(const Slice& internal_key) override;
@@ -365,7 +370,7 @@ class BlockBasedTable::BlockEntryIteratorState : public TwoLevelIteratorState {
   bool skip_filters_;
   // true if the 2nd level iterator is on indexes instead of on user data.
   bool is_index_;
-  std::map<uint64_t, CachableEntry<Block>>* block_map_;
+  std::unordered_map<uint64_t, CachableEntry<Block>>* block_map_;
   port::RWMutex cleaner_mu;
 };

table/filter_block.h

@@ -108,15 +108,14 @@ class FilterBlockReader {
   bool whole_key_filtering() const { return whole_key_filtering_; }

-  int GetLevel() const { return level_; }
-  void SetLevel(int level) { level_ = level; }
-
   // convert this object to a human readable form
   virtual std::string ToString() const {
     std::string error_msg("Unsupported filter \n");
     return error_msg;
   }

+  virtual void CacheDependencies(bool pin) {}
+
  protected:
   bool whole_key_filtering_;

table/partitioned_filter_block.cc

@@ -7,6 +7,7 @@
 #include <utility>

+#include "monitoring/perf_context_imp.h"
 #include "port/port.h"
 #include "rocksdb/filter_policy.h"
 #include "table/block.h"
@@ -100,19 +101,29 @@ PartitionedFilterBlockReader::PartitionedFilterBlockReader(
 }

 PartitionedFilterBlockReader::~PartitionedFilterBlockReader() {
-  {
-    ReadLock rl(&mu_);
-    for (auto it = handle_list_.begin(); it != handle_list_.end(); ++it) {
-      table_->rep_->table_options.block_cache.get()->Release(*it);
-    }
+  // TODO(myabandeh): if instead of filter object we store only the blocks in
+  // block cache, then we don't have to manually erase them from block cache
+  // here.
+  auto block_cache = table_->rep_->table_options.block_cache.get();
+  if (UNLIKELY(block_cache == nullptr)) {
+    return;
   }
   char cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length];
-  for (auto it = filter_block_set_.begin(); it != filter_block_set_.end();
-       ++it) {
+  BlockIter biter;
+  BlockHandle handle;
+  idx_on_fltr_blk_->NewIterator(&comparator_, &biter, true);
+  biter.SeekToFirst();
+  for (; biter.Valid(); biter.Next()) {
+    auto input = biter.value();
+    auto s = handle.DecodeFrom(&input);
+    assert(s.ok());
+    if (!s.ok()) {
+      continue;
+    }
     auto key = BlockBasedTable::GetCacheKey(table_->rep_->cache_key_prefix,
                                             table_->rep_->cache_key_prefix_size,
-                                            *it, cache_key);
-    table_->rep_->table_options.block_cache.get()->Erase(key);
+                                            handle, cache_key);
+    block_cache->Erase(key);
   }
 }
@@ -205,34 +216,22 @@ PartitionedFilterBlockReader::GetFilterPartition(
   const bool is_a_filter_partition = true;
   auto block_cache = table_->rep_->table_options.block_cache.get();
   if (LIKELY(block_cache != nullptr)) {
-    bool pin_cached_filters =
-        GetLevel() == 0 &&
-        table_->rep_->table_options.pin_l0_filter_and_index_blocks_in_cache;
-    if (pin_cached_filters) {
-      ReadLock rl(&mu_);
-      auto iter = filter_cache_.find(fltr_blk_handle.offset());
-      if (iter != filter_cache_.end()) {
+    if (filter_map_.size() != 0) {
+      auto iter = filter_map_.find(fltr_blk_handle.offset());
+      // This is a possible scenario since block cache might not have had space
+      // for the partition
+      if (iter != filter_map_.end()) {
+        PERF_COUNTER_ADD(block_cache_hit_count, 1);
         RecordTick(statistics(), BLOCK_CACHE_FILTER_HIT);
+        RecordTick(statistics(), BLOCK_CACHE_HIT);
+        RecordTick(statistics(), BLOCK_CACHE_BYTES_READ,
+                   block_cache->GetUsage(iter->second.cache_handle));
         *cached = true;
-        return {iter->second, nullptr};
-      }
-    }
-    auto filter =
-        table_->GetFilter(fltr_blk_handle, is_a_filter_partition, no_io);
-    if (filter.IsSet()) {
-      WriteLock wl(&mu_);
-      filter_block_set_.insert(fltr_blk_handle);
-      if (pin_cached_filters) {
-        std::pair<uint64_t, FilterBlockReader*> pair(fltr_blk_handle.offset(),
-                                                     filter.value);
-        auto succ = filter_cache_.insert(pair).second;
-        if (succ) {
-          handle_list_.push_back(filter.cache_handle);
-        }  // Otherwise it is already inserted by a concurrent thread
-        *cached = true;
+        return iter->second;
       }
     }
-    return filter;
+    return table_->GetFilter(/*prefetch_buffer*/ nullptr, fltr_blk_handle,
+                             is_a_filter_partition, no_io);
   } else {
     auto filter = table_->ReadFilter(prefetch_buffer, fltr_blk_handle,
                                      is_a_filter_partition);
@@ -244,4 +243,69 @@ size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const {
   return idx_on_fltr_blk_->size();
 }

+// TODO(myabandeh): merge this with the same function in IndexReader
+void PartitionedFilterBlockReader::CacheDependencies(bool pin) {
+  // Before reading partitions, prefetch them to avoid lots of IOs
+  auto rep = table_->rep_;
+  BlockIter biter;
+  BlockHandle handle;
+  idx_on_fltr_blk_->NewIterator(&comparator_, &biter, true);
+  // Index partitions are assumed to be consecutive. Prefetch them all.
+  // Read the first block offset
+  biter.SeekToFirst();
+  Slice input = biter.value();
+  Status s = handle.DecodeFrom(&input);
+  assert(s.ok());
+  if (!s.ok()) {
+    ROCKS_LOG_WARN(rep->ioptions.info_log,
+                   "Could not read first index partition");
+    return;
+  }
+  uint64_t prefetch_off = handle.offset();
+
+  // Read the last block's offset
+  biter.SeekToLast();
+  input = biter.value();
+  s = handle.DecodeFrom(&input);
+  assert(s.ok());
+  if (!s.ok()) {
+    ROCKS_LOG_WARN(rep->ioptions.info_log,
+                   "Could not read last index partition");
+    return;
+  }
+  uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize;
+  uint64_t prefetch_len = last_off - prefetch_off;
+  std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;
+  auto& file = table_->rep_->file;
+  prefetch_buffer.reset(new FilePrefetchBuffer());
+  s = prefetch_buffer->Prefetch(file.get(), prefetch_off, prefetch_len);
+
+  // After prefetch, read the partitions one by one
+  biter.SeekToFirst();
+  Cache* block_cache = rep->table_options.block_cache.get();
+  for (; biter.Valid(); biter.Next()) {
+    input = biter.value();
+    s = handle.DecodeFrom(&input);
+    assert(s.ok());
+    if (!s.ok()) {
+      ROCKS_LOG_WARN(rep->ioptions.info_log, "Could not read index partition");
+      continue;
+    }
+    const bool no_io = true;
+    const bool is_a_filter_partition = true;
+    auto filter = table_->GetFilter(prefetch_buffer.get(), handle,
+                                    is_a_filter_partition, !no_io);
+    if (LIKELY(filter.IsSet())) {
+      if (pin) {
+        filter_map_[handle.offset()] = std::move(filter);
+      } else {
+        block_cache->Release(filter.cache_handle);
+      }
+    } else {
+      delete filter.value;
+    }
+  }
+}
+
 }  // namespace rocksdb
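
The prefetch-range arithmetic in CacheDependencies above relies on the filter partitions being laid out consecutively in the SST file: one read from the first partition's offset to the end of the last partition (plus its block trailer) covers them all. A worked illustration with hypothetical offsets; kBlockTrailerSize is the 1-byte compression type plus the 4-byte checksum:

    #include <cassert>
    #include <cstdint>

    const uint64_t kBlockTrailerSize = 5;  // 1B compression type + 4B checksum

    uint64_t PrefetchLen(uint64_t first_off, uint64_t last_off,
                         uint64_t last_size) {
      // One contiguous range from the first partition to the end of the last.
      uint64_t last_end = last_off + last_size + kBlockTrailerSize;
      return last_end - first_off;
    }

    int main() {
      // Hypothetical layout: first partition at offset 4096, last partition
      // at offset 12288 holding 2000 bytes of filter data.
      assert(PrefetchLen(4096, 12288, 2000) == 10197);
      return 0;
    }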

table/partitioned_filter_block.h

@@ -88,20 +88,15 @@ class PartitionedFilterBlockReader : public FilterBlockReader {
   BlockBasedTable::CachableEntry<FilterBlockReader> GetFilterPartition(
       FilePrefetchBuffer* prefetch_buffer, Slice* handle, const bool no_io,
       bool* cached);
+  virtual void CacheDependencies(bool pin) override;

   const SliceTransform* prefix_extractor_;
   std::unique_ptr<Block> idx_on_fltr_blk_;
   const Comparator& comparator_;
   const BlockBasedTable* table_;
-  std::unordered_map<uint64_t, FilterBlockReader*> filter_cache_;
-  autovector<Cache::Handle*> handle_list_;
-  struct BlockHandleCmp {
-    bool operator()(const BlockHandle& lhs, const BlockHandle& rhs) const {
-      return lhs.offset() < rhs.offset();
-    }
-  };
-  std::set<BlockHandle, BlockHandleCmp> filter_block_set_;
-  port::RWMutex mu_;
+  std::unordered_map<uint64_t,
+                     BlockBasedTable::CachableEntry<FilterBlockReader>>
+      filter_map_;
 };

 }  // namespace rocksdb
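
The member change above collapses three synchronized containers (filter_cache_, handle_list_, and filter_block_set_ behind mu_) into one map whose values carry their own cache handle; pinning now happens once at open, so the mutex is no longer needed. A schematic of the resulting layout, with simplified stand-in types:

    #include <cstdint>
    #include <unordered_map>

    struct FilterBlockReader;  // stand-in for the real reader type
    struct CacheHandle;        // stand-in for Cache::Handle

    template <typename T>
    struct CachableEntry {     // mirrors BlockBasedTable::CachableEntry<T>
      T* value = nullptr;
      CacheHandle* cache_handle = nullptr;  // released/erased in the dtor
    };

    struct PartitionedFilterState {
      // Keyed by partition offset; each entry owns both the reader and its
      // cache handle, replacing filter_cache_ + handle_list_ + mu_.
      std::unordered_map<uint64_t, CachableEntry<FilterBlockReader>>
          filter_map_;
    };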

table/partitioned_filter_block_test.cc

@@ -22,11 +22,14 @@ std::map<uint64_t, Slice> slices;

 class MockedBlockBasedTable : public BlockBasedTable {
  public:
-  explicit MockedBlockBasedTable(Rep* rep) : BlockBasedTable(rep) {}
+  explicit MockedBlockBasedTable(Rep* rep) : BlockBasedTable(rep) {
+    // Initialize what Open normally does as much as necessary for the test
+    rep->cache_key_prefix_size = 10;
+  }

   virtual CachableEntry<FilterBlockReader> GetFilter(
-      const BlockHandle& filter_blk_handle, const bool is_a_filter_partition,
-      bool no_io) const override {
+      FilePrefetchBuffer*, const BlockHandle& filter_blk_handle,
+      const bool /* unused */, bool /* unused */) const override {
     Slice slice = slices[filter_blk_handle.offset()];
     auto obj = new FullFilterBlockReader(
         nullptr, true, BlockContents(slice, false, kNoCompression),
