Clarify caching behavior for index and filter partitions (#9068)

Summary:
Somewhat confusingly, index and filter partition blocks are
never owned by table readers, even with
cache_index_and_filter_blocks=false. They still go into block cache
(possibly pinned by table reader) if there is a block cache. If no block
cache, they are only loaded transiently on demand.

This PR primarily clarifies the options APIs and some internal code
comments.

Also, this closes a hypothetical data corruption vulnerability where
some but not all index partitions are pinned. I haven't been able to
reproduce a case where it can happen (the failure seems to propagate
to abort table open) but it's worth patching nonetheless.

Fixes https://github.com/facebook/rocksdb/issues/8979

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9068

Test Plan:
existing tests :-/  I could cover the new code using sync
points, but then I'd have to very carefully relax my `assert(false)`

Reviewed By: ajkr

Differential Revision: D31898284

Pulled By: pdillinger

fbshipit-source-id: f2511a7d3a36bc04b627935d8e6cfea6422f98be
main
Peter Dillinger 3 years ago committed by Facebook GitHub Bot
parent 82846f41d3
commit 5bf9a7d5ee
  1. 12
      include/rocksdb/table.h
  2. 13
      table/block_based/block_based_table_reader.cc
  3. 2
      table/block_based/partitioned_filter_block.h
  4. 25
      table/block_based/partitioned_index_reader.cc
  5. 3
      table/block_based/partitioned_index_reader.h
  6. 4
      table/two_level_iterator.cc

@ -117,9 +117,10 @@ struct BlockBasedTableOptions {
// caching as they should now apply to range tombstone and compression
// dictionary meta-blocks, in addition to index and filter meta-blocks.
//
// Indicating if we'd put index/filter blocks to the block cache.
// If not specified, each "table reader" object will pre-load index/filter
// block during table initialization.
// Whether to put index/filter blocks in the block cache. When false,
// each "table reader" object will pre-load index/filter blocks during
// table initialization. Index and filter partition blocks always use
// block cache regardless of this option.
bool cache_index_and_filter_blocks = false;
// If cache_index_and_filter_blocks is enabled, cache index and filter
@ -190,6 +191,8 @@ struct BlockBasedTableOptions {
kHashSearch = 0x01,
// A two-level index implementation. Both levels are binary search indexes.
// Second level index blocks ("partitions") use block cache even when
// cache_index_and_filter_blocks=false.
kTwoLevelIndexSearch = 0x02,
// Like kBinarySearch, but index also contains first key of each block.
@ -285,7 +288,8 @@ struct BlockBasedTableOptions {
// well.
// TODO(myabandeh): remove the note above once the limitation is lifted
// Use partitioned full filters for each SST file. This option is
// incompatible with block-based filters.
// incompatible with block-based filters. Filter partition blocks use
// block cache even when cache_index_and_filter_blocks=false.
bool partition_filters = false;
// Option to generate Bloom/Ribbon filters that minimize memory

@ -2063,9 +2063,12 @@ BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator(
const BlockHandle& handle) {
// Return a block iterator on the index partition
auto block = block_map_->find(handle.offset());
// This is a possible scenario since block cache might not have had space
// for the partition
if (block != block_map_->end()) {
// block_map_ must be exhaustive
if (block == block_map_->end()) {
assert(false);
// Signal problem to caller
return nullptr;
}
const Rep* rep = table_->get_rep();
assert(rep);
@ -2078,10 +2081,6 @@ BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator(
rep->index_has_first_key, rep->index_key_includes_seq,
rep->index_value_is_full);
}
// Create an empty iterator
// TODO(ajkr): this is not the right way to handle an unpinned partition.
return new IndexBlockIter();
}
// This will be broken if the user specifies an unusual implementation
// of Options.comparator, or if the user specifies an unusual

@ -142,6 +142,8 @@ class PartitionedFilterBlockReader : public FilterBlockReaderCommon<Block> {
bool index_value_is_full() const;
protected:
// For partition blocks pinned in cache. Can be a subset of blocks
// in case some fail insertion on attempt to pin.
std::unordered_map<uint64_t, CachableEntry<ParsedFullFilterBlock>>
filter_map_;
};

@ -114,11 +114,13 @@ Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro,
Statistics* kNullStats = nullptr;
CachableEntry<Block> index_block;
{
Status s = GetOrReadIndexBlock(false /* no_io */, nullptr /* get_context */,
&lookup_context, &index_block);
if (!s.ok()) {
return s;
}
}
// We don't return pinned data from index blocks, so no need
// to set `block_contents_pinned`.
@ -149,7 +151,8 @@ Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro,
rep->CreateFilePrefetchBuffer(0, 0, &prefetch_buffer,
false /*Implicit auto readahead*/);
IOOptions opts;
s = rep->file->PrepareIOOptions(ro, opts);
{
Status s = rep->file->PrepareIOOptions(ro, opts);
if (s.ok()) {
s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off,
static_cast<size_t>(prefetch_len));
@ -157,15 +160,21 @@ Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro,
if (!s.ok()) {
return s;
}
}
// For saving "all or nothing" to partition_map_
std::unordered_map<uint64_t, CachableEntry<Block>> map_in_progress;
// After prefetch, read the partitions one by one
biter.SeekToFirst();
size_t partition_count = 0;
for (; biter.Valid(); biter.Next()) {
handle = biter.value().handle;
CachableEntry<Block> block;
++partition_count;
// TODO: Support counter batch update for partitioned index and
// filter blocks
s = table()->MaybeReadBlockAndLoadToCache(
Status s = table()->MaybeReadBlockAndLoadToCache(
prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(),
/*wait=*/true, &block, BlockType::kIndex, /*get_context=*/nullptr,
&lookup_context, /*contents=*/nullptr);
@ -174,14 +183,22 @@ Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro,
return s;
}
if (block.GetValue() != nullptr) {
// Might need to "pin" some mmap-read blocks (GetOwnValue) if some
// partitions are successfully compressed (cached) and some are not
// compressed (mmap eligible)
if (block.IsCached() || block.GetOwnValue()) {
if (pin) {
partition_map_[handle.offset()] = std::move(block);
map_in_progress[handle.offset()] = std::move(block);
}
}
}
}
return biter.status();
Status s = biter.status();
// Save (pin) them only if everything checks out
if (map_in_progress.size() == partition_count && s.ok()) {
std::swap(partition_map_, map_in_progress);
}
return s;
}
} // namespace ROCKSDB_NAMESPACE

@ -46,6 +46,9 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon {
CachableEntry<Block>&& index_block)
: IndexReaderCommon(t, std::move(index_block)) {}
// For partition blocks pinned in cache. This is expected to be "all or
// none" so that !partition_map_.empty() can use an iterator expecting
// all partitions to be saved here.
std::unordered_map<uint64_t, CachableEntry<Block>> partition_map_;
};
} // namespace ROCKSDB_NAMESPACE

@ -201,6 +201,10 @@ void TwoLevelIndexIterator::InitDataBlock() {
state_->NewSecondaryIterator(handle);
data_block_handle_ = handle;
SetSecondLevelIterator(iter);
if (iter == nullptr) {
status_ = Status::Corruption("Missing block for partition " +
handle.ToString());
}
}
}
}

Loading…
Cancel
Save