From 5bf9a7d5ee367b542626f9e58041886e25d1650b Mon Sep 17 00:00:00 2001
From: Peter Dillinger
Date: Wed, 27 Oct 2021 17:21:48 -0700
Subject: [PATCH] Clarify caching behavior for index and filter partitions
 (#9068)

Summary:
Somewhat confusingly, index and filter partition blocks are never owned
by table readers, even with cache_index_and_filter_blocks=false. They
still go into block cache (possibly pinned by table reader) if there is
a block cache. If no block cache, they are only loaded transiently on
demand.

This PR primarily clarifies the options APIs and some internal code
comments.

Also, this closes a hypothetical data corruption vulnerability where
some but not all index partitions are pinned. I haven't been able to
reproduce a case where it can happen (the failure seems to propagate to
abort table open) but it's worth patching nonetheless.

Fixes https://github.com/facebook/rocksdb/issues/8979

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9068

Test Plan: existing tests :-/ I could cover the new code using sync
points, but then I'd have to very carefully relax my `assert(false)`

Reviewed By: ajkr

Differential Revision: D31898284

Pulled By: pdillinger

fbshipit-source-id: f2511a7d3a36bc04b627935d8e6cfea6422f98be
---
 include/rocksdb/table.h                       | 12 +++--
 table/block_based/block_based_table_reader.cc | 35 +++++++--------
 table/block_based/partitioned_filter_block.h  |  2 +
 table/block_based/partitioned_index_reader.cc | 45 +++++++++++++------
 table/block_based/partitioned_index_reader.h  |  3 ++
 table/two_level_iterator.cc                   |  4 ++
 6 files changed, 65 insertions(+), 36 deletions(-)

diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h
index 471102359..0090b7e15 100644
--- a/include/rocksdb/table.h
+++ b/include/rocksdb/table.h
@@ -117,9 +117,10 @@ struct BlockBasedTableOptions {
   // caching as they should now apply to range tombstone and compression
   // dictionary meta-blocks, in addition to index and filter meta-blocks.
   //
-  // Indicating if we'd put index/filter blocks to the block cache.
-  // If not specified, each "table reader" object will pre-load index/filter
-  // block during table initialization.
+  // Whether to put index/filter blocks in the block cache. When false,
+  // each "table reader" object will pre-load index/filter blocks during
+  // table initialization. Index and filter partition blocks always use
+  // block cache regardless of this option.
   bool cache_index_and_filter_blocks = false;
 
   // If cache_index_and_filter_blocks is enabled, cache index and filter
@@ -190,6 +191,8 @@ struct BlockBasedTableOptions {
     kHashSearch = 0x01,
 
     // A two-level index implementation. Both levels are binary search indexes.
+    // Second level index blocks ("partitions") use block cache even when
+    // cache_index_and_filter_blocks=false.
     kTwoLevelIndexSearch = 0x02,
 
     // Like kBinarySearch, but index also contains first key of each block.
@@ -285,7 +288,8 @@ struct BlockBasedTableOptions {
   // well.
   // TODO(myabandeh): remove the note above once the limitation is lifted
   // Use partitioned full filters for each SST file. This option is
-  // incompatible with block-based filters.
+  // incompatible with block-based filters. Filter partition blocks use
+  // block cache even when cache_index_and_filter_blocks=false.
   bool partition_filters = false;
 
   // Option to generate Bloom/Ribbon filters that minimize memory
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index ad8ee53f9..96d8895e2 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -2063,24 +2063,23 @@ BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator(
     const BlockHandle& handle) {
   // Return a block iterator on the index partition
   auto block = block_map_->find(handle.offset());
-  // This is a possible scenario since block cache might not have had space
-  // for the partition
-  if (block != block_map_->end()) {
-    const Rep* rep = table_->get_rep();
-    assert(rep);
-
-    Statistics* kNullStats = nullptr;
-    // We don't return pinned data from index blocks, so no need
-    // to set `block_contents_pinned`.
-    return block->second.GetValue()->NewIndexIterator(
-        rep->internal_comparator.user_comparator(),
-        rep->get_global_seqno(BlockType::kIndex), nullptr, kNullStats, true,
-        rep->index_has_first_key, rep->index_key_includes_seq,
-        rep->index_value_is_full);
-  }
-  // Create an empty iterator
-  // TODO(ajkr): this is not the right way to handle an unpinned partition.
-  return new IndexBlockIter();
+  // block_map_ must be exhaustive
+  if (block == block_map_->end()) {
+    assert(false);
+    // Signal problem to caller
+    return nullptr;
+  }
+  const Rep* rep = table_->get_rep();
+  assert(rep);
+
+  Statistics* kNullStats = nullptr;
+  // We don't return pinned data from index blocks, so no need
+  // to set `block_contents_pinned`.
+  return block->second.GetValue()->NewIndexIterator(
+      rep->internal_comparator.user_comparator(),
+      rep->get_global_seqno(BlockType::kIndex), nullptr, kNullStats, true,
+      rep->index_has_first_key, rep->index_key_includes_seq,
+      rep->index_value_is_full);
 }
 
 // This will be broken if the user specifies an unusual implementation
diff --git a/table/block_based/partitioned_filter_block.h b/table/block_based/partitioned_filter_block.h
index 75891155d..6393a25ca 100644
--- a/table/block_based/partitioned_filter_block.h
+++ b/table/block_based/partitioned_filter_block.h
@@ -142,6 +142,8 @@ class PartitionedFilterBlockReader : public FilterBlockReaderCommon<Block> {
   bool index_value_is_full() const;
 
  protected:
+  // For partition blocks pinned in cache. Can be a subset of blocks
+  // in case some fail insertion on attempt to pin.
   std::unordered_map<uint64_t, CachableEntry<ParsedFullFilterBlock>>
       filter_map_;
 };
diff --git a/table/block_based/partitioned_index_reader.cc b/table/block_based/partitioned_index_reader.cc
index acb40f125..5a0e41468 100644
--- a/table/block_based/partitioned_index_reader.cc
+++ b/table/block_based/partitioned_index_reader.cc
@@ -114,10 +114,12 @@ Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro,
   Statistics* kNullStats = nullptr;
 
   CachableEntry<Block> index_block;
-  Status s = GetOrReadIndexBlock(false /* no_io */, nullptr /* get_context */,
-                                 &lookup_context, &index_block);
-  if (!s.ok()) {
-    return s;
+  {
+    Status s = GetOrReadIndexBlock(false /* no_io */, nullptr /* get_context */,
+                                   &lookup_context, &index_block);
+    if (!s.ok()) {
+      return s;
+    }
   }
 
   // We don't return pinned data from index blocks, so no need
@@ -149,23 +151,30 @@ Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro,
   rep->CreateFilePrefetchBuffer(0, 0, &prefetch_buffer,
                                 false /*Implicit auto readahead*/);
   IOOptions opts;
-  s = rep->file->PrepareIOOptions(ro, opts);
-  if (s.ok()) {
-    s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off,
-                                  static_cast<size_t>(prefetch_len));
-  }
-  if (!s.ok()) {
-    return s;
+  {
+    Status s = rep->file->PrepareIOOptions(ro, opts);
+    if (s.ok()) {
+      s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off,
+                                    static_cast<size_t>(prefetch_len));
+    }
+    if (!s.ok()) {
+      return s;
+    }
   }
 
+  // For saving "all or nothing" to partition_map_
+  std::unordered_map<uint64_t, CachableEntry<Block>> map_in_progress;
+
   // After prefetch, read the partitions one by one
   biter.SeekToFirst();
+  size_t partition_count = 0;
   for (; biter.Valid(); biter.Next()) {
     handle = biter.value().handle;
     CachableEntry<Block> block;
+    ++partition_count;
     // TODO: Support counter batch update for partitioned index and
     // filter blocks
-    s = table()->MaybeReadBlockAndLoadToCache(
+    Status s = table()->MaybeReadBlockAndLoadToCache(
         prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(),
         /*wait=*/true, &block, BlockType::kIndex, /*get_context=*/nullptr,
         &lookup_context, /*contents=*/nullptr);
@@ -174,14 +183,22 @@ Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro,
       return s;
     }
     if (block.GetValue() != nullptr) {
+      // Might need to "pin" some mmap-read blocks (GetOwnValue) if some
+      // partitions are successfully compressed (cached) and some are not
+      // compressed (mmap eligible)
       if (block.IsCached() || block.GetOwnValue()) {
         if (pin) {
-          partition_map_[handle.offset()] = std::move(block);
+          map_in_progress[handle.offset()] = std::move(block);
         }
       }
     }
   }
-  return biter.status();
+  Status s = biter.status();
+  // Save (pin) them only if everything checks out
+  if (map_in_progress.size() == partition_count && s.ok()) {
+    std::swap(partition_map_, map_in_progress);
+  }
+  return s;
 }
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/partitioned_index_reader.h b/table/block_based/partitioned_index_reader.h
index 7e97f6561..22a0e3d04 100644
--- a/table/block_based/partitioned_index_reader.h
+++ b/table/block_based/partitioned_index_reader.h
@@ -46,6 +46,9 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon {
                        CachableEntry<Block>&& index_block)
       : IndexReaderCommon(t, std::move(index_block)) {}
 
+  // For partition blocks pinned in cache. This is expected to be "all or
+  // none" so that !partition_map_.empty() can use an iterator expecting
+  // all partitions to be saved here.
   std::unordered_map<uint64_t, CachableEntry<Block>> partition_map_;
 };
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc
index d967deff8..a744cf921 100644
--- a/table/two_level_iterator.cc
+++ b/table/two_level_iterator.cc
@@ -201,6 +201,10 @@ void TwoLevelIndexIterator::InitDataBlock() {
           state_->NewSecondaryIterator(handle);
       data_block_handle_ = handle;
       SetSecondLevelIterator(iter);
+      if (iter == nullptr) {
+        status_ = Status::Corruption("Missing block for partition " +
+                                     handle.ToString());
+      }
     }
   }
 }
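
For context, a minimal usage sketch of the option interaction that the
clarified comments describe; this is not part of the diff. The 256 MB cache
size, 10 bits/key Bloom setting, and DB path are arbitrary placeholder
choices. With a partitioned index (kTwoLevelIndexSearch) and
partition_filters=true, the index/filter partition blocks go through
block_cache even though cache_index_and_filter_blocks is left at its default
of false, so configuring a block cache is the sensible setup here.

#include <cassert>

#include "rocksdb/cache.h"
#include "rocksdb/db.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"

int main() {
  rocksdb::BlockBasedTableOptions table_options;
  // Partitioned index and partitioned filters: partition blocks always use
  // the block cache, regardless of cache_index_and_filter_blocks.
  table_options.index_type =
      rocksdb::BlockBasedTableOptions::kTwoLevelIndexSearch;
  table_options.partition_filters = true;
  table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10));
  table_options.cache_index_and_filter_blocks = false;  // default
  // Provide a block cache so partition blocks have somewhere to live;
  // without one they are only loaded transiently on demand.
  table_options.block_cache = rocksdb::NewLRUCache(256 << 20 /* arbitrary */);

  rocksdb::Options options;
  options.create_if_missing = true;
  options.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(table_options));

  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/rocksdb_partition_example", &db);
  assert(s.ok());
  delete db;
  return 0;
}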