Move the index readers out of the block cache (#5298)

Summary:
Currently, when the block cache is used for index blocks as well, it is
not really the index block that is stored in the cache but an
IndexReader object. Since this object is not pure data (it has, for
instance, pointers that might dangle), it's not really sharable. To
avoid the issues around this, the current code uses a dummy unique cache
key for each TableReader to store the IndexReader, and erases the
IndexReader entry when the TableReader is closed. Instead of doing this,
the new code moves the IndexReader out of the cache altogether. In
particular, instead of the TableReader owning or caching/pinning the
IndexReader based on the user's settings, the TableReader
unconditionally owns the IndexReader, which in turn owns/caches/pins
the index block (which is itself sharable and thus can be safely put in
the cache without any hacks).
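
To illustrate the new ownership model, here is a minimal C++ sketch; the
types below are illustrative stand-ins, not the actual RocksDB classes:

#include <memory>

struct Block {};        // pure data: sharable, safe to store in the cache
struct CacheHandle {};  // opaque handle to an entry in the block cache

// The index block is held either directly (owned) or through a pinned
// cache handle, depending on the cache/pin settings.
class IndexReader {
 public:
  virtual ~IndexReader() = default;

 private:
  std::unique_ptr<Block> owned_index_block_;    // no block cache configured
  CacheHandle* pinned_cache_handle_ = nullptr;  // block lives in the cache
};

// The table reader unconditionally owns the index reader; the IndexReader
// object itself is never stored in the block cache anymore.
class TableReader {
 private:
  std::unique_ptr<IndexReader> index_reader_;
};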

Note: the change has two side effects:
1) Partitions of partitioned indexes no longer affect the read
amplification statistics.
2) Eviction statistics for index blocks are temporarily broken. We plan to fix
this in a separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5298

Differential Revision: D15303203

Pulled By: ltamasi

fbshipit-source-id: 935a69ba59d87d5e44f42e2310619b790c366e47
Branch: main
Author: Levi Tamasi (committed by Facebook GitHub Bot)
Parent: bd44ec2006
Commit: 1e35584251
Changed files:
  1. HISTORY.md (2)
  2. db/db_block_cache_test.cc (12)
  3. table/block_based_table_reader.cc (985)
  4. table/block_based_table_reader.h (114)
  5. table/table_test.cc (67)

HISTORY.md:
@@ -2,6 +2,8 @@
 ## Unreleased
 ### Public API Change
 * Now DB::Close() will return Aborted() error when there is unreleased snapshot. Users can retry after all snapshots are released.
+* Partitions of partitioned indexes no longer affect the read amplification statistics.
+* Due to a refactoring, block cache eviction statistics for indexes are temporarily broken. We plan to reintroduce them in a later phase.
 ### New Features
 * Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature.

db/db_block_cache_test.cc:
@@ -365,7 +365,10 @@ TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) {
   ASSERT_EQ(cache->GetUsage(), index_bytes_insert + filter_bytes_insert);
   // set the cache capacity to the current usage
   cache->SetCapacity(index_bytes_insert + filter_bytes_insert);
-  ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT), 0);
+  // The index eviction statistics were broken by the refactoring that moved
+  // the index readers out of the block cache. Disabling these until we can
+  // bring the stats back.
+  // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT), 0);
   ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT), 0);
   // Note that the second key needs to be no longer than the first one.
   // Otherwise the second index block may not fit in cache.
@@ -377,8 +380,11 @@ TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) {
             index_bytes_insert);
   ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_INSERT),
             filter_bytes_insert);
-  ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT),
-            index_bytes_insert);
+  // The index eviction statistics were broken by the refactoring that moved
+  // the index readers out of the block cache. Disabling these until we can
+  // bring the stats back.
+  // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT),
+  //           index_bytes_insert);
   ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT),
             filter_bytes_insert);
 }
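
For context on the tickers exercised above: applications read them through
the `Statistics` object attached to the DB options. A minimal sketch using
the public RocksDB API; the database path and the elided workload are
placeholders:

#include <cassert>
#include <cstdint>

#include "rocksdb/db.h"
#include "rocksdb/statistics.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.statistics = rocksdb::CreateDBStatistics();

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/ticker_demo", &db);
  assert(s.ok());

  // ... reads that pull index/filter blocks into the block cache ...

  // Tickers are cumulative counters. The eviction ticker for index blocks
  // is the one temporarily broken by this change.
  uint64_t index_inserted = options.statistics->getTickerCount(
      rocksdb::BLOCK_CACHE_INDEX_BYTES_INSERT);
  uint64_t index_evicted = options.statistics->getTickerCount(
      rocksdb::BLOCK_CACHE_INDEX_BYTES_EVICT);
  (void)index_inserted;
  (void)index_evicted;

  delete db;
  return 0;
}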

table/block_based_table_reader.cc: file diff suppressed because it is too large.

table/block_based_table_reader.h:
@@ -150,6 +150,8 @@ class BlockBasedTable : public TableReader {
   // be close to the file length.
   uint64_t ApproximateOffsetOf(const Slice& key) override;

+  bool TEST_BlockInCache(const BlockHandle& handle) const;
+
   // Returns true if the block for the specified key is in cache.
   // REQUIRES: key is in this table && block cache enabled
   bool TEST_KeyInCache(const ReadOptions& options, const Slice& key);
@@ -173,54 +175,35 @@ class BlockBasedTable : public TableReader {
   ~BlockBasedTable();

   bool TEST_filter_block_preloaded() const;
-  bool TEST_index_reader_preloaded() const;
+  bool TEST_IndexBlockInCache() const;

-  // IndexReader is the interface that provide the functionality for index
+  // IndexReader is the interface that provides the functionality for index
   // access.
   class IndexReader {
    public:
-    explicit IndexReader(const InternalKeyComparator* icomparator,
-                         Statistics* stats)
-        : icomparator_(icomparator), statistics_(stats) {}
-
-    virtual ~IndexReader() {}
-
-    // Create an iterator for index access.
-    // If iter is null then a new object is created on heap and the callee will
-    // have the ownership. If a non-null iter is passed in it will be used, and
-    // the returned value is either the same as iter or a new on-heap object
-    // that
-    // wrapps the passed iter. In the latter case the return value would point
-    // to
-    // a different object then iter and the callee has the ownership of the
+    virtual ~IndexReader() = default;
+
+    // Create an iterator for index access. If iter is null, then a new object
+    // is created on the heap, and the callee will have the ownership.
+    // If a non-null iter is passed in, it will be used, and the returned value
+    // is either the same as iter or a new on-heap object that
+    // wraps the passed iter. In the latter case the return value points
+    // to a different object then iter, and the callee has the ownership of the
     // returned object.
     virtual InternalIteratorBase<BlockHandle>* NewIterator(
-        IndexBlockIter* iter = nullptr, bool total_order_seek = true,
-        bool fill_cache = true) = 0;
-
-    // The size of the index.
-    virtual size_t size() const = 0;
-    // Memory usage of the index block
-    virtual size_t usable_size() const = 0;
-    // return the statistics pointer
-    virtual Statistics* statistics() const { return statistics_; }
+        const ReadOptions& read_options, bool disable_prefix_seek,
+        IndexBlockIter* iter, GetContext* get_context) = 0;
+
     // Report an approximation of how much memory has been used other than
-    // memory
-    // that was allocated in block cache.
+    // memory that was allocated in block cache.
     virtual size_t ApproximateMemoryUsage() const = 0;
-
-    virtual void CacheDependencies(bool /* unused */) {}
-
-    // Prefetch all the blocks referenced by this index to the buffer
-    void PrefetchBlocks(FilePrefetchBuffer* buf);
-
-   protected:
-    const InternalKeyComparator* icomparator_;
-
-   private:
-    Statistics* statistics_;
+
+    // Cache the dependencies of the index reader (e.g. the partitions
+    // of a partitioned index).
+    virtual void CacheDependencies(bool /* pin */) {}
   };

+  class IndexReaderCommon;
+
   static Slice GetCacheKey(const char* cache_key_prefix,
                            size_t cache_key_prefix_size,
                            const BlockHandle& handle, char* cache_key);
@@ -271,11 +254,22 @@ class BlockBasedTable : public TableReader {
   // in uncompressed block cache, also sets cache_handle to reference that
   // block.
   static Status MaybeReadBlockAndLoadToCache(
-      FilePrefetchBuffer* prefetch_buffer, Rep* rep, const ReadOptions& ro,
-      const BlockHandle& handle, const UncompressionDict& uncompression_dict,
+      FilePrefetchBuffer* prefetch_buffer, const Rep* rep,
+      const ReadOptions& ro, const BlockHandle& handle,
+      const UncompressionDict& uncompression_dict,
       CachableEntry<Block>* block_entry, bool is_index = false,
       GetContext* get_context = nullptr);

+  // Similar to the above, with one crucial difference: it will retrieve the
+  // block from the file even if there are no caches configured (assuming the
+  // read options allow I/O).
+  static Status RetrieveBlock(
+      FilePrefetchBuffer* prefetch_buffer, const Rep* rep,
+      const ReadOptions& ro, const BlockHandle& handle,
+      const UncompressionDict& uncompression_dict,
+      CachableEntry<Block>* block_entry, bool is_index,
+      GetContext* get_context);
+
   // For the following two functions:
   // if `no_io == true`, we will not try to read filter/index from sst file
   // were they not present in cache yet.
@@ -305,7 +299,6 @@ class BlockBasedTable : public TableReader {
   InternalIteratorBase<BlockHandle>* NewIndexIterator(
       const ReadOptions& read_options, bool need_upper_bound_check = false,
       IndexBlockIter* input_iter = nullptr,
-      CachableEntry<IndexReader>* index_entry = nullptr,
       GetContext* get_context = nullptr);

   // Read block cache from block caches (if set): block_cache and
@@ -316,7 +309,7 @@ class BlockBasedTable : public TableReader {
   // dictionary.
   static Status GetDataBlockFromCache(
       const Slice& block_cache_key, const Slice& compressed_block_cache_key,
-      Cache* block_cache, Cache* block_cache_compressed, Rep* rep,
+      Cache* block_cache, Cache* block_cache_compressed, const Rep* rep,
       const ReadOptions& read_options,
       CachableEntry<Block>* block, const UncompressionDict& uncompression_dict,
       size_t read_amp_bytes_per_bit, bool is_index = false,
@@ -359,9 +352,9 @@ class BlockBasedTable : public TableReader {
   // need to access extra meta blocks for index construction. This parameter
   // helps avoid re-reading meta index block if caller already created one.
   Status CreateIndexReader(
-      FilePrefetchBuffer* prefetch_buffer, IndexReader** index_reader,
-      InternalIterator* preloaded_meta_index_iter = nullptr,
-      const int level = -1);
+      FilePrefetchBuffer* prefetch_buffer,
+      InternalIterator* preloaded_meta_index_iter, bool use_cache,
+      bool prefetch, bool pin, IndexReader** index_reader);

   bool FullFilterKeyMayMatch(
       const ReadOptions& read_options, FilterBlockReader* filter,
@@ -398,9 +391,8 @@ class BlockBasedTable : public TableReader {
   static Status PrefetchIndexAndFilterBlocks(
       Rep* rep, FilePrefetchBuffer* prefetch_buffer,
       InternalIterator* meta_iter, BlockBasedTable* new_table,
-      const SliceTransform* prefix_extractor, bool prefetch_all,
-      const BlockBasedTableOptions& table_options, const int level,
-      const bool prefetch_index_and_filter_in_cache);
+      bool prefetch_all, const BlockBasedTableOptions& table_options,
+      const int level);

   Status VerifyChecksumInMetaBlocks(InternalIteratorBase<Slice>* index_iter);
   Status VerifyChecksumInBlocks(InternalIteratorBase<BlockHandle>* index_iter);
@@ -411,7 +403,7 @@ class BlockBasedTable : public TableReader {
       const bool is_a_filter_partition,
       const SliceTransform* prefix_extractor = nullptr) const;

-  static void SetupCacheKeyPrefix(Rep* rep, uint64_t file_size);
+  static void SetupCacheKeyPrefix(Rep* rep);

   // Generate a cache key prefix from the file
   static void GenerateCachePrefix(Cache* cc, RandomAccessFile* file,
@@ -486,18 +478,21 @@ struct BlockBasedTable::Rep {
   size_t persistent_cache_key_prefix_size = 0;
   char compressed_cache_key_prefix[kMaxCacheKeyPrefixSize];
   size_t compressed_cache_key_prefix_size = 0;
-  uint64_t dummy_index_reader_offset =
-      0;  // ID that is unique for the block cache.
   PersistentCacheOptions persistent_cache_options;

   // Footer contains the fixed table information
   Footer footer;
-  // `index_reader`, `filter`, and `uncompression_dict` will be populated (i.e.,
-  // non-nullptr) and used only when options.block_cache is nullptr or when
-  // `cache_index_and_filter_blocks == false`. Otherwise, we will get the index,
-  // filter, and compression dictionary blocks via the block cache. In that case
-  // `dummy_index_reader_offset`, `filter_handle`, and `compression_dict_handle`
-  // are used to lookup these meta-blocks in block cache.
+  // `filter` and `uncompression_dict` will be populated (i.e., non-nullptr)
+  // and used only when options.block_cache is nullptr or when
+  // `cache_index_and_filter_blocks == false`. Otherwise, we will get the
+  // filter and compression dictionary blocks via the block cache. In that case,
+  // `filter_handle`, and `compression_dict_handle` are used to lookup these
+  // meta-blocks in block cache.
+  //
+  // Note: the IndexReader object is always stored in this member variable;
+  // the index block itself, however, may or may not be in the block cache
+  // based on the settings above. We plan to change the handling of the
+  // filter and compression dictionary similarly.
   std::unique_ptr<IndexReader> index_reader;
   std::unique_ptr<FilterBlockReader> filter;
   std::unique_ptr<UncompressionDict> uncompression_dict;
@@ -526,12 +521,11 @@ struct BlockBasedTable::Rep {
   // only used in level 0 files when pin_l0_filter_and_index_blocks_in_cache is
   // true or in all levels when pin_top_level_index_and_filter is set in
-  // combination with partitioned index/filters: then we do use the LRU cache,
-  // but we always keep the filter & index block's handle checked out here (=we
+  // combination with partitioned filters: then we do use the LRU cache,
+  // but we always keep the filter block's handle checked out here (=we
   // don't call Release()), plus the parsed out objects the LRU cache will never
   // push flush them out, hence they're pinned
   CachableEntry<FilterBlockReader> filter_entry;
-  CachableEntry<IndexReader> index_entry;
   std::shared_ptr<const FragmentedRangeTombstoneList> fragmented_range_dels;

   // If global_seqno is used, all Keys in this file will have the same

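The `RetrieveBlock` helper declared in the header above encapsulates a
cache-then-file lookup order. A hedged sketch of that pattern follows, with
simplified stand-in types rather than the actual RocksDB implementation:

#include <cstdint>
#include <string>
#include <unordered_map>

// Simplified stand-ins, not the actual RocksDB types.
struct Block { std::string contents; };
enum class ReadTier { kReadAllTier, kBlockCacheTier };

struct SketchCache {
  std::unordered_map<uint64_t, Block> entries;  // keyed by block offset
  const Block* Lookup(uint64_t offset) const {
    auto it = entries.find(offset);
    return it == entries.end() ? nullptr : &it->second;
  }
};

// Try the block cache first; fall back to a file read only when the read
// options allow I/O (mirroring the "crucial difference" noted above).
const Block* RetrieveBlockSketch(SketchCache& cache, uint64_t offset,
                                 ReadTier tier,
                                 Block (*read_from_file)(uint64_t)) {
  if (const Block* cached = cache.Lookup(offset)) {
    return cached;  // cache hit
  }
  if (tier == ReadTier::kBlockCacheTier) {
    return nullptr;  // cache-only read requested; report "incomplete"
  }
  // Cache miss with I/O allowed: read the block and cache it for next time.
  cache.entries[offset] = read_from_file(offset);
  return &cache.entries[offset];
}
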
table/table_test.cc:
@@ -1993,7 +1993,7 @@ TEST_P(BlockBasedTableTest, BlockCacheDisabledTest) {
   // preloading filter/index blocks is enabled.
   auto reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader());
   ASSERT_TRUE(reader->TEST_filter_block_preloaded());
-  ASSERT_TRUE(reader->TEST_index_reader_preloaded());
+  ASSERT_FALSE(reader->TEST_IndexBlockInCache());

   {
     // nothing happens in the beginning
@@ -2040,7 +2040,7 @@ TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) {
   // preloading filter/index blocks is prohibited.
   auto* reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader());
   ASSERT_TRUE(!reader->TEST_filter_block_preloaded());
-  ASSERT_TRUE(!reader->TEST_index_reader_preloaded());
+  ASSERT_TRUE(reader->TEST_IndexBlockInCache());

   // -- PART 1: Open with regular block cache.
   // Since block_cache is disabled, no cache activities will be involved.
@@ -2612,69 +2612,6 @@ TEST_P(BlockBasedTableTest, MemoryAllocator) {
   EXPECT_GT(custom_memory_allocator->numAllocations.load(), 0);
 }

-TEST_P(BlockBasedTableTest, NewIndexIteratorLeak) {
-  // A regression test to avoid data race described in
-  // https://github.com/facebook/rocksdb/issues/1267
-  TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
-  std::vector<std::string> keys;
-  stl_wrappers::KVMap kvmap;
-  c.Add("a1", "val1");
-  Options options;
-  options.prefix_extractor.reset(NewFixedPrefixTransform(1));
-  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
-  table_options.index_type = BlockBasedTableOptions::kHashSearch;
-  table_options.cache_index_and_filter_blocks = true;
-  table_options.block_cache = NewLRUCache(0);
-  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-  const ImmutableCFOptions ioptions(options);
-  const MutableCFOptions moptions(options);
-  c.Finish(options, ioptions, moptions, table_options,
-           GetPlainInternalComparator(options.comparator), &keys, &kvmap);
-
-  rocksdb::SyncPoint::GetInstance()->LoadDependencyAndMarkers(
-      {
-          {"BlockBasedTable::NewIndexIterator::thread1:1",
-           "BlockBasedTable::NewIndexIterator::thread2:2"},
-          {"BlockBasedTable::NewIndexIterator::thread2:3",
-           "BlockBasedTable::NewIndexIterator::thread1:4"},
-      },
-      {
-          {"BlockBasedTableTest::NewIndexIteratorLeak:Thread1Marker",
-           "BlockBasedTable::NewIndexIterator::thread1:1"},
-          {"BlockBasedTableTest::NewIndexIteratorLeak:Thread1Marker",
-           "BlockBasedTable::NewIndexIterator::thread1:4"},
-          {"BlockBasedTableTest::NewIndexIteratorLeak:Thread2Marker",
-           "BlockBasedTable::NewIndexIterator::thread2:2"},
-          {"BlockBasedTableTest::NewIndexIteratorLeak:Thread2Marker",
-           "BlockBasedTable::NewIndexIterator::thread2:3"},
-      });
-
-  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
-  ReadOptions ro;
-  auto* reader = c.GetTableReader();
-
-  std::function<void()> func1 = [&]() {
-    TEST_SYNC_POINT("BlockBasedTableTest::NewIndexIteratorLeak:Thread1Marker");
-    // TODO(Zhongyi): update test to use MutableCFOptions
-    std::unique_ptr<InternalIterator> iter(
-        reader->NewIterator(ro, moptions.prefix_extractor.get()));
-    iter->Seek(InternalKey("a1", 0, kTypeValue).Encode());
-  };
-
-  std::function<void()> func2 = [&]() {
-    TEST_SYNC_POINT("BlockBasedTableTest::NewIndexIteratorLeak:Thread2Marker");
-    std::unique_ptr<InternalIterator> iter(
-        reader->NewIterator(ro, moptions.prefix_extractor.get()));
-  };
-
-  auto thread1 = port::Thread(func1);
-  auto thread2 = port::Thread(func2);
-  thread1.join();
-  thread2.join();
-  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
-  c.ResetTableReader();
-}
-
 // Plain table is not supported in ROCKSDB_LITE
 #ifndef ROCKSDB_LITE
 TEST_F(PlainTableTest, BasicPlainTableProperties) {
