Move the index readers out of the block cache (#5298)

Summary:
Currently, when the block cache is used for index blocks as well, it is
not really the index block that is stored in the cache but an
IndexReader object. Since this object is not pure data (it has, for
instance, pointers that might dangle), it's not really sharable. To
avoid the issues around this, the current code uses a dummy unique cache
key for each TableReader to store the IndexReader, and erases the
IndexReader entry when the TableReader is closed. Instead of doing this,
the new code moves the IndexReader out of the cache altogether. In
particular, instead of the TableReader owning or caching/pinning the
IndexReader based on the customer's settings, the TableReader now
unconditionally owns the IndexReader, which in turn owns/caches/pins
the index block (which is itself sharable and thus can be safely put in
the cache without any hacks).
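To make the new ownership arrangement concrete, the following toy C++ sketch models it with stand-in types. The names Block, CachableEntry, IndexReader and TableReader mirror the real classes touched by the diff below, but everything else is invented for illustration and is not the actual RocksDB code.

// Toy model of the ownership described above: the table reader always owns
// the IndexReader object, while the index *block* may either be pinned by
// the reader or live in the block cache.
#include <memory>

struct Block {};  // stand-in for the sharable index block

template <typename T>
struct CachableEntry {
  T* value = nullptr;
  bool own_value = false;  // true: owned here; false: owned by the block cache
};

class IndexReader {
 public:
  virtual ~IndexReader() = default;
  // Returns the index block, using the pinned copy if there is one and
  // otherwise going through the block cache (or the file) on demand.
  virtual CachableEntry<Block> GetOrReadIndexBlock() const = 0;

 protected:
  CachableEntry<Block> pinned_index_block_;  // set only when pinning is enabled
};

class TableReader {
 public:
  // Unconditionally owned; no dummy cache key or cache entry for the reader
  // object itself is needed anymore.
  std::unique_ptr<IndexReader> index_reader_;
};

int main() { TableReader t; return t.index_reader_ ? 1 : 0; }

The point of the sketch is the direction of ownership: the reader object never enters the block cache, only the pure-data index block does.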

Note: the change has two side effects:
1) Partitions of partitioned indexes no longer affect the read
amplification statistics.
2) Eviction statistics for index blocks are temporarily broken. We plan to fix
this in a separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5298

Differential Revision: D15303203

Pulled By: ltamasi

fbshipit-source-id: 935a69ba59d87d5e44f42e2310619b790c366e47
Branch: main
Author: Levi Tamasi (committed 5 years ago by Facebook Github Bot)
Commit: 1e35584251 (parent: bd44ec2006)
Changed files (number of changed lines):
  1. HISTORY.md (2)
  2. db/db_block_cache_test.cc (12)
  3. table/block_based_table_reader.cc (1085)
  4. table/block_based_table_reader.h (114)
  5. table/table_test.cc (67)

HISTORY.md:

@@ -2,6 +2,8 @@
 ## Unreleased
 ### Public API Change
 * Now DB::Close() will return Aborted() error when there is unreleased snapshot. Users can retry after all snapshots are released.
+* Partitions of partitioned indexes no longer affect the read amplification statistics.
+* Due to a refactoring, block cache eviction statistics for indexes are temporarily broken. We plan to reintroduce them in a later phase.
 ### New Features
 * Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature.

db/db_block_cache_test.cc:

@@ -365,7 +365,10 @@ TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) {
   ASSERT_EQ(cache->GetUsage(), index_bytes_insert + filter_bytes_insert);
   // set the cache capacity to the current usage
   cache->SetCapacity(index_bytes_insert + filter_bytes_insert);
-  ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT), 0);
+  // The index eviction statistics were broken by the refactoring that moved
+  // the index readers out of the block cache. Disabling these until we can
+  // bring the stats back.
+  // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT), 0);
   ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT), 0);
   // Note that the second key needs to be no longer than the first one.
   // Otherwise the second index block may not fit in cache.
@@ -377,8 +380,11 @@ TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) {
             index_bytes_insert);
   ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_INSERT),
             filter_bytes_insert);
-  ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT),
-            index_bytes_insert);
+  // The index eviction statistics were broken by the refactoring that moved
+  // the index readers out of the block cache. Disabling these until we can
+  // bring the stats back.
+  // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT),
+  //           index_bytes_insert);
   ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT),
             filter_bytes_insert);
 }

table/block_based_table_reader.cc: file diff suppressed because it is too large.

table/block_based_table_reader.h:

@@ -150,6 +150,8 @@ class BlockBasedTable : public TableReader {
   // be close to the file length.
   uint64_t ApproximateOffsetOf(const Slice& key) override;
 
+  bool TEST_BlockInCache(const BlockHandle& handle) const;
+
   // Returns true if the block for the specified key is in cache.
   // REQUIRES: key is in this table && block cache enabled
   bool TEST_KeyInCache(const ReadOptions& options, const Slice& key);
@@ -173,54 +175,35 @@ class BlockBasedTable : public TableReader {
   ~BlockBasedTable();
 
   bool TEST_filter_block_preloaded() const;
-  bool TEST_index_reader_preloaded() const;
+  bool TEST_IndexBlockInCache() const;
 
-  // IndexReader is the interface that provide the functionality for index
+  // IndexReader is the interface that provides the functionality for index
   // access.
   class IndexReader {
    public:
-    explicit IndexReader(const InternalKeyComparator* icomparator,
-                         Statistics* stats)
-        : icomparator_(icomparator), statistics_(stats) {}
-
-    virtual ~IndexReader() {}
-
-    // Create an iterator for index access.
-    // If iter is null then a new object is created on heap and the callee will
-    // have the ownership. If a non-null iter is passed in it will be used, and
-    // the returned value is either the same as iter or a new on-heap object
-    // that
-    // wrapps the passed iter. In the latter case the return value would point
-    // to
-    // a different object then iter and the callee has the ownership of the
+    virtual ~IndexReader() = default;
+
+    // Create an iterator for index access. If iter is null, then a new object
+    // is created on the heap, and the callee will have the ownership.
+    // If a non-null iter is passed in, it will be used, and the returned value
+    // is either the same as iter or a new on-heap object that
+    // wraps the passed iter. In the latter case the return value points
+    // to a different object then iter, and the callee has the ownership of the
     // returned object.
     virtual InternalIteratorBase<BlockHandle>* NewIterator(
-        IndexBlockIter* iter = nullptr, bool total_order_seek = true,
-        bool fill_cache = true) = 0;
-
-    // The size of the index.
-    virtual size_t size() const = 0;
-    // Memory usage of the index block
-    virtual size_t usable_size() const = 0;
-    // return the statistics pointer
-    virtual Statistics* statistics() const { return statistics_; }
+        const ReadOptions& read_options, bool disable_prefix_seek,
+        IndexBlockIter* iter, GetContext* get_context) = 0;
+
     // Report an approximation of how much memory has been used other than
-    // memory
-    // that was allocated in block cache.
+    // memory that was allocated in block cache.
    virtual size_t ApproximateMemoryUsage() const = 0;
 
-    virtual void CacheDependencies(bool /* unused */) {}
-
-    // Prefetch all the blocks referenced by this index to the buffer
-    void PrefetchBlocks(FilePrefetchBuffer* buf);
-
-   protected:
-    const InternalKeyComparator* icomparator_;
-
-   private:
-    Statistics* statistics_;
+    // Cache the dependencies of the index reader (e.g. the partitions
+    // of a partitioned index).
+    virtual void CacheDependencies(bool /* pin */) {}
   };
 
+  class IndexReaderCommon;
+
   static Slice GetCacheKey(const char* cache_key_prefix,
                            size_t cache_key_prefix_size,
                            const BlockHandle& handle, char* cache_key);
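As an illustration of the iterator-reuse convention documented in the NewIterator comment above, here is a tiny self-contained toy (hypothetical types, not the RocksDB classes; the real method additionally wires the iterator up to the index block and any cache handle):

// Toy illustration of the NewIterator reuse convention: a supplied iterator
// object is initialized in place and returned as-is, while a null argument
// results in a fresh heap allocation.
#include <cassert>

struct IndexBlockIterToy {
  bool initialized = false;
};

IndexBlockIterToy* NewIteratorToy(IndexBlockIterToy* iter) {
  IndexBlockIterToy* result = (iter != nullptr) ? iter : new IndexBlockIterToy();
  result->initialized = true;
  return result;
}

int main() {
  // Case 1: caller supplies an iterator object; it is reused, not replaced.
  IndexBlockIterToy preallocated;
  assert(NewIteratorToy(&preallocated) == &preallocated);

  // Case 2: no iterator supplied; a new heap object is returned and must be
  // freed by whoever ends up owning it (see the ownership note above).
  IndexBlockIterToy* fresh = NewIteratorToy(nullptr);
  assert(fresh->initialized);
  delete fresh;
  return 0;
}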
@@ -271,11 +254,22 @@ class BlockBasedTable : public TableReader {
   // in uncompressed block cache, also sets cache_handle to reference that
   // block.
   static Status MaybeReadBlockAndLoadToCache(
-      FilePrefetchBuffer* prefetch_buffer, Rep* rep, const ReadOptions& ro,
-      const BlockHandle& handle, const UncompressionDict& uncompression_dict,
+      FilePrefetchBuffer* prefetch_buffer, const Rep* rep,
+      const ReadOptions& ro, const BlockHandle& handle,
+      const UncompressionDict& uncompression_dict,
       CachableEntry<Block>* block_entry, bool is_index = false,
       GetContext* get_context = nullptr);
 
+  // Similar to the above, with one crucial difference: it will retrieve the
+  // block from the file even if there are no caches configured (assuming the
+  // read options allow I/O).
+  static Status RetrieveBlock(
+      FilePrefetchBuffer* prefetch_buffer, const Rep* rep,
+      const ReadOptions& ro, const BlockHandle& handle,
+      const UncompressionDict& uncompression_dict,
+      CachableEntry<Block>* block_entry, bool is_index,
+      GetContext* get_context);
+
   // For the following two functions:
   // if `no_io == true`, we will not try to read filter/index from sst file
   // were they not present in cache yet.
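The comment on RetrieveBlock above describes its key property: unlike MaybeReadBlockAndLoadToCache, it falls back to a direct file read when no block cache is configured. A rough, self-contained sketch of that control flow (toy types and helper names, not the actual implementation) could look like this:

// Toy sketch of the RetrieveBlock-style fallback described above.
#include <string>

struct Block {};
struct Status { bool ok; std::string msg; };
struct ReadOptions { bool no_io; };  // stand-in for RocksDB's read-tier check

// Stand-ins for the real lookups; in RocksDB these would consult the block
// cache and the SST file, respectively.
static Status LookupOrLoadViaCache(Block** out) { *out = new Block(); return Status{true, ""}; }
static Status ReadBlockFromFile(Block** out) { *out = new Block(); return Status{true, ""}; }

static Status RetrieveBlockSketch(bool have_block_cache, const ReadOptions& ro,
                                  Block** out) {
  if (have_block_cache) {
    // Preferred path: serve the block from (or insert it into) the cache.
    return LookupOrLoadViaCache(out);
  }
  if (ro.no_io) {
    // No cache and the read options forbid I/O: report an incomplete read.
    return Status{false, "Incomplete: no block cache and no_io is set"};
  }
  // The crucial difference vs. MaybeReadBlockAndLoadToCache: with no caches
  // configured, read the block straight from the file.
  return ReadBlockFromFile(out);
}

int main() {
  Block* block = nullptr;
  Status s = RetrieveBlockSketch(/*have_block_cache=*/false, ReadOptions{false}, &block);
  delete block;
  return s.ok ? 0 : 1;
}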
@@ -305,7 +299,6 @@ class BlockBasedTable : public TableReader {
   InternalIteratorBase<BlockHandle>* NewIndexIterator(
       const ReadOptions& read_options, bool need_upper_bound_check = false,
       IndexBlockIter* input_iter = nullptr,
-      CachableEntry<IndexReader>* index_entry = nullptr,
       GetContext* get_context = nullptr);
 
   // Read block cache from block caches (if set): block_cache and
@@ -316,7 +309,7 @@ class BlockBasedTable : public TableReader {
   // dictionary.
   static Status GetDataBlockFromCache(
       const Slice& block_cache_key, const Slice& compressed_block_cache_key,
-      Cache* block_cache, Cache* block_cache_compressed, Rep* rep,
+      Cache* block_cache, Cache* block_cache_compressed, const Rep* rep,
       const ReadOptions& read_options,
       CachableEntry<Block>* block, const UncompressionDict& uncompression_dict,
       size_t read_amp_bytes_per_bit, bool is_index = false,
@@ -359,9 +352,9 @@ class BlockBasedTable : public TableReader {
   // need to access extra meta blocks for index construction. This parameter
   // helps avoid re-reading meta index block if caller already created one.
   Status CreateIndexReader(
-      FilePrefetchBuffer* prefetch_buffer, IndexReader** index_reader,
-      InternalIterator* preloaded_meta_index_iter = nullptr,
-      const int level = -1);
+      FilePrefetchBuffer* prefetch_buffer,
+      InternalIterator* preloaded_meta_index_iter, bool use_cache,
+      bool prefetch, bool pin, IndexReader** index_reader);
 
   bool FullFilterKeyMayMatch(
       const ReadOptions& read_options, FilterBlockReader* filter,
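The new use_cache/prefetch/pin parameters make the caching policy explicit at the point where the IndexReader is created. The sketch below shows one plausible way such flags could be derived from the table options named elsewhere in this diff; it is an illustration only, and the actual rules in PrefetchIndexAndFilterBlocks also take the level, pin_top_level_index_and_filter, and other settings into account.

// Hypothetical derivation of CreateIndexReader's use_cache/prefetch/pin flags.
// Option names mirror BlockBasedTableOptions; the logic itself is a sketch.
struct TableOptionsSketch {
  bool cache_index_and_filter_blocks = false;
  bool pin_l0_filter_and_index_blocks_in_cache = false;
  // pin_top_level_index_and_filter (partitioned indexes) is omitted here.
};

struct IndexReaderFlags {
  bool use_cache;  // put the index block in the block cache?
  bool prefetch;   // read the index block eagerly at table-open time?
  bool pin;        // keep the block resident for the reader's lifetime?
};

IndexReaderFlags DeriveFlagsSketch(const TableOptionsSketch& opts,
                                   bool prefetch_index_and_filter_in_cache,
                                   int level) {
  IndexReaderFlags f;
  // Only involve the block cache if the user opted into caching index blocks.
  f.use_cache = opts.cache_index_and_filter_blocks;
  // Warm the block up front either when it is not cache-managed at all or
  // when the caller asked for index/filter prefetching.
  f.prefetch = !f.use_cache || prefetch_index_and_filter_in_cache;
  // Pin when the block is not cache-managed, or for L0 files when the
  // corresponding pinning option is set.
  f.pin = !f.use_cache ||
          (level == 0 && opts.pin_l0_filter_and_index_blocks_in_cache);
  return f;
}

int main() {
  TableOptionsSketch opts;
  IndexReaderFlags f = DeriveFlagsSketch(opts, /*prefetch=*/true, /*level=*/0);
  return f.pin ? 0 : 1;
}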
@@ -398,9 +391,8 @@ class BlockBasedTable : public TableReader {
   static Status PrefetchIndexAndFilterBlocks(
       Rep* rep, FilePrefetchBuffer* prefetch_buffer,
       InternalIterator* meta_iter, BlockBasedTable* new_table,
-      const SliceTransform* prefix_extractor, bool prefetch_all,
-      const BlockBasedTableOptions& table_options, const int level,
-      const bool prefetch_index_and_filter_in_cache);
+      bool prefetch_all, const BlockBasedTableOptions& table_options,
+      const int level);
 
   Status VerifyChecksumInMetaBlocks(InternalIteratorBase<Slice>* index_iter);
   Status VerifyChecksumInBlocks(InternalIteratorBase<BlockHandle>* index_iter);
@@ -411,7 +403,7 @@ class BlockBasedTable : public TableReader {
       const bool is_a_filter_partition,
       const SliceTransform* prefix_extractor = nullptr) const;
 
-  static void SetupCacheKeyPrefix(Rep* rep, uint64_t file_size);
+  static void SetupCacheKeyPrefix(Rep* rep);
 
   // Generate a cache key prefix from the file
   static void GenerateCachePrefix(Cache* cc, RandomAccessFile* file,
@@ -486,18 +478,21 @@ struct BlockBasedTable::Rep {
   size_t persistent_cache_key_prefix_size = 0;
   char compressed_cache_key_prefix[kMaxCacheKeyPrefixSize];
   size_t compressed_cache_key_prefix_size = 0;
-  uint64_t dummy_index_reader_offset =
-      0;  // ID that is unique for the block cache.
   PersistentCacheOptions persistent_cache_options;
 
   // Footer contains the fixed table information
   Footer footer;
-  // `index_reader`, `filter`, and `uncompression_dict` will be populated (i.e.,
-  // non-nullptr) and used only when options.block_cache is nullptr or when
-  // `cache_index_and_filter_blocks == false`. Otherwise, we will get the index,
-  // filter, and compression dictionary blocks via the block cache. In that case
-  // `dummy_index_reader_offset`, `filter_handle`, and `compression_dict_handle`
-  // are used to lookup these meta-blocks in block cache.
+  // `filter` and `uncompression_dict` will be populated (i.e., non-nullptr)
+  // and used only when options.block_cache is nullptr or when
+  // `cache_index_and_filter_blocks == false`. Otherwise, we will get the
+  // filter and compression dictionary blocks via the block cache. In that case,
+  // `filter_handle`, and `compression_dict_handle` are used to lookup these
+  // meta-blocks in block cache.
+  //
+  // Note: the IndexReader object is always stored in this member variable;
+  // the index block itself, however, may or may not be in the block cache
+  // based on the settings above. We plan to change the handling of the
+  // filter and compression dictionary similarly.
   std::unique_ptr<IndexReader> index_reader;
   std::unique_ptr<FilterBlockReader> filter;
   std::unique_ptr<UncompressionDict> uncompression_dict;
@@ -526,12 +521,11 @@ struct BlockBasedTable::Rep {
 
   // only used in level 0 files when pin_l0_filter_and_index_blocks_in_cache is
   // true or in all levels when pin_top_level_index_and_filter is set in
-  // combination with partitioned index/filters: then we do use the LRU cache,
-  // but we always keep the filter & index block's handle checked out here (=we
+  // combination with partitioned filters: then we do use the LRU cache,
+  // but we always keep the filter block's handle checked out here (=we
   // don't call Release()), plus the parsed out objects the LRU cache will never
   // push flush them out, hence they're pinned
   CachableEntry<FilterBlockReader> filter_entry;
-  CachableEntry<IndexReader> index_entry;
   std::shared_ptr<const FragmentedRangeTombstoneList> fragmented_range_dels;
 
   // If global_seqno is used, all Keys in this file will have the same

table/table_test.cc:

@@ -1993,7 +1993,7 @@ TEST_P(BlockBasedTableTest, BlockCacheDisabledTest) {
   // preloading filter/index blocks is enabled.
   auto reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader());
   ASSERT_TRUE(reader->TEST_filter_block_preloaded());
-  ASSERT_TRUE(reader->TEST_index_reader_preloaded());
+  ASSERT_FALSE(reader->TEST_IndexBlockInCache());
 
   {
     // nothing happens in the beginning
@@ -2040,7 +2040,7 @@ TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) {
   // preloading filter/index blocks is prohibited.
   auto* reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader());
   ASSERT_TRUE(!reader->TEST_filter_block_preloaded());
-  ASSERT_TRUE(!reader->TEST_index_reader_preloaded());
+  ASSERT_TRUE(reader->TEST_IndexBlockInCache());
 
   // -- PART 1: Open with regular block cache.
   // Since block_cache is disabled, no cache activities will be involved.
@@ -2612,69 +2612,6 @@ TEST_P(BlockBasedTableTest, MemoryAllocator) {
   EXPECT_GT(custom_memory_allocator->numAllocations.load(), 0);
 }
 
-TEST_P(BlockBasedTableTest, NewIndexIteratorLeak) {
-  // A regression test to avoid data race described in
-  // https://github.com/facebook/rocksdb/issues/1267
-  TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
-  std::vector<std::string> keys;
-  stl_wrappers::KVMap kvmap;
-  c.Add("a1", "val1");
-  Options options;
-  options.prefix_extractor.reset(NewFixedPrefixTransform(1));
-  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
-  table_options.index_type = BlockBasedTableOptions::kHashSearch;
-  table_options.cache_index_and_filter_blocks = true;
-  table_options.block_cache = NewLRUCache(0);
-  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-  const ImmutableCFOptions ioptions(options);
-  const MutableCFOptions moptions(options);
-  c.Finish(options, ioptions, moptions, table_options,
-           GetPlainInternalComparator(options.comparator), &keys, &kvmap);
-
-  rocksdb::SyncPoint::GetInstance()->LoadDependencyAndMarkers(
-      {
-          {"BlockBasedTable::NewIndexIterator::thread1:1",
-           "BlockBasedTable::NewIndexIterator::thread2:2"},
-          {"BlockBasedTable::NewIndexIterator::thread2:3",
-           "BlockBasedTable::NewIndexIterator::thread1:4"},
-      },
-      {
-          {"BlockBasedTableTest::NewIndexIteratorLeak:Thread1Marker",
-           "BlockBasedTable::NewIndexIterator::thread1:1"},
-          {"BlockBasedTableTest::NewIndexIteratorLeak:Thread1Marker",
-           "BlockBasedTable::NewIndexIterator::thread1:4"},
-          {"BlockBasedTableTest::NewIndexIteratorLeak:Thread2Marker",
-           "BlockBasedTable::NewIndexIterator::thread2:2"},
-          {"BlockBasedTableTest::NewIndexIteratorLeak:Thread2Marker",
-           "BlockBasedTable::NewIndexIterator::thread2:3"},
-      });
-
-  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
-  ReadOptions ro;
-  auto* reader = c.GetTableReader();
-  std::function<void()> func1 = [&]() {
-    TEST_SYNC_POINT("BlockBasedTableTest::NewIndexIteratorLeak:Thread1Marker");
-    // TODO(Zhongyi): update test to use MutableCFOptions
-    std::unique_ptr<InternalIterator> iter(
-        reader->NewIterator(ro, moptions.prefix_extractor.get()));
-    iter->Seek(InternalKey("a1", 0, kTypeValue).Encode());
-  };
-
-  std::function<void()> func2 = [&]() {
-    TEST_SYNC_POINT("BlockBasedTableTest::NewIndexIteratorLeak:Thread2Marker");
-    std::unique_ptr<InternalIterator> iter(
-        reader->NewIterator(ro, moptions.prefix_extractor.get()));
-  };
-
-  auto thread1 = port::Thread(func1);
-  auto thread2 = port::Thread(func2);
-  thread1.join();
-  thread2.join();
-  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
-  c.ResetTableReader();
-}
-
 // Plain table is not supported in ROCKSDB_LITE
 #ifndef ROCKSDB_LITE
 TEST_F(PlainTableTest, BasicPlainTableProperties) {
