Insert range deletion meta-block into block cache

Summary:
This handles two issues: (1) a range deletion iterator sometimes outlives
the table reader that created it, in which case the underlying block must not be
destroyed during table reader destruction; and (2) we prefer to read these range
tombstone meta-blocks from file fewer times.

- Extracted the cache-populating logic from NewDataBlockIterator() into a separate function: MaybeLoadDataBlockToCache()
- Used MaybeLoadDataBlockToCache() to load the range deletion meta-block and pin it through the reader's lifetime. This code reuse works since the range deletion meta-block has the same format as data blocks.
- Used NewDataBlockIterator() to create range deletion iterators, which uses the block cache if enabled and otherwise reads the block from file. Either way, the underlying block won't disappear until after the iterator is destroyed (see the ownership sketch below).
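
To make the ownership model concrete, below is a minimal, self-contained sketch of the idea using toy types (BlockIterator, TableReader, and RegisterCleanup here are stand-ins, not the RocksDB classes): the iterator registers a cleanup that releases its own reference to the block, either a cache handle or an owned pointer, so the block outlives the table reader that created it.

// Toy model of the ownership rule described above: the iterator, not the
// table reader, is what keeps the underlying block alive.
#include <cassert>
#include <functional>
#include <memory>
#include <vector>

struct Block {
  std::vector<int> entries;  // stand-in for range tombstone entries
};

class BlockIterator {
 public:
  explicit BlockIterator(const Block* block) : block_(block) {}
  ~BlockIterator() {
    for (auto& cleanup : cleanups_) cleanup();  // release block reference(s)
  }
  // Mirrors the RegisterCleanup() idea: defer resource release until the
  // iterator itself is destroyed.
  void RegisterCleanup(std::function<void()> cleanup) {
    cleanups_.push_back(std::move(cleanup));
  }
  size_t size() const { return block_->entries.size(); }

 private:
  const Block* block_;
  std::vector<std::function<void()>> cleanups_;
};

class TableReader {
 public:
  // With no block cache, the block is read from "file" and ownership is
  // handed to the iterator via a cleanup; with a cache this would be a
  // cache-handle release instead.
  std::unique_ptr<BlockIterator> NewRangeTombstoneIterator() {
    auto* block = new Block{{1, 2}};  // pretend this was read from the file
    auto iter = std::make_unique<BlockIterator>(block);
    iter->RegisterCleanup([block] { delete block; });
    return iter;
  }
};

int main() {
  std::unique_ptr<BlockIterator> iter;
  {
    TableReader reader;
    iter = reader.NewRangeTombstoneIterator();
  }  // reader destroyed here
  assert(iter->size() == 2);  // the iterator can still read the block
  return 0;
}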
Closes https://github.com/facebook/rocksdb/pull/1459

Differential Revision: D4123175

Pulled By: ajkr

fbshipit-source-id: 8f64281
Author: Andrew Kryczka (committed by Facebook Github Bot)
Parent: 9e7cf3469b
Commit: 815f54afad

6 changed files:
  table/block_based_table_reader.cc  153
  table/block_based_table_reader.h    12
  table/format.cc                      3
  table/format.h                       7
  table/table_properties.cc            3
  table/table_test.cc                 38

@@ -371,7 +371,7 @@ struct BlockBasedTable::Rep {
filter_type(FilterType::kNoFilter),
whole_key_filtering(_table_opt.whole_key_filtering),
prefix_filtering(true),
range_del_block(nullptr),
range_del_handle(BlockHandle::NullBlockHandle()),
global_seqno(kDisableGlobalSequenceNumber) {}
const ImmutableCFOptions& ioptions;
@@ -430,7 +430,10 @@ struct BlockBasedTable::Rep {
// the LRU cache will never push flush them out, hence they're pinned
CachableEntry<FilterBlockReader> filter_entry;
CachableEntry<IndexReader> index_entry;
unique_ptr<Block> range_del_block;
// range deletion meta-block is pinned through reader's lifetime when LRU
// cache is enabled.
CachableEntry<Block> range_del_entry;
BlockHandle range_del_handle;
// If global_seqno is used, all Keys in this file will have the same
// seqno with value `global_seqno`.
@@ -702,29 +705,23 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
}
// Read the range del meta block
// TODO(wanning&andrewkr): cache range delete tombstone block
bool found_range_del_block;
BlockHandle range_del_handle;
s = SeekToRangeDelBlock(meta_iter.get(), &found_range_del_block,
&range_del_handle);
&rep->range_del_handle);
if (!s.ok()) {
Log(InfoLogLevel::WARN_LEVEL, rep->ioptions.info_log,
"Error when seeking to range delete tombstones block from file: %s",
s.ToString().c_str());
} else {
if (found_range_del_block && !range_del_handle.IsNull()) {
BlockContents range_del_block_contents;
if (found_range_del_block && !rep->range_del_handle.IsNull()) {
ReadOptions read_options;
s = ReadBlockContents(rep->file.get(), rep->footer, read_options,
range_del_handle, &range_del_block_contents,
rep->ioptions, false /* decompressed */);
s = MaybeLoadDataBlockToCache(rep, read_options, rep->range_del_handle,
Slice() /* compression_dict */,
&rep->range_del_entry);
if (!s.ok()) {
Log(InfoLogLevel::WARN_LEVEL, rep->ioptions.info_log,
"Encountered error while reading data from range del block %s",
s.ToString().c_str());
} else {
rep->range_del_block.reset(new Block(
std::move(range_del_block_contents), kDisableGlobalSequenceNumber));
}
}
}
@@ -1241,30 +1238,74 @@ InternalIterator* BlockBasedTable::NewDataBlockIterator(
const bool no_io = (ro.read_tier == kBlockCacheTier);
Cache* block_cache = rep->table_options.block_cache.get();
Cache* block_cache_compressed =
rep->table_options.block_cache_compressed.get();
CachableEntry<Block> block;
BlockHandle handle;
Slice input = index_value;
// We intentionally allow extra stuff in index_value so that we
// can add more features in the future.
Status s = handle.DecodeFrom(&input);
Slice compression_dict;
if (s.ok()) {
if (rep->compression_dict_block) {
compression_dict = rep->compression_dict_block->data;
}
s = MaybeLoadDataBlockToCache(rep, ro, handle, compression_dict, &block);
}
if (!s.ok()) {
// Didn't get any data from block caches.
if (s.ok() && block.value == nullptr) {
if (no_io) {
// Could not read from block_cache and can't do IO
if (input_iter != nullptr) {
input_iter->SetStatus(Status::Incomplete("no blocking io"));
return input_iter;
} else {
return NewErrorInternalIterator(Status::Incomplete("no blocking io"));
}
}
std::unique_ptr<Block> block_value;
s = ReadBlockFromFile(
rep->file.get(), rep->footer, ro, handle, &block_value, rep->ioptions,
true /* compress */, compression_dict, rep->persistent_cache_options,
rep->global_seqno, rep->table_options.read_amp_bytes_per_bit);
if (s.ok()) {
block.value = block_value.release();
}
}
InternalIterator* iter;
if (s.ok()) {
assert(block.value != nullptr);
iter = block.value->NewIterator(&rep->internal_comparator, input_iter, true,
rep->ioptions.statistics);
if (block.cache_handle != nullptr) {
iter->RegisterCleanup(&ReleaseCachedEntry, block_cache,
block.cache_handle);
} else {
iter->RegisterCleanup(&DeleteHeldResource<Block>, block.value, nullptr);
}
} else {
assert(block.value == nullptr);
if (input_iter != nullptr) {
input_iter->SetStatus(s);
return input_iter;
iter = input_iter;
} else {
return NewErrorInternalIterator(s);
iter = NewErrorInternalIterator(s);
}
}
return iter;
}
Status BlockBasedTable::MaybeLoadDataBlockToCache(
Rep* rep, const ReadOptions& ro, const BlockHandle& handle,
Slice compression_dict, CachableEntry<Block>* block_entry) {
const bool no_io = (ro.read_tier == kBlockCacheTier);
Cache* block_cache = rep->table_options.block_cache.get();
Cache* block_cache_compressed =
rep->table_options.block_cache_compressed.get();
Slice compression_dict;
if (rep->compression_dict_block) {
compression_dict = rep->compression_dict_block->data;
}
// If either block cache is enabled, we'll try to read from it.
Status s;
if (block_cache != nullptr || block_cache_compressed != nullptr) {
Statistics* statistics = rep->ioptions.statistics;
char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
@@ -1286,10 +1327,10 @@ InternalIterator* BlockBasedTable::NewDataBlockIterator(
s = GetDataBlockFromCache(
key, ckey, block_cache, block_cache_compressed, rep->ioptions, ro,
&block, rep->table_options.format_version, compression_dict,
block_entry, rep->table_options.format_version, compression_dict,
rep->table_options.read_amp_bytes_per_bit);
if (block.value == nullptr && !no_io && ro.fill_cache) {
if (block_entry->value == nullptr && !no_io && ro.fill_cache) {
std::unique_ptr<Block> raw_block;
{
StopWatch sw(rep->ioptions.env, statistics, READ_BLOCK_GET_MICROS);
@@ -1303,54 +1344,12 @@ InternalIterator* BlockBasedTable::NewDataBlockIterator(
if (s.ok()) {
s = PutDataBlockToCache(
key, ckey, block_cache, block_cache_compressed, ro, rep->ioptions,
&block, raw_block.release(), rep->table_options.format_version,
block_entry, raw_block.release(), rep->table_options.format_version,
compression_dict, rep->table_options.read_amp_bytes_per_bit);
}
}
}
// Didn't get any data from block caches.
if (s.ok() && block.value == nullptr) {
if (no_io) {
// Could not read from block_cache and can't do IO
if (input_iter != nullptr) {
input_iter->SetStatus(Status::Incomplete("no blocking io"));
return input_iter;
} else {
return NewErrorInternalIterator(Status::Incomplete("no blocking io"));
}
}
std::unique_ptr<Block> block_value;
s = ReadBlockFromFile(
rep->file.get(), rep->footer, ro, handle, &block_value, rep->ioptions,
true /* compress */, compression_dict, rep->persistent_cache_options,
rep->global_seqno, rep->table_options.read_amp_bytes_per_bit);
if (s.ok()) {
block.value = block_value.release();
}
}
InternalIterator* iter;
if (s.ok()) {
assert(block.value != nullptr);
iter = block.value->NewIterator(&rep->internal_comparator, input_iter, true,
rep->ioptions.statistics);
if (block.cache_handle != nullptr) {
iter->RegisterCleanup(&ReleaseCachedEntry, block_cache,
block.cache_handle);
} else {
iter->RegisterCleanup(&DeleteHeldResource<Block>, block.value, nullptr);
}
} else {
assert(block.value == nullptr);
if (input_iter != nullptr) {
input_iter->SetStatus(s);
iter = input_iter;
} else {
iter = NewErrorInternalIterator(s);
}
}
return iter;
return s;
}
class BlockBasedTable::BlockEntryIteratorState : public TwoLevelIteratorState {
@@ -1489,12 +1488,15 @@ InternalIterator* BlockBasedTable::NewIterator(const ReadOptions& read_options,
InternalIterator* BlockBasedTable::NewRangeTombstoneIterator(
const ReadOptions& read_options) {
if (rep_->range_del_block.get() != nullptr) {
auto iter =
rep_->range_del_block->NewIterator(&(rep_->internal_comparator));
return iter;
}
return NewEmptyInternalIterator();
if (rep_->range_del_handle.IsNull()) {
return NewEmptyInternalIterator();
}
std::string str;
rep_->range_del_handle.EncodeTo(&str);
// Even though range_del_entry already references the meta-block when block
// cache is enabled, we still call the below function to get another reference
// since the caller may need the iterator beyond this table reader's lifetime.
return NewDataBlockIterator(rep_, read_options, Slice(str));
}
bool BlockBasedTable::FullFilterKeyMayMatch(const ReadOptions& read_options,
@@ -1968,6 +1970,7 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) {
void BlockBasedTable::Close() {
rep_->filter_entry.Release(rep_->table_options.block_cache.get());
rep_->index_entry.Release(rep_->table_options.block_cache.get());
rep_->range_del_entry.Release(rep_->table_options.block_cache.get());
// cleanup index and filter blocks to avoid accessing dangling pointer
if (!rep_->table_options.no_block_cache) {
char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
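
The Close() hunk above releases the pinned range-del entry along with the filter and index entries. As a rough illustration of this pinning pattern (using std::shared_ptr as a stand-in for a block-cache handle; this is not the CachableEntry/Cache API, just the lifetime idea):

// Toy lifetime sketch: the reader pins the range-del block while open and
// releases the pin in Close(); independent references handed to callers
// remain valid afterwards.
#include <memory>
#include <string>

struct Block {
  std::string contents;
};

class ToyReader {
 public:
  void Open() {
    // Hold a reference so the block stays resident for the reader's lifetime.
    pinned_range_del_block_ = std::make_shared<Block>(Block{"tombstones"});
  }
  std::shared_ptr<Block> NewRangeTombstoneBlockRef() const {
    return pinned_range_del_block_;  // caller gets its own reference
  }
  void Close() { pinned_range_del_block_.reset(); }  // drop the pin

 private:
  std::shared_ptr<Block> pinned_range_del_block_;
};

int main() {
  std::shared_ptr<Block> ref;
  {
    ToyReader reader;
    reader.Open();
    ref = reader.NewRangeTombstoneBlockRef();
    reader.Close();  // the reader's pin is gone...
  }
  return ref->contents.empty() ? 1 : 0;  // ...but our reference keeps it alive
}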

@@ -162,6 +162,18 @@ class BlockBasedTable : public TableReader {
static InternalIterator* NewDataBlockIterator(
Rep* rep, const ReadOptions& ro, const Slice& index_value,
BlockIter* input_iter = nullptr);
// If block cache enabled (compressed or uncompressed), looks for the block
// identified by handle in (1) uncompressed cache, (2) compressed cache, and
// then (3) file. If found, inserts into the cache(s) that were searched
// unsuccessfully (e.g., if found in file, will add to both uncompressed and
// compressed caches if they're enabled).
//
// @param block_entry value is set to the uncompressed block if found. If
// in uncompressed block cache, also sets cache_handle to reference that
// block.
static Status MaybeLoadDataBlockToCache(
Rep* rep, const ReadOptions& ro, const BlockHandle& handle,
Slice compression_dict, CachableEntry<Block>* block_entry);
// For the following two functions:
// if `no_io == true`, we will not try to read filter/index from sst file
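
The lookup-and-fill order this comment describes could be sketched roughly as follows. The helper names (MaybeLoadBlock, ReadAndUncompressFromFile, Uncompress) and the std::map "caches" are hypothetical stand-ins; the real code goes through GetDataBlockFromCache() and PutDataBlockToCache() against the rocksdb::Cache interface.

// Hypothetical sketch of the (1) uncompressed cache, (2) compressed cache,
// (3) file lookup order, with backfill of whichever caches missed.
#include <cstdint>
#include <map>
#include <string>

using BlockData = std::string;
static std::map<uint64_t, BlockData> uncompressed_cache;
static std::map<uint64_t, BlockData> compressed_cache;

// Stub helpers standing in for a real file read and decompression.
BlockData ReadAndUncompressFromFile(uint64_t handle) {
  return "block-" + std::to_string(handle);
}
BlockData Uncompress(const BlockData& compressed) { return compressed; }

BlockData MaybeLoadBlock(uint64_t handle) {
  if (auto it = uncompressed_cache.find(handle);
      it != uncompressed_cache.end()) {
    return it->second;  // (1) hit in the uncompressed cache
  }
  if (auto it = compressed_cache.find(handle); it != compressed_cache.end()) {
    BlockData block = Uncompress(it->second);   // (2) hit in compressed cache
    uncompressed_cache.emplace(handle, block);  // backfill uncompressed cache
    return block;
  }
  BlockData block = ReadAndUncompressFromFile(handle);  // (3) read from file
  uncompressed_cache.emplace(handle, block);
  // In this sketch the "compressed" cache also stores the uncompressed bytes;
  // the real code caches the compressed raw block instead.
  compressed_cache.emplace(handle, block);
  return block;
}

int main() {
  MaybeLoadBlock(42);  // miss everywhere: read from "file", fill both caches
  return MaybeLoadBlock(42) == "block-42" ? 0 : 1;  // now served from cache
}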

@@ -59,6 +59,9 @@ Status BlockHandle::DecodeFrom(Slice* input) {
GetVarint64(input, &size_)) {
return Status::OK();
} else {
// reset in case failure after partially decoding
offset_ = 0;
size_ = 0;
return Status::Corruption("bad block handle");
}
}

@@ -65,8 +65,8 @@ class BlockHandle {
enum { kMaxEncodedLength = 10 + 10 };
private:
uint64_t offset_ = 0;
uint64_t size_ = 0;
uint64_t offset_;
uint64_t size_;
static const BlockHandle kNullBlockHandle;
};
@@ -242,6 +242,9 @@ extern Status UncompressBlockContentsForCompressionType(
// Implementation details follow. Clients should ignore,
// TODO(andrewkr): we should prefer one way of representing a null/uninitialized
// BlockHandle. Currently we use zeros for null and use negation-of-zeros for
// uninitialized.
inline BlockHandle::BlockHandle()
: BlockHandle(~static_cast<uint64_t>(0),
~static_cast<uint64_t>(0)) {
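
Taken together, the format.cc and format.h hunks establish the convention the TODO above mentions: a default-constructed handle is "uninitialized" (all bits set), the null handle is all zeros, and a failed DecodeFrom() resets the handle to zeros so callers that only check IsNull() never see a half-decoded value. A toy illustration (not the real BlockHandle):

// Toy stand-in illustrating the null/uninitialized convention and the
// reset-on-decode-failure behavior.
#include <cassert>
#include <cstdint>
#include <string>

class ToyBlockHandle {
 public:
  // Default-constructed: "uninitialized" (negation of zero).
  ToyBlockHandle() : offset_(~uint64_t{0}), size_(~uint64_t{0}) {}
  ToyBlockHandle(uint64_t offset, uint64_t size)
      : offset_(offset), size_(size) {}

  static ToyBlockHandle NullHandle() { return ToyBlockHandle(0, 0); }
  bool IsNull() const { return offset_ == 0 && size_ == 0; }

  // On a decode failure, reset to the null state so callers that only check
  // IsNull() don't see a half-decoded handle (mirrors the format.cc change).
  bool DecodeFrom(const std::string& input) {
    if (input.size() < 2) {  // pretend decoding failed
      offset_ = 0;
      size_ = 0;
      return false;
    }
    offset_ = static_cast<uint64_t>(input[0]);
    size_ = static_cast<uint64_t>(input[1]);
    return true;
  }

 private:
  uint64_t offset_;
  uint64_t size_;
};

int main() {
  ToyBlockHandle h;            // uninitialized: ~0, ~0
  assert(!h.IsNull());
  assert(!h.DecodeFrom("x"));  // decode fails...
  assert(h.IsNull());          // ...and the handle now reads as null
  ToyBlockHandle ok;
  assert(ok.DecodeFrom("\x05\x0a"));
  assert(!ok.IsNull());
  return 0;
}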

@@ -47,6 +47,9 @@ namespace {
Status SeekToMetaBlock(InternalIterator* meta_iter,
const std::string& block_name, bool* is_found,
BlockHandle* block_handle = nullptr) {
if (block_handle != nullptr) {
*block_handle = BlockHandle::NullBlockHandle();
}
*is_found = true;
meta_iter->Seek(block_name);
if (meta_iter->status().ok()) {

@@ -1177,23 +1177,29 @@ TEST_F(BlockBasedTableTest, RangeDelBlock) {
c.Finish(options, ioptions, table_options, *internal_cmp, &sorted_keys,
&kvmap);
std::unique_ptr<InternalIterator> iter(
c.GetTableReader()->NewRangeTombstoneIterator(ReadOptions()));
ASSERT_EQ(false, iter->Valid());
iter->SeekToFirst();
ASSERT_EQ(true, iter->Valid());
for (int i = 0; i < 2; i++) {
ASSERT_TRUE(iter->Valid());
ParsedInternalKey parsed_key;
ASSERT_TRUE(ParseInternalKey(iter->key(), &parsed_key));
RangeTombstone t(parsed_key, iter->value());
ASSERT_EQ(t.start_key_, keys[i]);
ASSERT_EQ(t.end_key_, vals[i]);
ASSERT_EQ(t.seq_, i);
iter->Next();
for (int j = 0; j < 2; ++j) {
std::unique_ptr<InternalIterator> iter(
c.GetTableReader()->NewRangeTombstoneIterator(ReadOptions()));
if (j > 0) {
// For second iteration, delete the table reader object and verify the
// iterator can still access its metablock's range tombstones.
c.ResetTableReader();
}
ASSERT_EQ(false, iter->Valid());
iter->SeekToFirst();
ASSERT_EQ(true, iter->Valid());
for (int i = 0; i < 2; i++) {
ASSERT_TRUE(iter->Valid());
ParsedInternalKey parsed_key;
ASSERT_TRUE(ParseInternalKey(iter->key(), &parsed_key));
RangeTombstone t(parsed_key, iter->value());
ASSERT_EQ(t.start_key_, keys[i]);
ASSERT_EQ(t.end_key_, vals[i]);
ASSERT_EQ(t.seq_, i);
iter->Next();
}
ASSERT_TRUE(!iter->Valid());
}
ASSERT_TRUE(!iter->Valid());
c.ResetTableReader();
}
TEST_F(BlockBasedTableTest, FilterPolicyNameProperties) {
