diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc
index e24fd4056..f6a84b3a8 100644
--- a/table/block_based_table_reader.cc
+++ b/table/block_based_table_reader.cc
@@ -371,7 +371,7 @@ struct BlockBasedTable::Rep {
         filter_type(FilterType::kNoFilter),
         whole_key_filtering(_table_opt.whole_key_filtering),
         prefix_filtering(true),
-        range_del_block(nullptr),
+        range_del_handle(BlockHandle::NullBlockHandle()),
         global_seqno(kDisableGlobalSequenceNumber) {}
 
   const ImmutableCFOptions& ioptions;
@@ -430,7 +430,10 @@ struct BlockBasedTable::Rep {
   // the LRU cache will never push flush them out, hence they're pinned
   CachableEntry<FilterBlockReader> filter_entry;
   CachableEntry<IndexReader> index_entry;
-  unique_ptr<Block> range_del_block;
+  // range deletion meta-block is pinned through reader's lifetime when LRU
+  // cache is enabled.
+  CachableEntry<Block> range_del_entry;
+  BlockHandle range_del_handle;
 
   // If global_seqno is used, all Keys in this file will have the same
   // seqno with value `global_seqno`.
@@ -702,29 +705,23 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
   }
 
   // Read the range del meta block
-  // TODO(wanning&andrewkr): cache range delete tombstone block
   bool found_range_del_block;
-  BlockHandle range_del_handle;
   s = SeekToRangeDelBlock(meta_iter.get(), &found_range_del_block,
-                          &range_del_handle);
+                          &rep->range_del_handle);
   if (!s.ok()) {
     Log(InfoLogLevel::WARN_LEVEL, rep->ioptions.info_log,
         "Error when seeking to range delete tombstones block from file: %s",
         s.ToString().c_str());
   } else {
-    if (found_range_del_block && !range_del_handle.IsNull()) {
-      BlockContents range_del_block_contents;
+    if (found_range_del_block && !rep->range_del_handle.IsNull()) {
       ReadOptions read_options;
-      s = ReadBlockContents(rep->file.get(), rep->footer, read_options,
-                            range_del_handle, &range_del_block_contents,
-                            rep->ioptions, false /* decompressed */);
+      s = MaybeLoadDataBlockToCache(rep, read_options, rep->range_del_handle,
+                                    Slice() /* compression_dict */,
+                                    &rep->range_del_entry);
       if (!s.ok()) {
         Log(InfoLogLevel::WARN_LEVEL, rep->ioptions.info_log,
             "Encountered error while reading data from range del block %s",
             s.ToString().c_str());
-      } else {
-        rep->range_del_block.reset(new Block(
-            std::move(range_del_block_contents), kDisableGlobalSequenceNumber));
       }
     }
   }
@@ -1241,30 +1238,74 @@ InternalIterator* BlockBasedTable::NewDataBlockIterator(
 
   const bool no_io = (ro.read_tier == kBlockCacheTier);
   Cache* block_cache = rep->table_options.block_cache.get();
-  Cache* block_cache_compressed =
-      rep->table_options.block_cache_compressed.get();
   CachableEntry<Block> block;
-
   BlockHandle handle;
   Slice input = index_value;
   // We intentionally allow extra stuff in index_value so that we
   // can add more features in the future.
   Status s = handle.DecodeFrom(&input);
+  Slice compression_dict;
+  if (s.ok()) {
+    if (rep->compression_dict_block) {
+      compression_dict = rep->compression_dict_block->data;
+    }
+    s = MaybeLoadDataBlockToCache(rep, ro, handle, compression_dict, &block);
+  }
 
-  if (!s.ok()) {
+  // Didn't get any data from block caches.
+  if (s.ok() && block.value == nullptr) {
+    if (no_io) {
+      // Could not read from block_cache and can't do IO
+      if (input_iter != nullptr) {
+        input_iter->SetStatus(Status::Incomplete("no blocking io"));
+        return input_iter;
+      } else {
+        return NewErrorInternalIterator(Status::Incomplete("no blocking io"));
+      }
+    }
+    std::unique_ptr<Block> block_value;
+    s = ReadBlockFromFile(
+        rep->file.get(), rep->footer, ro, handle, &block_value, rep->ioptions,
+        true /* compress */, compression_dict, rep->persistent_cache_options,
+        rep->global_seqno, rep->table_options.read_amp_bytes_per_bit);
+    if (s.ok()) {
+      block.value = block_value.release();
+    }
+  }
+
+  InternalIterator* iter;
+  if (s.ok()) {
+    assert(block.value != nullptr);
+    iter = block.value->NewIterator(&rep->internal_comparator, input_iter, true,
+                                    rep->ioptions.statistics);
+    if (block.cache_handle != nullptr) {
+      iter->RegisterCleanup(&ReleaseCachedEntry, block_cache,
+                            block.cache_handle);
+    } else {
+      iter->RegisterCleanup(&DeleteHeldResource, block.value, nullptr);
+    }
+  } else {
+    assert(block.value == nullptr);
     if (input_iter != nullptr) {
       input_iter->SetStatus(s);
-      return input_iter;
+      iter = input_iter;
     } else {
-      return NewErrorInternalIterator(s);
+      iter = NewErrorInternalIterator(s);
     }
   }
+  return iter;
+}
+
+Status BlockBasedTable::MaybeLoadDataBlockToCache(
+    Rep* rep, const ReadOptions& ro, const BlockHandle& handle,
+    Slice compression_dict, CachableEntry<Block>* block_entry) {
+  const bool no_io = (ro.read_tier == kBlockCacheTier);
+  Cache* block_cache = rep->table_options.block_cache.get();
+  Cache* block_cache_compressed =
+      rep->table_options.block_cache_compressed.get();
 
-  Slice compression_dict;
-  if (rep->compression_dict_block) {
-    compression_dict = rep->compression_dict_block->data;
-  }
   // If either block cache is enabled, we'll try to read from it.
+  Status s;
   if (block_cache != nullptr || block_cache_compressed != nullptr) {
     Statistics* statistics = rep->ioptions.statistics;
     char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
@@ -1286,10 +1327,10 @@ InternalIterator* BlockBasedTable::NewDataBlockIterator(
 
     s = GetDataBlockFromCache(
         key, ckey, block_cache, block_cache_compressed, rep->ioptions, ro,
-        &block, rep->table_options.format_version, compression_dict,
+        block_entry, rep->table_options.format_version, compression_dict,
        rep->table_options.read_amp_bytes_per_bit);
 
-    if (block.value == nullptr && !no_io && ro.fill_cache) {
+    if (block_entry->value == nullptr && !no_io && ro.fill_cache) {
       std::unique_ptr<Block> raw_block;
       {
         StopWatch sw(rep->ioptions.env, statistics, READ_BLOCK_GET_MICROS);
@@ -1303,54 +1344,12 @@ InternalIterator* BlockBasedTable::NewDataBlockIterator(
       if (s.ok()) {
         s = PutDataBlockToCache(
             key, ckey, block_cache, block_cache_compressed, ro, rep->ioptions,
-            &block, raw_block.release(), rep->table_options.format_version,
+            block_entry, raw_block.release(), rep->table_options.format_version,
             compression_dict, rep->table_options.read_amp_bytes_per_bit);
       }
     }
   }
-
-  // Didn't get any data from block caches.
-  if (s.ok() && block.value == nullptr) {
-    if (no_io) {
-      // Could not read from block_cache and can't do IO
-      if (input_iter != nullptr) {
-        input_iter->SetStatus(Status::Incomplete("no blocking io"));
-        return input_iter;
-      } else {
-        return NewErrorInternalIterator(Status::Incomplete("no blocking io"));
-      }
-    }
-    std::unique_ptr<Block> block_value;
-    s = ReadBlockFromFile(
-        rep->file.get(), rep->footer, ro, handle, &block_value, rep->ioptions,
-        true /* compress */, compression_dict, rep->persistent_cache_options,
-        rep->global_seqno, rep->table_options.read_amp_bytes_per_bit);
-    if (s.ok()) {
-      block.value = block_value.release();
-    }
-  }
-
-  InternalIterator* iter;
-  if (s.ok()) {
-    assert(block.value != nullptr);
-    iter = block.value->NewIterator(&rep->internal_comparator, input_iter, true,
-                                    rep->ioptions.statistics);
-    if (block.cache_handle != nullptr) {
-      iter->RegisterCleanup(&ReleaseCachedEntry, block_cache,
-                            block.cache_handle);
-    } else {
-      iter->RegisterCleanup(&DeleteHeldResource, block.value, nullptr);
-    }
-  } else {
-    assert(block.value == nullptr);
-    if (input_iter != nullptr) {
-      input_iter->SetStatus(s);
-      iter = input_iter;
-    } else {
-      iter = NewErrorInternalIterator(s);
-    }
-  }
-  return iter;
+  return s;
 }
 
 class BlockBasedTable::BlockEntryIteratorState : public TwoLevelIteratorState {
@@ -1489,12 +1488,15 @@ InternalIterator* BlockBasedTable::NewIterator(const ReadOptions& read_options,
 
 InternalIterator* BlockBasedTable::NewRangeTombstoneIterator(
     const ReadOptions& read_options) {
-  if (rep_->range_del_block.get() != nullptr) {
-    auto iter =
-        rep_->range_del_block->NewIterator(&(rep_->internal_comparator));
-    return iter;
-  }
-  return NewEmptyInternalIterator();
+  if (rep_->range_del_handle.IsNull()) {
+    return NewEmptyInternalIterator();
+  }
+  std::string str;
+  rep_->range_del_handle.EncodeTo(&str);
+  // Even though range_del_entry already references the meta-block when block
+  // cache is enabled, we still call the below function to get another reference
+  // since the caller may need the iterator beyond this table reader's lifetime.
+  return NewDataBlockIterator(rep_, read_options, Slice(str));
 }
 
 bool BlockBasedTable::FullFilterKeyMayMatch(const ReadOptions& read_options,
@@ -1968,6 +1970,7 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) {
 void BlockBasedTable::Close() {
   rep_->filter_entry.Release(rep_->table_options.block_cache.get());
   rep_->index_entry.Release(rep_->table_options.block_cache.get());
+  rep_->range_del_entry.Release(rep_->table_options.block_cache.get());
   // cleanup index and filter blocks to avoid accessing dangling pointer
   if (!rep_->table_options.no_block_cache) {
     char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h
index 94561059d..e2dd9a416 100644
--- a/table/block_based_table_reader.h
+++ b/table/block_based_table_reader.h
@@ -162,6 +162,18 @@ class BlockBasedTable : public TableReader {
   static InternalIterator* NewDataBlockIterator(
       Rep* rep, const ReadOptions& ro, const Slice& index_value,
       BlockIter* input_iter = nullptr);
+  // If block cache enabled (compressed or uncompressed), looks for the block
+  // identified by handle in (1) uncompressed cache, (2) compressed cache, and
+  // then (3) file. If found, inserts into the cache(s) that were searched
+  // unsuccessfully (e.g., if found in file, will add to both uncompressed and
+  // compressed caches if they're enabled).
+  //
+  // @param block_entry value is set to the uncompressed block if found. If
+  // in uncompressed block cache, also sets cache_handle to reference that
+  // block.
+  static Status MaybeLoadDataBlockToCache(
+      Rep* rep, const ReadOptions& ro, const BlockHandle& handle,
+      Slice compression_dict, CachableEntry<Block>* block_entry);
 
   // For the following two functions:
   // if `no_io == true`, we will not try to read filter/index from sst file
diff --git a/table/format.cc b/table/format.cc
index 8b558df0d..cfc3f0377 100644
--- a/table/format.cc
+++ b/table/format.cc
@@ -59,6 +59,9 @@ Status BlockHandle::DecodeFrom(Slice* input) {
       GetVarint64(input, &size_)) {
     return Status::OK();
   } else {
+    // reset in case failure after partially decoding
+    offset_ = 0;
+    size_ = 0;
     return Status::Corruption("bad block handle");
   }
 }
diff --git a/table/format.h b/table/format.h
index b885103ad..0d7cec5e7 100644
--- a/table/format.h
+++ b/table/format.h
@@ -65,8 +65,8 @@ class BlockHandle {
   enum { kMaxEncodedLength = 10 + 10 };
 
  private:
-  uint64_t offset_ = 0;
-  uint64_t size_ = 0;
+  uint64_t offset_;
+  uint64_t size_;
 
   static const BlockHandle kNullBlockHandle;
 };
@@ -242,6 +242,9 @@ extern Status UncompressBlockContentsForCompressionType(
 
 // Implementation details follow.  Clients should ignore,
 
+// TODO(andrewkr): we should prefer one way of representing a null/uninitialized
+// BlockHandle. Currently we use zeros for null and use negation-of-zeros for
+// uninitialized.
 inline BlockHandle::BlockHandle()
     : BlockHandle(~static_cast<uint64_t>(0), ~static_cast<uint64_t>(0)) {}
 
diff --git a/table/table_properties.cc b/table/table_properties.cc
index c1b02b16a..98ae1035d 100644
--- a/table/table_properties.cc
+++ b/table/table_properties.cc
@@ -47,6 +47,9 @@ namespace {
 Status SeekToMetaBlock(InternalIterator* meta_iter,
                        const std::string& block_name, bool* is_found,
                        BlockHandle* block_handle = nullptr) {
+  if (block_handle != nullptr) {
+    *block_handle = BlockHandle::NullBlockHandle();
+  }
   *is_found = true;
   meta_iter->Seek(block_name);
   if (meta_iter->status().ok()) {
diff --git a/table/table_test.cc b/table/table_test.cc
index ca9892316..a61800933 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -1177,23 +1177,29 @@ TEST_F(BlockBasedTableTest, RangeDelBlock) {
   c.Finish(options, ioptions, table_options, *internal_cmp, &sorted_keys,
            &kvmap);
 
-  std::unique_ptr<InternalIterator> iter(
-      c.GetTableReader()->NewRangeTombstoneIterator(ReadOptions()));
-  ASSERT_EQ(false, iter->Valid());
-  iter->SeekToFirst();
-  ASSERT_EQ(true, iter->Valid());
-  for (int i = 0; i < 2; i++) {
-    ASSERT_TRUE(iter->Valid());
-    ParsedInternalKey parsed_key;
-    ASSERT_TRUE(ParseInternalKey(iter->key(), &parsed_key));
-    RangeTombstone t(parsed_key, iter->value());
-    ASSERT_EQ(t.start_key_, keys[i]);
-    ASSERT_EQ(t.end_key_, vals[i]);
-    ASSERT_EQ(t.seq_, i);
-    iter->Next();
+  for (int j = 0; j < 2; ++j) {
+    std::unique_ptr<InternalIterator> iter(
+        c.GetTableReader()->NewRangeTombstoneIterator(ReadOptions()));
+    if (j > 0) {
+      // For second iteration, delete the table reader object and verify the
+      // iterator can still access its metablock's range tombstones.
+      c.ResetTableReader();
+    }
+    ASSERT_EQ(false, iter->Valid());
+    iter->SeekToFirst();
+    ASSERT_EQ(true, iter->Valid());
+    for (int i = 0; i < 2; i++) {
+      ASSERT_TRUE(iter->Valid());
+      ParsedInternalKey parsed_key;
+      ASSERT_TRUE(ParseInternalKey(iter->key(), &parsed_key));
+      RangeTombstone t(parsed_key, iter->value());
+      ASSERT_EQ(t.start_key_, keys[i]);
+      ASSERT_EQ(t.end_key_, vals[i]);
+      ASSERT_EQ(t.seq_, i);
+      iter->Next();
+    }
+    ASSERT_TRUE(!iter->Valid());
   }
-  ASSERT_TRUE(!iter->Valid());
-  c.ResetTableReader();
 }
 
 TEST_F(BlockBasedTableTest, FilterPolicyNameProperties) {
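
Note for reviewers: the lookup order documented on the new MaybeLoadDataBlockToCache() — (1) uncompressed block cache, (2) compressed block cache, (3) file, inserting into whichever caches missed — can be illustrated with a small standalone sketch. This is not RocksDB code; MockEnv, Decompress(), and ReadFromFile() are hypothetical stand-ins that only model the control flow.

// Standalone sketch (hypothetical types, not RocksDB's API) of the
// cache-lookup/backfill order described above.
#include <iostream>
#include <string>
#include <unordered_map>

struct MockEnv {
  std::unordered_map<std::string, std::string> uncompressed_cache;
  std::unordered_map<std::string, std::string> compressed_cache;

  // Pretend decompression / file I/O so the example runs on its own.
  static std::string Decompress(const std::string& raw) { return raw + "+d"; }
  static std::string ReadFromFile(const std::string& id) {
    return "raw(" + id + ")";
  }

  std::string Load(const std::string& id) {
    // (1) uncompressed block cache
    auto u = uncompressed_cache.find(id);
    if (u != uncompressed_cache.end()) {
      return u->second;
    }
    // (2) compressed block cache; on a hit, backfill the uncompressed cache
    auto c = compressed_cache.find(id);
    if (c != compressed_cache.end()) {
      std::string block = Decompress(c->second);
      uncompressed_cache.emplace(id, block);
      return block;
    }
    // (3) file; backfill both caches, i.e. "inserts into the cache(s) that
    // were searched unsuccessfully"
    std::string raw = ReadFromFile(id);
    compressed_cache.emplace(id, raw);
    std::string block = Decompress(raw);
    uncompressed_cache.emplace(id, block);
    return block;
  }
};

int main() {
  MockEnv env;
  env.Load("b1");                    // misses everywhere, reads the "file"
  env.Load("b1");                    // hits the uncompressed cache
  env.uncompressed_cache.clear();
  env.Load("b1");                    // hits the compressed cache, backfills
  std::cout << env.uncompressed_cache.size() << "\n";  // prints 1
  return 0;
}

The same reference-counting idea motivates routing NewRangeTombstoneIterator() through NewDataBlockIterator(): each iterator acquires its own block-cache handle (released via RegisterCleanup), so range tombstones remain readable even after the table reader and its pinned range_del_entry are gone, which is exactly what the updated RangeDelBlock test exercises.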