From 7a8d7358bb40b13a06c2c6adc62e80295d89ed05 Mon Sep 17 00:00:00 2001
From: haoyuhuang
Date: Fri, 14 Jun 2019 17:37:24 -0700
Subject: [PATCH] Integrate block cache tracer in block based table reader.
 (#5441)

Summary:
This PR integrates the block cache tracer into the block-based table reader.
The tracer writes block cache accesses using the trace_writer. The tracer is
null in this PR, so nothing is logged yet.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/5441

Differential Revision: D15772029

Pulled By: HaoyuHuang

fbshipit-source-id: a64adb92642cd23222e0ba8b10d86bf522b42f9b
---
 table/block_based/block_based_table_reader.cc | 265 ++++++++++++++----
 table/block_based/block_based_table_reader.h  |  18 ++
 tools/block_cache_trace_analyzer.h            |   5 +-
 tools/block_cache_trace_analyzer_test.cc      |   5 +-
 trace_replay/block_cache_tracer.cc            |  62 ++--
 trace_replay/block_cache_tracer.h             |  89 +++++-
 trace_replay/block_cache_tracer_test.cc       |  31 +-
 7 files changed, 365 insertions(+), 110 deletions(-)

diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index 7434188a0..0caea5088 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -1877,9 +1877,8 @@ CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
 CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
     FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_blk_handle,
     const bool is_a_filter_partition, bool no_io, GetContext* get_context,
-    BlockCacheLookupContext* /*lookup_context*/,
+    BlockCacheLookupContext* lookup_context,
     const SliceTransform* prefix_extractor) const {
-  // TODO(haoyu): Trace filter block access here.
   // If cache_index_and_filter_blocks is false, filter should be pre-populated.
   // We will return rep_->filter anyway. rep_->filter can be nullptr if filter
   // read fails at Open() time. We don't want to reload again since it will
@@ -1912,17 +1911,22 @@ CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
       GetEntryFromCache(block_cache, key, BlockType::kFilter, get_context);
 
   FilterBlockReader* filter = nullptr;
+  size_t usage = 0;
+  bool is_cache_hit = false;
+  bool return_empty_reader = false;
   if (cache_handle != nullptr) {
     filter =
         reinterpret_cast<FilterBlockReader*>(block_cache->Value(cache_handle));
+    usage = filter->ApproximateMemoryUsage();
+    is_cache_hit = true;
   } else if (no_io) {
     // Do not invoke any io.
-    return CachableEntry<FilterBlockReader>();
+    return_empty_reader = true;
   } else {
     filter = ReadFilter(prefetch_buffer, filter_blk_handle,
                         is_a_filter_partition, prefix_extractor);
     if (filter != nullptr) {
-      size_t usage = filter->ApproximateMemoryUsage();
+      usage = filter->ApproximateMemoryUsage();
       Status s = block_cache->Insert(
           key, filter, usage, &DeleteCachedFilterEntry, &cache_handle,
           rep_->table_options.cache_index_and_filter_blocks_with_high_priority
@@ -1934,19 +1938,36 @@ CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
       } else {
         RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_ADD_FAILURES);
         delete filter;
-        return CachableEntry<FilterBlockReader>();
+        return_empty_reader = true;
       }
     }
   }
+  if (block_cache_tracer_ && lookup_context) {
+    // Avoid making a copy of block_key and cf_name when constructing the
+    // access record.
+    BlockCacheTraceRecord access_record(
+        rep_->ioptions.env->NowMicros(),
+        /*block_key=*/"", TraceType::kBlockTraceFilterBlock,
+        /*block_size=*/usage, rep_->cf_id_for_tracing(),
+        /*cf_name=*/"", rep_->level_for_tracing(),
+        rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit,
+        /*no_insert=*/no_io);
+    block_cache_tracer_->WriteBlockAccess(access_record, key,
+                                          rep_->cf_name_for_tracing(),
+                                          /*referenced_key=*/nullptr);
+  }
+
+  if (return_empty_reader) {
+    return CachableEntry<FilterBlockReader>();
+  }
   return {filter, cache_handle ? block_cache : nullptr, cache_handle,
           /*own_value=*/false};
 }
 
 CachableEntry<UncompressionDict> BlockBasedTable::GetUncompressionDict(
     FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context,
-    BlockCacheLookupContext* /*lookup_context*/) const {
-  // TODO(haoyu): Trace the access on the uncompression dictionary here.
+    BlockCacheLookupContext* lookup_context) const {
   if (!rep_->table_options.cache_index_and_filter_blocks) {
     // block cache is either disabled or not used for meta-blocks. In either
     // case, BlockBasedTableReader is the owner of the uncompression dictionary.
@@ -1964,9 +1985,13 @@ CachableEntry<UncompressionDict> BlockBasedTable::GetUncompressionDict(
       GetEntryFromCache(rep_->table_options.block_cache.get(), cache_key,
                         BlockType::kCompressionDictionary, get_context);
   UncompressionDict* dict = nullptr;
+  bool is_cache_hit = false;
+  size_t usage = 0;
   if (cache_handle != nullptr) {
     dict = reinterpret_cast<UncompressionDict*>(
         rep_->table_options.block_cache->Value(cache_handle));
+    is_cache_hit = true;
+    usage = dict->ApproximateMemoryUsage();
   } else if (no_io) {
     // Do not invoke any io.
   } else {
@@ -1980,7 +2005,7 @@ CachableEntry<UncompressionDict> BlockBasedTable::GetUncompressionDict(
             new UncompressionDict(compression_dict_block->data.ToString(),
                                   rep_->blocks_definitely_zstd_compressed,
                                   rep_->ioptions.statistics));
-        const size_t usage = uncompression_dict->ApproximateMemoryUsage();
+        usage = uncompression_dict->ApproximateMemoryUsage();
         s = rep_->table_options.block_cache->Insert(
             cache_key, uncompression_dict.get(), usage,
             &DeleteCachedUncompressionDictEntry, &cache_handle,
@@ -2000,6 +2025,20 @@ CachableEntry<UncompressionDict> BlockBasedTable::GetUncompressionDict(
       }
     }
   }
+  if (block_cache_tracer_ && lookup_context) {
+    // Avoid making a copy of block_key and cf_name when constructing the
+    // access record.
+    BlockCacheTraceRecord access_record(
+        rep_->ioptions.env->NowMicros(),
+        /*block_key=*/"", TraceType::kBlockTraceUncompressionDictBlock,
+        /*block_size=*/usage, rep_->cf_id_for_tracing(),
+        /*cf_name=*/"", rep_->level_for_tracing(),
+        rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit,
+        /*no_insert=*/no_io);
+    block_cache_tracer_->WriteBlockAccess(access_record, cache_key,
+                                          rep_->cf_name_for_tracing(),
+                                          /*referenced_key=*/nullptr);
+  }
   return {dict, cache_handle ? rep_->table_options.block_cache.get() : nullptr,
           cache_handle, false /* own_value */};
 }
@@ -2116,13 +2155,10 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache(
     FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
     const BlockHandle& handle, const UncompressionDict& uncompression_dict,
     CachableEntry<Block>* block_entry, BlockType block_type,
-    GetContext* get_context,
-    BlockCacheLookupContext* /*lookup_context*/) const {
-  // TODO(haoyu): Trace data/index/range deletion block access here.
+    GetContext* get_context, BlockCacheLookupContext* lookup_context) const {
   assert(block_entry != nullptr);
   const bool no_io = (ro.read_tier == kBlockCacheTier);
   Cache* block_cache = rep_->table_options.block_cache.get();
-
   // No point to cache compressed blocks if it never goes away
   Cache* block_cache_compressed =
       rep_->immortal_table ? nullptr
@@ -2136,6 +2172,8 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache(
   char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
   Slice key /* key to the block cache */;
   Slice ckey /* key to the compressed block cache */;
+  bool is_cache_hit = false;
+  bool no_insert = true;
   if (block_cache != nullptr || block_cache_compressed != nullptr) {
     // create key for block cache
     if (block_cache != nullptr) {
@@ -2152,10 +2190,15 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache(
       s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed,
                                 ro, block_entry, uncompression_dict, block_type,
                                 get_context);
-
+      if (block_entry->GetValue()) {
+        // TODO(haoyu): Differentiate cache hit on uncompressed block cache and
+        // compressed block cache.
+        is_cache_hit = true;
+      }
       // Can't find the block from the cache. If I/O is allowed, read from the
       // file.
       if (block_entry->GetValue() == nullptr && !no_io && ro.fill_cache) {
+        no_insert = false;
         Statistics* statistics = rep_->ioptions.statistics;
         bool do_decompress =
             block_cache_compressed == nullptr && rep_->blocks_maybe_compressed;
@@ -2186,6 +2229,59 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache(
       }
     }
   }
+
+  // Fill lookup_context.
+  if (block_cache_tracer_ && lookup_context) {
+    size_t usage = 0;
+    uint64_t nkeys = 0;
+    if (block_entry->GetValue()) {
+      // Approximate the number of keys in the block using restarts.
+      nkeys = rep_->table_options.block_restart_interval *
+              block_entry->GetValue()->NumRestarts();
+      usage = block_entry->GetValue()->ApproximateMemoryUsage();
+    }
+    TraceType trace_block_type = TraceType::kTraceMax;
+    switch (block_type) {
+      case BlockType::kIndex:
+        trace_block_type = TraceType::kBlockTraceIndexBlock;
+        break;
+      case BlockType::kData:
+        trace_block_type = TraceType::kBlockTraceDataBlock;
+        break;
+      case BlockType::kRangeDeletion:
+        trace_block_type = TraceType::kBlockTraceRangeDeletionBlock;
+        break;
+      default:
+        // This cannot happen.
+        assert(false);
+        break;
+    }
+    if (BlockCacheTraceHelper::ShouldTraceReferencedKey(
+            trace_block_type, lookup_context->caller)) {
+      // Defer logging the access to Get() and MultiGet() to trace additional
+      // information, e.g., the referenced key and
+      // referenced_key_exist_in_block.
+
+      // Make a copy of the block key here since it will be logged later.
+      lookup_context->FillLookupContext(
+          is_cache_hit, no_insert, trace_block_type,
+          /*block_size=*/usage, /*block_key=*/key.ToString(), nkeys);
+    } else {
+      // Avoid making a copy of block_key and cf_name when constructing the
+      // access record.
+      BlockCacheTraceRecord access_record(
+          rep_->ioptions.env->NowMicros(),
+          /*block_key=*/"", trace_block_type,
+          /*block_size=*/usage, rep_->cf_id_for_tracing(),
+          /*cf_name=*/"", rep_->level_for_tracing(),
+          rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit,
+          no_insert);
+      block_cache_tracer_->WriteBlockAccess(access_record, key,
+                                            rep_->cf_name_for_tracing(),
+                                            /*referenced_key=*/nullptr);
+    }
+  }
+
   assert(s.ok() || block_entry->GetValue() == nullptr);
   return s;
 }
@@ -2874,11 +2970,15 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
         PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level);
         break;
       } else {
+        BlockCacheLookupContext lookup_data_block_context{
+            BlockCacheLookupCaller::kUserGet};
+        bool does_referenced_key_exist = false;
         DataBlockIter biter;
+        uint64_t referenced_data_size = 0;
         NewDataBlockIterator<DataBlockIter>(
             read_options, iiter->value(), &biter, BlockType::kData,
             /*key_includes_seq=*/true,
-            /*index_key_is_full=*/true, get_context, &lookup_context,
+            /*index_key_is_full=*/true, get_context, &lookup_data_block_context,
            /*s=*/Status(), /*prefetch_buffer*/ nullptr);
 
         if (read_options.read_tier == kBlockCacheTier &&
@@ -2902,25 +3002,47 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
           // the end of the block, i.e. cannot be in the following blocks
           // either. In this case, the seek_key cannot be found, so we break
           // from the top level for-loop.
-          break;
-        }
-
-        // Call the *saver function on each entry/block until it returns false
-        for (; biter.Valid(); biter.Next()) {
-          ParsedInternalKey parsed_key;
-          if (!ParseInternalKey(biter.key(), &parsed_key)) {
-            s = Status::Corruption(Slice());
-          }
+          done = true;
+        } else {
+          // Call the *saver function on each entry/block until it returns false
+          for (; biter.Valid(); biter.Next()) {
+            ParsedInternalKey parsed_key;
+            if (!ParseInternalKey(biter.key(), &parsed_key)) {
+              s = Status::Corruption(Slice());
+            }
 
-          if (!get_context->SaveValue(
-                  parsed_key, biter.value(), &matched,
-                  biter.IsValuePinned() ? &biter : nullptr)) {
-            done = true;
-            break;
+            if (!get_context->SaveValue(
+                    parsed_key, biter.value(), &matched,
+                    biter.IsValuePinned() ? &biter : nullptr)) {
+              does_referenced_key_exist = true;
+              referenced_data_size = biter.key().size() + biter.value().size();
+              done = true;
+              break;
+            }
           }
+          s = biter.status();
+        }
+        // Write the block cache access record.
+        if (block_cache_tracer_) {
+          // Avoid making a copy of block_key, cf_name, and referenced_key
+          // when constructing the access record.
+          BlockCacheTraceRecord access_record(
+              rep_->ioptions.env->NowMicros(),
+              /*block_key=*/"", lookup_data_block_context.block_type,
+              lookup_data_block_context.block_size, rep_->cf_id_for_tracing(),
+              /*cf_name=*/"", rep_->level_for_tracing(),
+              rep_->sst_number_for_tracing(), lookup_data_block_context.caller,
+              lookup_data_block_context.is_cache_hit,
+              lookup_data_block_context.no_insert,
+              /*referenced_key=*/"", referenced_data_size,
+              lookup_data_block_context.num_keys_in_block,
+              does_referenced_key_exist);
+          block_cache_tracer_->WriteBlockAccess(
+              access_record, lookup_data_block_context.block_key,
+              rep_->cf_name_for_tracing(), key);
         }
-        s = biter.status();
       }
+
       if (done) {
         // Avoid the extra Next which is expensive in two-level indexes
         break;
@@ -2992,14 +3114,18 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options,
       bool done = false;
       for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) {
         bool reusing_block = true;
+        uint64_t referenced_data_size = 0;
+        bool does_referenced_key_exist = false;
+        BlockCacheLookupContext lookup_data_block_context(
+            BlockCacheLookupCaller::kUserMGet);
         if (iiter->value().offset() != offset) {
           offset = iiter->value().offset();
           biter.Invalidate(Status::OK());
           NewDataBlockIterator<DataBlockIter>(
               read_options, iiter->value(), &biter, BlockType::kData,
               /*key_includes_seq=*/false,
-              /*index_key_is_full=*/true, get_context, &lookup_context,
-              Status(), nullptr);
+              /*index_key_is_full=*/true, get_context,
+              &lookup_data_block_context, Status(), nullptr);
           reusing_block = false;
         }
         if (read_options.read_tier == kBlockCacheTier &&
@@ -3021,38 +3147,59 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options,
           // the end of the block, i.e. cannot be in the following blocks
           // either. In this case, the seek_key cannot be found, so we break
           // from the top level for-loop.
-          break;
-        }
-
-        // Call the *saver function on each entry/block until it returns false
-        for (; biter.Valid(); biter.Next()) {
-          ParsedInternalKey parsed_key;
-          Cleanable dummy;
-          Cleanable* value_pinner = nullptr;
-
-          if (!ParseInternalKey(biter.key(), &parsed_key)) {
-            s = Status::Corruption(Slice());
-          }
-          if (biter.IsValuePinned()) {
-            if (reusing_block) {
-              Cache* block_cache = rep_->table_options.block_cache.get();
-              assert(biter.cache_handle() != nullptr);
-              block_cache->Ref(biter.cache_handle());
-              dummy.RegisterCleanup(&ReleaseCachedEntry, block_cache,
-                                    biter.cache_handle());
-              value_pinner = &dummy;
-            } else {
-              value_pinner = &biter;
+          done = true;
+        } else {
+          // Call the *saver function on each entry/block until it returns false
+          for (; biter.Valid(); biter.Next()) {
+            ParsedInternalKey parsed_key;
+            Cleanable dummy;
+            Cleanable* value_pinner = nullptr;
+
+            if (!ParseInternalKey(biter.key(), &parsed_key)) {
+              s = Status::Corruption(Slice());
+            }
+            if (biter.IsValuePinned()) {
+              if (reusing_block) {
+                Cache* block_cache = rep_->table_options.block_cache.get();
+                assert(biter.cache_handle() != nullptr);
+                block_cache->Ref(biter.cache_handle());
+                dummy.RegisterCleanup(&ReleaseCachedEntry, block_cache,
+                                      biter.cache_handle());
+                value_pinner = &dummy;
+              } else {
+                value_pinner = &biter;
+              }
             }
-          }
 
-          if (!get_context->SaveValue(
-                  parsed_key, biter.value(), &matched, value_pinner)) {
-            done = true;
-            break;
+            if (!get_context->SaveValue(parsed_key, biter.value(), &matched,
+                                        value_pinner)) {
+              does_referenced_key_exist = true;
+              referenced_data_size = biter.key().size() + biter.value().size();
+              done = true;
+              break;
+            }
           }
+          s = biter.status();
+        }
+        // Write the block cache access.
+        if (block_cache_tracer_) {
+          // Avoid making a copy of block_key, cf_name, and referenced_key
+          // when constructing the access record.
+          BlockCacheTraceRecord access_record(
+              rep_->ioptions.env->NowMicros(),
+              /*block_key=*/"", lookup_data_block_context.block_type,
+              lookup_data_block_context.block_size, rep_->cf_id_for_tracing(),
+              /*cf_name=*/"", rep_->level_for_tracing(),
+              rep_->sst_number_for_tracing(), lookup_data_block_context.caller,
+              lookup_data_block_context.is_cache_hit,
+              lookup_data_block_context.no_insert,
+              /*referenced_key=*/"", referenced_data_size,
+              lookup_data_block_context.num_keys_in_block,
+              does_referenced_key_exist);
+          block_cache_tracer_->WriteBlockAccess(
+              access_record, lookup_data_block_context.block_key,
+              rep_->cf_name_for_tracing(), key);
         }
-        s = biter.status();
         if (done) {
           // Avoid the extra Next which is expensive in two-level indexes
           break;
diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h
index 223746b3a..17c4e7238 100644
--- a/table/block_based/block_based_table_reader.h
+++ b/table/block_based/block_based_table_reader.h
@@ -17,6 +17,7 @@
 #include 

 #include "db/range_tombstone_fragmenter.h"
+#include "file/filename.h"
 #include "options/cf_options.h"
 #include "rocksdb/options.h"
 #include "rocksdb/persistent_cache.h"
@@ -571,6 +572,23 @@ struct BlockBasedTable::Rep {
                ? kDisableGlobalSequenceNumber
                : global_seqno;
   }
+
+  uint64_t cf_id_for_tracing() const {
+    return table_properties ? table_properties->column_family_id
+                            : rocksdb::TablePropertiesCollectorFactory::
+                                  Context::kUnknownColumnFamily;
+  }
+
+  Slice cf_name_for_tracing() const {
+    return table_properties ? table_properties->column_family_name
+                            : BlockCacheTraceHelper::kUnknownColumnFamilyName;
+  }
+
+  uint32_t level_for_tracing() const { return level >= 0 ? level : UINT32_MAX; }
+
+  uint64_t sst_number_for_tracing() const {
+    return file ? TableFileNameToNumber(file->file_name()) : UINT64_MAX;
+  }
 };
 
 // Iterates over the contents of BlockBasedTable.
diff --git a/tools/block_cache_trace_analyzer.h b/tools/block_cache_trace_analyzer.h
index 9dde8a939..51bb1ec79 100644
--- a/tools/block_cache_trace_analyzer.h
+++ b/tools/block_cache_trace_analyzer.h
@@ -35,10 +35,11 @@ struct BlockAccessInfo {
     block_size = access.block_size;
     caller_num_access_map[access.caller]++;
     num_accesses++;
-    if (ShouldTraceReferencedKey(access)) {
+    if (BlockCacheTraceHelper::ShouldTraceReferencedKey(access.block_type,
+                                                        access.caller)) {
       num_keys = access.num_keys_in_block;
 
-      if (access.is_referenced_key_exist_in_block == Boolean::kTrue) {
+      if (access.referenced_key_exist_in_block == Boolean::kTrue) {
         key_num_access_map[access.referenced_key]++;
         num_referenced_key_exist_in_block++;
       } else {
diff --git a/tools/block_cache_trace_analyzer_test.cc b/tools/block_cache_trace_analyzer_test.cc
index 96f52c1ec..a75804492 100644
--- a/tools/block_cache_trace_analyzer_test.cc
+++ b/tools/block_cache_trace_analyzer_test.cc
@@ -89,9 +89,10 @@ class BlockCacheTracerTest : public testing::Test {
       // The writer should only write these fields for data blocks when the
       // caller is either GET or MGET.
       record.referenced_key = kRefKeyPrefix + std::to_string(key_id);
-      record.is_referenced_key_exist_in_block = Boolean::kTrue;
+      record.referenced_key_exist_in_block = Boolean::kTrue;
       record.num_keys_in_block = kNumKeysInBlock;
-      ASSERT_OK(writer->WriteBlockAccess(record));
+      ASSERT_OK(writer->WriteBlockAccess(
+          record, record.block_key, record.cf_name, record.referenced_key));
     }
   }
 
diff --git a/trace_replay/block_cache_tracer.cc b/trace_replay/block_cache_tracer.cc
index 565511e5a..f733bc900 100644
--- a/trace_replay/block_cache_tracer.cc
+++ b/trace_replay/block_cache_tracer.cc
@@ -15,13 +15,6 @@ namespace rocksdb {
 
 namespace {
 const unsigned int kCharSize = 1;
-}  // namespace
-
-bool ShouldTraceReferencedKey(const BlockCacheTraceRecord& record) {
-  return (record.block_type == TraceType::kBlockTraceDataBlock) &&
-         (record.caller == BlockCacheLookupCaller::kUserGet ||
-          record.caller == BlockCacheLookupCaller::kUserMGet);
-}
 
 bool ShouldTrace(const BlockCacheTraceRecord& record,
                  const TraceOptions& trace_options) {
@@ -34,6 +27,17 @@ bool ShouldTrace(const BlockCacheTraceRecord& record,
   const uint64_t hash = GetSliceNPHash64(Slice(record.block_key));
   return hash % trace_options.sampling_frequency == 0;
 }
+}  // namespace
+
+const std::string BlockCacheTraceHelper::kUnknownColumnFamilyName =
+    "UnknownColumnFamily";
+
+bool BlockCacheTraceHelper::ShouldTraceReferencedKey(
+    TraceType block_type, BlockCacheLookupCaller caller) {
+  return (block_type == TraceType::kBlockTraceDataBlock) &&
+         (caller == BlockCacheLookupCaller::kUserGet ||
+          caller == BlockCacheLookupCaller::kUserMGet);
+}
 
 BlockCacheTraceWriter::BlockCacheTraceWriter(
     Env* env, const TraceOptions& trace_options,
@@ -43,7 +47,8 @@ BlockCacheTraceWriter::BlockCacheTraceWriter(
       trace_writer_(std::move(trace_writer)) {}
 
 Status BlockCacheTraceWriter::WriteBlockAccess(
-    const BlockCacheTraceRecord& record) {
+    const BlockCacheTraceRecord& record, const Slice& block_key,
+    const Slice& cf_name, const Slice& referenced_key) {
   uint64_t trace_file_size = trace_writer_->GetFileSize();
   if (trace_file_size > trace_options_.max_trace_file_size) {
     return Status::OK();
@@ -51,19 +56,21 @@ Status BlockCacheTraceWriter::WriteBlockAccess(
   Trace trace;
   trace.ts = record.access_timestamp;
   trace.type = record.block_type;
-  PutLengthPrefixedSlice(&trace.payload, record.block_key);
+  PutLengthPrefixedSlice(&trace.payload, block_key);
   PutFixed64(&trace.payload, record.block_size);
-  PutFixed32(&trace.payload, record.cf_id);
-  PutLengthPrefixedSlice(&trace.payload, record.cf_name);
+  PutFixed64(&trace.payload, record.cf_id);
+  PutLengthPrefixedSlice(&trace.payload, cf_name);
   PutFixed32(&trace.payload, record.level);
-  PutFixed32(&trace.payload, record.sst_fd_number);
+  PutFixed64(&trace.payload, record.sst_fd_number);
   trace.payload.push_back(record.caller);
   trace.payload.push_back(record.is_cache_hit);
   trace.payload.push_back(record.no_insert);
-  if (ShouldTraceReferencedKey(record)) {
-    PutLengthPrefixedSlice(&trace.payload, record.referenced_key);
+  if (BlockCacheTraceHelper::ShouldTraceReferencedKey(record.block_type,
+                                                      record.caller)) {
+    PutLengthPrefixedSlice(&trace.payload, referenced_key);
+    PutFixed64(&trace.payload, record.referenced_data_size);
     PutFixed64(&trace.payload, record.num_keys_in_block);
-    trace.payload.push_back(record.is_referenced_key_exist_in_block);
+    trace.payload.push_back(record.referenced_key_exist_in_block);
   }
   std::string encoded_trace;
   TracerHelper::EncodeTrace(trace, &encoded_trace);
@@ -143,6 +150,7 @@ Status BlockCacheTraceReader::ReadAccess(BlockCacheTraceRecord* record) {
   record->access_timestamp = trace.ts;
   record->block_type = trace.type;
   Slice enc_slice = Slice(trace.payload);
+
   Slice block_key;
   if (!GetLengthPrefixedSlice(&enc_slice, &block_key)) {
     return Status::Incomplete(
@@ -153,7 +161,7 @@ Status BlockCacheTraceReader::ReadAccess(BlockCacheTraceRecord* record) {
     return Status::Incomplete(
         "Incomplete access record: Failed to read block size.");
   }
-  if (!GetFixed32(&enc_slice, &record->cf_id)) {
+  if (!GetFixed64(&enc_slice, &record->cf_id)) {
     return Status::Incomplete(
         "Incomplete access record: Failed to read column family ID.");
   }
@@ -167,7 +175,7 @@ Status BlockCacheTraceReader::ReadAccess(BlockCacheTraceRecord* record) {
     return Status::Incomplete(
         "Incomplete access record: Failed to read level.");
   }
-  if (!GetFixed32(&enc_slice, &record->sst_fd_number)) {
+  if (!GetFixed64(&enc_slice, &record->sst_fd_number)) {
    return Status::Incomplete(
         "Incomplete access record: Failed to read SST file number.");
   }
@@ -190,13 +198,18 @@ Status BlockCacheTraceReader::ReadAccess(BlockCacheTraceRecord* record) {
   record->no_insert = static_cast<Boolean>(enc_slice[0]);
   enc_slice.remove_prefix(kCharSize);
 
-  if (ShouldTraceReferencedKey(*record)) {
+  if (BlockCacheTraceHelper::ShouldTraceReferencedKey(record->block_type,
+                                                      record->caller)) {
     Slice referenced_key;
     if (!GetLengthPrefixedSlice(&enc_slice, &referenced_key)) {
       return Status::Incomplete(
           "Incomplete access record: Failed to read the referenced key.");
     }
     record->referenced_key = referenced_key.ToString();
+    if (!GetFixed64(&enc_slice, &record->referenced_data_size)) {
+      return Status::Incomplete(
+          "Incomplete access record: Failed to read the referenced data size.");
+    }
     if (!GetFixed64(&enc_slice, &record->num_keys_in_block)) {
       return Status::Incomplete(
           "Incomplete access record: Failed to read the number of keys in the "
@@ -205,10 +218,9 @@ Status BlockCacheTraceReader::ReadAccess(BlockCacheTraceRecord* record) {
     if (enc_slice.empty()) {
       return Status::Incomplete(
          "Incomplete access record: Failed to read "
read " - "is_referenced_key_exist_in_block."); + "referenced_key_exist_in_block."); } - record->is_referenced_key_exist_in_block = - static_cast(enc_slice[0]); + record->referenced_key_exist_in_block = static_cast(enc_slice[0]); } return Status::OK(); } @@ -239,7 +251,10 @@ void BlockCacheTracer::EndTrace() { writer_.store(nullptr); } -Status BlockCacheTracer::WriteBlockAccess(const BlockCacheTraceRecord& record) { +Status BlockCacheTracer::WriteBlockAccess(const BlockCacheTraceRecord& record, + const Slice& block_key, + const Slice& cf_name, + const Slice& referenced_key) { if (!writer_.load() || !ShouldTrace(record, trace_options_)) { return Status::OK(); } @@ -247,7 +262,8 @@ Status BlockCacheTracer::WriteBlockAccess(const BlockCacheTraceRecord& record) { if (!writer_.load()) { return Status::OK(); } - return writer_.load()->WriteBlockAccess(record); + return writer_.load()->WriteBlockAccess(record, block_key, cf_name, + referenced_key); } } // namespace rocksdb diff --git a/trace_replay/block_cache_tracer.h b/trace_replay/block_cache_tracer.h index 320e6d67b..bf8813311 100644 --- a/trace_replay/block_cache_tracer.h +++ b/trace_replay/block_cache_tracer.h @@ -49,28 +49,80 @@ struct BlockCacheLookupContext { BlockCacheLookupContext(const BlockCacheLookupCaller& _caller) : caller(_caller) {} const BlockCacheLookupCaller caller; + // These are populated when we perform lookup/insert on block cache. The block + // cache tracer uses these inforation when logging the block access at + // BlockBasedTable::GET and BlockBasedTable::MultiGet. + bool is_cache_hit = false; + bool no_insert = false; + TraceType block_type = TraceType::kTraceMax; + uint64_t block_size = 0; + std::string block_key; + uint64_t num_keys_in_block = 0; + + void FillLookupContext(bool _is_cache_hit, bool _no_insert, + TraceType _block_type, uint64_t _block_size, + const std::string& _block_key, + uint64_t _num_keys_in_block) { + is_cache_hit = _is_cache_hit; + no_insert = _no_insert; + block_type = _block_type; + block_size = _block_size; + block_key = _block_key; + num_keys_in_block = _num_keys_in_block; + } }; enum Boolean : char { kTrue = 1, kFalse = 0 }; struct BlockCacheTraceRecord { // Required fields for all accesses. - uint64_t access_timestamp; + uint64_t access_timestamp = 0; std::string block_key; - TraceType block_type; - uint64_t block_size; - uint32_t cf_id; + TraceType block_type = TraceType::kTraceMax; + uint64_t block_size = 0; + uint64_t cf_id = 0; std::string cf_name; - uint32_t level; - uint32_t sst_fd_number; - BlockCacheLookupCaller caller; - Boolean is_cache_hit; - Boolean no_insert; + uint32_t level = 0; + uint64_t sst_fd_number = 0; + BlockCacheLookupCaller caller = + BlockCacheLookupCaller::kMaxBlockCacheLookupCaller; + Boolean is_cache_hit = Boolean::kFalse; + Boolean no_insert = Boolean::kFalse; // Required fields for data block and user Get/Multi-Get only. 
   std::string referenced_key;
+  uint64_t referenced_data_size = 0;
   uint64_t num_keys_in_block = 0;
-  Boolean is_referenced_key_exist_in_block = Boolean::kFalse;
+  Boolean referenced_key_exist_in_block = Boolean::kFalse;
+
+  BlockCacheTraceRecord() {}
+
+  BlockCacheTraceRecord(uint64_t _access_timestamp, std::string _block_key,
+                        TraceType _block_type, uint64_t _block_size,
+                        uint64_t _cf_id, std::string _cf_name, uint32_t _level,
+                        uint64_t _sst_fd_number, BlockCacheLookupCaller _caller,
+                        bool _is_cache_hit, bool _no_insert,
+                        std::string _referenced_key = "",
+                        uint64_t _referenced_data_size = 0,
+                        uint64_t _num_keys_in_block = 0,
+                        bool _referenced_key_exist_in_block = false)
+      : access_timestamp(_access_timestamp),
+        block_key(_block_key),
+        block_type(_block_type),
+        block_size(_block_size),
+        cf_id(_cf_id),
+        cf_name(_cf_name),
+        level(_level),
+        sst_fd_number(_sst_fd_number),
+        caller(_caller),
+        is_cache_hit(_is_cache_hit ? Boolean::kTrue : Boolean::kFalse),
+        no_insert(_no_insert ? Boolean::kTrue : Boolean::kFalse),
+        referenced_key(_referenced_key),
+        referenced_data_size(_referenced_data_size),
+        num_keys_in_block(_num_keys_in_block),
+        referenced_key_exist_in_block(
+            _referenced_key_exist_in_block ? Boolean::kTrue : Boolean::kFalse) {
+  }
 };
 
 struct BlockCacheTraceHeader {
@@ -79,7 +131,13 @@
   uint32_t rocksdb_minor_version;
 };
 
-bool ShouldTraceReferencedKey(const BlockCacheTraceRecord& record);
+class BlockCacheTraceHelper {
+ public:
+  static bool ShouldTraceReferencedKey(TraceType block_type,
+                                       BlockCacheLookupCaller caller);
+
+  static const std::string kUnknownColumnFamilyName;
+};
 
 // BlockCacheTraceWriter captures all RocksDB block cache accesses using a
 // user-provided TraceWriter. Every RocksDB operation is written as a single
@@ -96,7 +154,10 @@ class BlockCacheTraceWriter {
   BlockCacheTraceWriter(BlockCacheTraceWriter&&) = delete;
   BlockCacheTraceWriter& operator=(BlockCacheTraceWriter&&) = delete;
 
-  Status WriteBlockAccess(const BlockCacheTraceRecord& record);
+  // Pass Slice references to avoid copies.
+  Status WriteBlockAccess(const BlockCacheTraceRecord& record,
+                          const Slice& block_key, const Slice& cf_name,
+                          const Slice& referenced_key);
 
   // Write a trace header at the beginning, typically on initiating a trace,
   // with some metadata like a magic number and RocksDB version.
@@ -148,7 +209,9 @@ class BlockCacheTracer {
   // Stop writing block cache accesses to the trace_writer.
   void EndTrace();
 
-  Status WriteBlockAccess(const BlockCacheTraceRecord& record);
+  Status WriteBlockAccess(const BlockCacheTraceRecord& record,
+                          const Slice& block_key, const Slice& cf_name,
+                          const Slice& referenced_key);
 
  private:
   TraceOptions trace_options_;
diff --git a/trace_replay/block_cache_tracer_test.cc b/trace_replay/block_cache_tracer_test.cc
index 0f3ca67c6..95fe16b8c 100644
--- a/trace_replay/block_cache_tracer_test.cc
+++ b/trace_replay/block_cache_tracer_test.cc
@@ -20,6 +20,7 @@ const uint32_t kLevel = 1;
 const uint64_t kSSTFDNumber = 100;
 const std::string kRefKeyPrefix = "test-get-";
 const uint64_t kNumKeysInBlock = 1024;
+const uint64_t kReferencedDataSize = 10;
 }  // namespace
 
 class BlockCacheTracerTest : public testing::Test {
@@ -61,7 +62,7 @@ class BlockCacheTracerTest : public testing::Test {
       BlockCacheTraceRecord record;
       record.block_type = block_type;
       record.block_size = kBlockSize + key_id;
-      record.block_key = kBlockKeyPrefix + std::to_string(key_id);
+      record.block_key = (kBlockKeyPrefix + std::to_string(key_id));
       record.access_timestamp = env_->NowMicros();
       record.cf_id = kCFId;
       record.cf_name = kDefaultColumnFamilyName;
@@ -73,10 +74,12 @@ class BlockCacheTracerTest : public testing::Test {
       // Provide these fields for all block types.
      // The writer should only write these fields for data blocks when the
       // caller is either GET or MGET.
-      record.referenced_key = kRefKeyPrefix + std::to_string(key_id);
-      record.is_referenced_key_exist_in_block = Boolean::kTrue;
+      record.referenced_key = (kRefKeyPrefix + std::to_string(key_id));
+      record.referenced_key_exist_in_block = Boolean::kTrue;
       record.num_keys_in_block = kNumKeysInBlock;
-      ASSERT_OK(writer->WriteBlockAccess(record));
+      record.referenced_data_size = kReferencedDataSize + key_id;
+      ASSERT_OK(writer->WriteBlockAccess(
+          record, record.block_key, record.cf_name, record.referenced_key));
     }
   }
 
@@ -95,7 +98,7 @@ class BlockCacheTracerTest : public testing::Test {
     record.is_cache_hit = Boolean::kFalse;
     record.no_insert = Boolean::kFalse;
     record.referenced_key = kRefKeyPrefix + std::to_string(key_id);
-    record.is_referenced_key_exist_in_block = Boolean::kTrue;
+    record.referenced_key_exist_in_block = Boolean::kTrue;
     record.num_keys_in_block = kNumKeysInBlock;
     return record;
   }
@@ -122,13 +125,15 @@ class BlockCacheTracerTest : public testing::Test {
             record.caller == BlockCacheLookupCaller::kUserMGet)) {
       ASSERT_EQ(kRefKeyPrefix + std::to_string(key_id),
                 record.referenced_key);
-      ASSERT_EQ(Boolean::kTrue, record.is_referenced_key_exist_in_block);
+      ASSERT_EQ(Boolean::kTrue, record.referenced_key_exist_in_block);
       ASSERT_EQ(kNumKeysInBlock, record.num_keys_in_block);
+      ASSERT_EQ(kReferencedDataSize + key_id, record.referenced_data_size);
       continue;
     }
     ASSERT_EQ("", record.referenced_key);
-    ASSERT_EQ(Boolean::kFalse, record.is_referenced_key_exist_in_block);
+    ASSERT_EQ(Boolean::kFalse, record.referenced_key_exist_in_block);
     ASSERT_EQ(0, record.num_keys_in_block);
+    ASSERT_EQ(0, record.referenced_data_size);
   }
 }
 
@@ -147,7 +152,8 @@ TEST_F(BlockCacheTracerTest, AtomicWriteBeforeStartTrace) {
     BlockCacheTracer writer;
     // The record should not be written to the trace_file since StartTrace is
    // not called.
- ASSERT_OK(writer.WriteBlockAccess(record)); + ASSERT_OK(writer.WriteBlockAccess(record, record.block_key, record.cf_name, + record.referenced_key)); ASSERT_OK(env_->FileExists(trace_file_path_)); } { @@ -170,7 +176,8 @@ TEST_F(BlockCacheTracerTest, AtomicWrite) { &trace_writer)); BlockCacheTracer writer; ASSERT_OK(writer.StartTrace(env_, trace_opt, std::move(trace_writer))); - ASSERT_OK(writer.WriteBlockAccess(record)); + ASSERT_OK(writer.WriteBlockAccess(record, record.block_key, record.cf_name, + record.referenced_key)); ASSERT_OK(env_->FileExists(trace_file_path_)); } { @@ -197,11 +204,13 @@ TEST_F(BlockCacheTracerTest, AtomicNoWriteAfterEndTrace) { &trace_writer)); BlockCacheTracer writer; ASSERT_OK(writer.StartTrace(env_, trace_opt, std::move(trace_writer))); - ASSERT_OK(writer.WriteBlockAccess(record)); + ASSERT_OK(writer.WriteBlockAccess(record, record.block_key, record.cf_name, + record.referenced_key)); writer.EndTrace(); // Write the record again. This time the record should not be written since // EndTrace is called. - ASSERT_OK(writer.WriteBlockAccess(record)); + ASSERT_OK(writer.WriteBlockAccess(record, record.block_key, record.cf_name, + record.referenced_key)); ASSERT_OK(env_->FileExists(trace_file_path_)); } {
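
Usage note: the tests above exercise the new four-argument WriteBlockAccess()
API end to end. For reference, below is a minimal standalone sketch of the
same flow. The trace file path, the sample field values, and the
TraceOneAccess() wrapper are illustrative assumptions, not part of this
patch; in normal operation BlockBasedTable fills the record from the lookup
context and the Rep tracing helpers (cf_id_for_tracing() etc.) as shown in
the reader changes above.

#include <memory>
#include <utility>

#include "rocksdb/env.h"
#include "rocksdb/trace_reader_writer.h"
#include "trace_replay/block_cache_tracer.h"

using namespace rocksdb;

Status TraceOneAccess(Env* env) {
  // Hypothetical output path; any writable location works.
  std::unique_ptr<TraceWriter> trace_writer;
  Status s = NewFileTraceWriter(env, EnvOptions(), "/tmp/block_cache_trace",
                                &trace_writer);
  if (!s.ok()) {
    return s;
  }
  TraceOptions trace_opt;
  BlockCacheTracer tracer;
  s = tracer.StartTrace(env, trace_opt, std::move(trace_writer));
  if (!s.ok()) {
    return s;
  }
  // Illustrative field values only; the constructor argument order follows
  // the BlockCacheTraceRecord definition introduced in this patch.
  BlockCacheTraceRecord record(
      env->NowMicros(), /*block_key=*/"block-1",
      TraceType::kBlockTraceDataBlock,
      /*block_size=*/4096, /*cf_id=*/0, /*cf_name=*/"default", /*level=*/1,
      /*sst_fd_number=*/100, BlockCacheLookupCaller::kUserGet,
      /*is_cache_hit=*/false, /*no_insert=*/false,
      /*referenced_key=*/"user-key", /*referenced_data_size=*/64,
      /*num_keys_in_block=*/1024, /*referenced_key_exist_in_block=*/true);
  // block_key, cf_name, and referenced_key are passed as Slices so the
  // writer does not copy them when encoding the trace payload.
  s = tracer.WriteBlockAccess(record, record.block_key, record.cf_name,
                              record.referenced_key);
  tracer.EndTrace();
  return s;
}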