Integrate block cache tracer in block based table reader. (#5441)

Summary:
This PR integrates the block cache tracer into block based table reader. The tracer will write the block cache accesses using the trace_writer. The tracer is null in this PR so that nothing will be logged.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5441

Differential Revision: D15772029

Pulled By: HaoyuHuang

fbshipit-source-id: a64adb92642cd23222e0ba8b10d86bf522b42f9b
main
haoyuhuang 5 years ago committed by Facebook Github Bot
parent f1219644ec
commit 7a8d7358bb
  1. 265
      table/block_based/block_based_table_reader.cc
  2. 18
      table/block_based/block_based_table_reader.h
  3. 5
      tools/block_cache_trace_analyzer.h
  4. 5
      tools/block_cache_trace_analyzer_test.cc
  5. 62
      trace_replay/block_cache_tracer.cc
  6. 89
      trace_replay/block_cache_tracer.h
  7. 31
      trace_replay/block_cache_tracer_test.cc

@ -1877,9 +1877,8 @@ CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_blk_handle,
const bool is_a_filter_partition, bool no_io, GetContext* get_context,
BlockCacheLookupContext* /*lookup_context*/,
BlockCacheLookupContext* lookup_context,
const SliceTransform* prefix_extractor) const {
// TODO(haoyu): Trace filter block access here.
// If cache_index_and_filter_blocks is false, filter should be pre-populated.
// We will return rep_->filter anyway. rep_->filter can be nullptr if filter
// read fails at Open() time. We don't want to reload again since it will
@ -1912,17 +1911,22 @@ CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
GetEntryFromCache(block_cache, key, BlockType::kFilter, get_context);
FilterBlockReader* filter = nullptr;
size_t usage = 0;
bool is_cache_hit = false;
bool return_empty_reader = false;
if (cache_handle != nullptr) {
filter =
reinterpret_cast<FilterBlockReader*>(block_cache->Value(cache_handle));
usage = filter->ApproximateMemoryUsage();
is_cache_hit = true;
} else if (no_io) {
// Do not invoke any io.
return CachableEntry<FilterBlockReader>();
return_empty_reader = true;
} else {
filter = ReadFilter(prefetch_buffer, filter_blk_handle,
is_a_filter_partition, prefix_extractor);
if (filter != nullptr) {
size_t usage = filter->ApproximateMemoryUsage();
usage = filter->ApproximateMemoryUsage();
Status s = block_cache->Insert(
key, filter, usage, &DeleteCachedFilterEntry, &cache_handle,
rep_->table_options.cache_index_and_filter_blocks_with_high_priority
@ -1934,19 +1938,36 @@ CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
} else {
RecordTick(rep_->ioptions.statistics, BLOCK_CACHE_ADD_FAILURES);
delete filter;
return CachableEntry<FilterBlockReader>();
return_empty_reader = true;
}
}
}
if (block_cache_tracer_ && lookup_context) {
// Avoid making copy of block_key and cf_name when constructing the access
// record.
BlockCacheTraceRecord access_record(
rep_->ioptions.env->NowMicros(),
/*block_key=*/"", TraceType::kBlockTraceFilterBlock,
/*block_size=*/usage, rep_->cf_id_for_tracing(),
/*cf_name=*/"", rep_->level_for_tracing(),
rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit,
/*no_insert=*/no_io);
block_cache_tracer_->WriteBlockAccess(access_record, key,
rep_->cf_name_for_tracing(),
/*referenced_key=*/nullptr);
}
if (return_empty_reader) {
return CachableEntry<FilterBlockReader>();
}
return {filter, cache_handle ? block_cache : nullptr, cache_handle,
/*own_value=*/false};
}
CachableEntry<UncompressionDict> BlockBasedTable::GetUncompressionDict(
FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context,
BlockCacheLookupContext* /*lookup_context*/) const {
// TODO(haoyu): Trace the access on the uncompression dictionary here.
BlockCacheLookupContext* lookup_context) const {
if (!rep_->table_options.cache_index_and_filter_blocks) {
// block cache is either disabled or not used for meta-blocks. In either
// case, BlockBasedTableReader is the owner of the uncompression dictionary.
@ -1964,9 +1985,13 @@ CachableEntry<UncompressionDict> BlockBasedTable::GetUncompressionDict(
GetEntryFromCache(rep_->table_options.block_cache.get(), cache_key,
BlockType::kCompressionDictionary, get_context);
UncompressionDict* dict = nullptr;
bool is_cache_hit = false;
size_t usage = 0;
if (cache_handle != nullptr) {
dict = reinterpret_cast<UncompressionDict*>(
rep_->table_options.block_cache->Value(cache_handle));
is_cache_hit = true;
usage = dict->ApproximateMemoryUsage();
} else if (no_io) {
// Do not invoke any io.
} else {
@ -1980,7 +2005,7 @@ CachableEntry<UncompressionDict> BlockBasedTable::GetUncompressionDict(
new UncompressionDict(compression_dict_block->data.ToString(),
rep_->blocks_definitely_zstd_compressed,
rep_->ioptions.statistics));
const size_t usage = uncompression_dict->ApproximateMemoryUsage();
usage = uncompression_dict->ApproximateMemoryUsage();
s = rep_->table_options.block_cache->Insert(
cache_key, uncompression_dict.get(), usage,
&DeleteCachedUncompressionDictEntry, &cache_handle,
@ -2000,6 +2025,20 @@ CachableEntry<UncompressionDict> BlockBasedTable::GetUncompressionDict(
}
}
}
if (block_cache_tracer_ && lookup_context) {
// Avoid making copy of block_key and cf_name when constructing the access
// record.
BlockCacheTraceRecord access_record(
rep_->ioptions.env->NowMicros(),
/*block_key=*/"", TraceType::kBlockTraceUncompressionDictBlock,
/*block_size=*/usage, rep_->cf_id_for_tracing(),
/*cf_name=*/"", rep_->level_for_tracing(),
rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit,
/*no_insert=*/no_io);
block_cache_tracer_->WriteBlockAccess(access_record, cache_key,
rep_->cf_name_for_tracing(),
/*referenced_key=*/nullptr);
}
return {dict, cache_handle ? rep_->table_options.block_cache.get() : nullptr,
cache_handle, false /* own_value */};
}
@ -2116,13 +2155,10 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache(
FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
const BlockHandle& handle, const UncompressionDict& uncompression_dict,
CachableEntry<Block>* block_entry, BlockType block_type,
GetContext* get_context,
BlockCacheLookupContext* /*lookup_context*/) const {
// TODO(haoyu): Trace data/index/range deletion block access here.
GetContext* get_context, BlockCacheLookupContext* lookup_context) const {
assert(block_entry != nullptr);
const bool no_io = (ro.read_tier == kBlockCacheTier);
Cache* block_cache = rep_->table_options.block_cache.get();
// No point to cache compressed blocks if it never goes away
Cache* block_cache_compressed =
rep_->immortal_table ? nullptr
@ -2136,6 +2172,8 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache(
char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
Slice key /* key to the block cache */;
Slice ckey /* key to the compressed block cache */;
bool is_cache_hit = false;
bool no_insert = true;
if (block_cache != nullptr || block_cache_compressed != nullptr) {
// create key for block cache
if (block_cache != nullptr) {
@ -2152,10 +2190,15 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache(
s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed,
ro, block_entry, uncompression_dict, block_type,
get_context);
if (block_entry->GetValue()) {
// TODO(haoyu): Differentiate cache hit on uncompressed block cache and
// compressed block cache.
is_cache_hit = true;
}
// Can't find the block from the cache. If I/O is allowed, read from the
// file.
if (block_entry->GetValue() == nullptr && !no_io && ro.fill_cache) {
no_insert = false;
Statistics* statistics = rep_->ioptions.statistics;
bool do_decompress =
block_cache_compressed == nullptr && rep_->blocks_maybe_compressed;
@ -2186,6 +2229,59 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache(
}
}
}
// Fill lookup_context.
if (block_cache_tracer_ && lookup_context) {
size_t usage = 0;
uint64_t nkeys = 0;
if (block_entry->GetValue()) {
// Approximate the number of keys in the block using restarts.
nkeys = rep_->table_options.block_restart_interval *
block_entry->GetValue()->NumRestarts();
usage = block_entry->GetValue()->ApproximateMemoryUsage();
}
TraceType trace_block_type = TraceType::kTraceMax;
switch (block_type) {
case BlockType::kIndex:
trace_block_type = TraceType::kBlockTraceIndexBlock;
break;
case BlockType::kData:
trace_block_type = TraceType::kBlockTraceDataBlock;
break;
case BlockType::kRangeDeletion:
trace_block_type = TraceType::kBlockTraceRangeDeletionBlock;
break;
default:
// This cannot happen.
assert(false);
break;
}
if (BlockCacheTraceHelper::ShouldTraceReferencedKey(
trace_block_type, lookup_context->caller)) {
// Defer logging the access to Get() and MultiGet() to trace additional
// information, e.g., the referenced key,
// referenced_key_exist_in_block.
// Make a copy of the block key here since it will be logged later.
lookup_context->FillLookupContext(
is_cache_hit, no_insert, trace_block_type,
/*block_size=*/usage, /*block_key=*/key.ToString(), nkeys);
} else {
// Avoid making copy of block_key and cf_name when constructing the access
// record.
BlockCacheTraceRecord access_record(
rep_->ioptions.env->NowMicros(),
/*block_key=*/"", trace_block_type,
/*block_size=*/usage, rep_->cf_id_for_tracing(),
/*cf_name=*/"", rep_->level_for_tracing(),
rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit,
no_insert);
block_cache_tracer_->WriteBlockAccess(access_record, key,
rep_->cf_name_for_tracing(),
/*referenced_key=*/nullptr);
}
}
assert(s.ok() || block_entry->GetValue() == nullptr);
return s;
}
@ -2874,11 +2970,15 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level);
break;
} else {
BlockCacheLookupContext lookup_data_block_context{
BlockCacheLookupCaller::kUserGet};
bool does_referenced_key_exist = false;
DataBlockIter biter;
uint64_t referenced_data_size = 0;
NewDataBlockIterator<DataBlockIter>(
read_options, iiter->value(), &biter, BlockType::kData,
/*key_includes_seq=*/true,
/*index_key_is_full=*/true, get_context, &lookup_context,
/*index_key_is_full=*/true, get_context, &lookup_data_block_context,
/*s=*/Status(), /*prefetch_buffer*/ nullptr);
if (read_options.read_tier == kBlockCacheTier &&
@ -2902,25 +3002,47 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
// the end of the block, i.e. cannot be in the following blocks
// either. In this case, the seek_key cannot be found, so we break
// from the top level for-loop.
break;
}
// Call the *saver function on each entry/block until it returns false
for (; biter.Valid(); biter.Next()) {
ParsedInternalKey parsed_key;
if (!ParseInternalKey(biter.key(), &parsed_key)) {
s = Status::Corruption(Slice());
}
done = true;
} else {
// Call the *saver function on each entry/block until it returns false
for (; biter.Valid(); biter.Next()) {
ParsedInternalKey parsed_key;
if (!ParseInternalKey(biter.key(), &parsed_key)) {
s = Status::Corruption(Slice());
}
if (!get_context->SaveValue(
parsed_key, biter.value(), &matched,
biter.IsValuePinned() ? &biter : nullptr)) {
done = true;
break;
if (!get_context->SaveValue(
parsed_key, biter.value(), &matched,
biter.IsValuePinned() ? &biter : nullptr)) {
does_referenced_key_exist = true;
referenced_data_size = biter.key().size() + biter.value().size();
done = true;
break;
}
}
s = biter.status();
}
// Write the block cache access record.
if (block_cache_tracer_) {
// Avoid making copy of block_key, cf_name, and referenced_key when
// constructing the access record.
BlockCacheTraceRecord access_record(
rep_->ioptions.env->NowMicros(),
/*block_key=*/"", lookup_data_block_context.block_type,
lookup_data_block_context.block_size, rep_->cf_id_for_tracing(),
/*cf_name=*/"", rep_->level_for_tracing(),
rep_->sst_number_for_tracing(), lookup_data_block_context.caller,
lookup_data_block_context.is_cache_hit,
lookup_data_block_context.no_insert,
/*referenced_key=*/"", referenced_data_size,
lookup_data_block_context.num_keys_in_block,
does_referenced_key_exist);
block_cache_tracer_->WriteBlockAccess(
access_record, lookup_data_block_context.block_key,
rep_->cf_name_for_tracing(), key);
}
s = biter.status();
}
if (done) {
// Avoid the extra Next which is expensive in two-level indexes
break;
@ -2992,14 +3114,18 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options,
bool done = false;
for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) {
bool reusing_block = true;
uint64_t referenced_data_size = 0;
bool does_referenced_key_exist = false;
BlockCacheLookupContext lookup_data_block_context(
BlockCacheLookupCaller::kUserMGet);
if (iiter->value().offset() != offset) {
offset = iiter->value().offset();
biter.Invalidate(Status::OK());
NewDataBlockIterator<DataBlockIter>(
read_options, iiter->value(), &biter, BlockType::kData,
/*key_includes_seq=*/false,
/*index_key_is_full=*/true, get_context, &lookup_context,
Status(), nullptr);
/*index_key_is_full=*/true, get_context,
&lookup_data_block_context, Status(), nullptr);
reusing_block = false;
}
if (read_options.read_tier == kBlockCacheTier &&
@ -3021,38 +3147,59 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options,
// the end of the block, i.e. cannot be in the following blocks
// either. In this case, the seek_key cannot be found, so we break
// from the top level for-loop.
break;
}
// Call the *saver function on each entry/block until it returns false
for (; biter.Valid(); biter.Next()) {
ParsedInternalKey parsed_key;
Cleanable dummy;
Cleanable* value_pinner = nullptr;
if (!ParseInternalKey(biter.key(), &parsed_key)) {
s = Status::Corruption(Slice());
}
if (biter.IsValuePinned()) {
if (reusing_block) {
Cache* block_cache = rep_->table_options.block_cache.get();
assert(biter.cache_handle() != nullptr);
block_cache->Ref(biter.cache_handle());
dummy.RegisterCleanup(&ReleaseCachedEntry, block_cache,
biter.cache_handle());
value_pinner = &dummy;
} else {
value_pinner = &biter;
done = true;
} else {
// Call the *saver function on each entry/block until it returns false
for (; biter.Valid(); biter.Next()) {
ParsedInternalKey parsed_key;
Cleanable dummy;
Cleanable* value_pinner = nullptr;
if (!ParseInternalKey(biter.key(), &parsed_key)) {
s = Status::Corruption(Slice());
}
if (biter.IsValuePinned()) {
if (reusing_block) {
Cache* block_cache = rep_->table_options.block_cache.get();
assert(biter.cache_handle() != nullptr);
block_cache->Ref(biter.cache_handle());
dummy.RegisterCleanup(&ReleaseCachedEntry, block_cache,
biter.cache_handle());
value_pinner = &dummy;
} else {
value_pinner = &biter;
}
}
}
if (!get_context->SaveValue(
parsed_key, biter.value(), &matched, value_pinner)) {
done = true;
break;
if (!get_context->SaveValue(parsed_key, biter.value(), &matched,
value_pinner)) {
does_referenced_key_exist = true;
referenced_data_size = biter.key().size() + biter.value().size();
done = true;
break;
}
}
s = biter.status();
}
// Write the block cache access.
if (block_cache_tracer_) {
// Avoid making copy of block_key, cf_name, and referenced_key when
// constructing the access record.
BlockCacheTraceRecord access_record(
rep_->ioptions.env->NowMicros(),
/*block_key=*/"", lookup_data_block_context.block_type,
lookup_data_block_context.block_size, rep_->cf_id_for_tracing(),
/*cf_name=*/"", rep_->level_for_tracing(),
rep_->sst_number_for_tracing(), lookup_data_block_context.caller,
lookup_data_block_context.is_cache_hit,
lookup_data_block_context.no_insert,
/*referenced_key=*/"", referenced_data_size,
lookup_data_block_context.num_keys_in_block,
does_referenced_key_exist);
block_cache_tracer_->WriteBlockAccess(
access_record, lookup_data_block_context.block_key,
rep_->cf_name_for_tracing(), key);
}
s = biter.status();
if (done) {
// Avoid the extra Next which is expensive in two-level indexes
break;

@ -17,6 +17,7 @@
#include <vector>
#include "db/range_tombstone_fragmenter.h"
#include "file/filename.h"
#include "options/cf_options.h"
#include "rocksdb/options.h"
#include "rocksdb/persistent_cache.h"
@ -571,6 +572,23 @@ struct BlockBasedTable::Rep {
? kDisableGlobalSequenceNumber
: global_seqno;
}
// Column family ID to record in block cache trace records. When the table
// properties are not available, the "unknown column family" sentinel of
// TablePropertiesCollectorFactory::Context is reported instead.
uint64_t cf_id_for_tracing() const {
  if (table_properties) {
    return table_properties->column_family_id;
  }
  return rocksdb::TablePropertiesCollectorFactory::Context::
      kUnknownColumnFamily;
}
// Column family name to record in block cache trace records. The returned
// Slice refers to either the name stored in the table properties or, when
// those are not available, BlockCacheTraceHelper::kUnknownColumnFamilyName;
// no string is copied.
Slice cf_name_for_tracing() const {
  if (table_properties) {
    return table_properties->column_family_name;
  }
  return BlockCacheTraceHelper::kUnknownColumnFamilyName;
}
// LSM level to record in block cache trace records. A negative level is
// reported as UINT32_MAX.
uint32_t level_for_tracing() const {
  if (level < 0) {
    return UINT32_MAX;
  }
  return static_cast<uint32_t>(level);
}
// SST file number to record in block cache trace records, parsed from the
// name of the underlying file. Reported as UINT64_MAX when there is no file.
uint64_t sst_number_for_tracing() const {
  if (!file) {
    return UINT64_MAX;
  }
  return TableFileNameToNumber(file->file_name());
}
};
// Iterates over the contents of BlockBasedTable.

@ -35,10 +35,11 @@ struct BlockAccessInfo {
block_size = access.block_size;
caller_num_access_map[access.caller]++;
num_accesses++;
if (ShouldTraceReferencedKey(access)) {
if (BlockCacheTraceHelper::ShouldTraceReferencedKey(access.block_type,
access.caller)) {
num_keys = access.num_keys_in_block;
if (access.is_referenced_key_exist_in_block == Boolean::kTrue) {
if (access.referenced_key_exist_in_block == Boolean::kTrue) {
key_num_access_map[access.referenced_key]++;
num_referenced_key_exist_in_block++;
} else {

@ -89,9 +89,10 @@ class BlockCacheTracerTest : public testing::Test {
// The writer should only write these fields for data blocks and the
// caller is either GET or MGET.
record.referenced_key = kRefKeyPrefix + std::to_string(key_id);
record.is_referenced_key_exist_in_block = Boolean::kTrue;
record.referenced_key_exist_in_block = Boolean::kTrue;
record.num_keys_in_block = kNumKeysInBlock;
ASSERT_OK(writer->WriteBlockAccess(record));
ASSERT_OK(writer->WriteBlockAccess(
record, record.block_key, record.cf_name, record.referenced_key));
}
}

@ -15,13 +15,6 @@ namespace rocksdb {
namespace {
const unsigned int kCharSize = 1;
} // namespace
bool ShouldTraceReferencedKey(const BlockCacheTraceRecord& record) {
return (record.block_type == TraceType::kBlockTraceDataBlock) &&
(record.caller == BlockCacheLookupCaller::kUserGet ||
record.caller == BlockCacheLookupCaller::kUserMGet);
}
bool ShouldTrace(const BlockCacheTraceRecord& record,
const TraceOptions& trace_options) {
@ -34,6 +27,17 @@ bool ShouldTrace(const BlockCacheTraceRecord& record,
const uint64_t hash = GetSliceNPHash64(Slice(record.block_key));
return hash % trace_options.sampling_frequency == 0;
}
} // namespace
const std::string BlockCacheTraceHelper::kUnknownColumnFamilyName =
"UnknownColumnFamily";
// Returns true when an access must carry the referenced-key fields
// (referenced key, referenced data size, number of keys in the block and
// whether the key exists in the block). This is the case only for data block
// accesses issued by a user Get or MultiGet.
bool BlockCacheTraceHelper::ShouldTraceReferencedKey(
    TraceType block_type, BlockCacheLookupCaller caller) {
  if (block_type != TraceType::kBlockTraceDataBlock) {
    return false;
  }
  const bool is_user_get = (caller == BlockCacheLookupCaller::kUserGet);
  const bool is_user_mget = (caller == BlockCacheLookupCaller::kUserMGet);
  return is_user_get || is_user_mget;
}
BlockCacheTraceWriter::BlockCacheTraceWriter(
Env* env, const TraceOptions& trace_options,
@ -43,7 +47,8 @@ BlockCacheTraceWriter::BlockCacheTraceWriter(
trace_writer_(std::move(trace_writer)) {}
Status BlockCacheTraceWriter::WriteBlockAccess(
const BlockCacheTraceRecord& record) {
const BlockCacheTraceRecord& record, const Slice& block_key,
const Slice& cf_name, const Slice& referenced_key) {
uint64_t trace_file_size = trace_writer_->GetFileSize();
if (trace_file_size > trace_options_.max_trace_file_size) {
return Status::OK();
@ -51,19 +56,21 @@ Status BlockCacheTraceWriter::WriteBlockAccess(
Trace trace;
trace.ts = record.access_timestamp;
trace.type = record.block_type;
PutLengthPrefixedSlice(&trace.payload, record.block_key);
PutLengthPrefixedSlice(&trace.payload, block_key);
PutFixed64(&trace.payload, record.block_size);
PutFixed32(&trace.payload, record.cf_id);
PutLengthPrefixedSlice(&trace.payload, record.cf_name);
PutFixed64(&trace.payload, record.cf_id);
PutLengthPrefixedSlice(&trace.payload, cf_name);
PutFixed32(&trace.payload, record.level);
PutFixed32(&trace.payload, record.sst_fd_number);
PutFixed64(&trace.payload, record.sst_fd_number);
trace.payload.push_back(record.caller);
trace.payload.push_back(record.is_cache_hit);
trace.payload.push_back(record.no_insert);
if (ShouldTraceReferencedKey(record)) {
PutLengthPrefixedSlice(&trace.payload, record.referenced_key);
if (BlockCacheTraceHelper::ShouldTraceReferencedKey(record.block_type,
record.caller)) {
PutLengthPrefixedSlice(&trace.payload, referenced_key);
PutFixed64(&trace.payload, record.referenced_data_size);
PutFixed64(&trace.payload, record.num_keys_in_block);
trace.payload.push_back(record.is_referenced_key_exist_in_block);
trace.payload.push_back(record.referenced_key_exist_in_block);
}
std::string encoded_trace;
TracerHelper::EncodeTrace(trace, &encoded_trace);
@ -143,6 +150,7 @@ Status BlockCacheTraceReader::ReadAccess(BlockCacheTraceRecord* record) {
record->access_timestamp = trace.ts;
record->block_type = trace.type;
Slice enc_slice = Slice(trace.payload);
Slice block_key;
if (!GetLengthPrefixedSlice(&enc_slice, &block_key)) {
return Status::Incomplete(
@ -153,7 +161,7 @@ Status BlockCacheTraceReader::ReadAccess(BlockCacheTraceRecord* record) {
return Status::Incomplete(
"Incomplete access record: Failed to read block size.");
}
if (!GetFixed32(&enc_slice, &record->cf_id)) {
if (!GetFixed64(&enc_slice, &record->cf_id)) {
return Status::Incomplete(
"Incomplete access record: Failed to read column family ID.");
}
@ -167,7 +175,7 @@ Status BlockCacheTraceReader::ReadAccess(BlockCacheTraceRecord* record) {
return Status::Incomplete(
"Incomplete access record: Failed to read level.");
}
if (!GetFixed32(&enc_slice, &record->sst_fd_number)) {
if (!GetFixed64(&enc_slice, &record->sst_fd_number)) {
return Status::Incomplete(
"Incomplete access record: Failed to read SST file number.");
}
@ -190,13 +198,18 @@ Status BlockCacheTraceReader::ReadAccess(BlockCacheTraceRecord* record) {
record->no_insert = static_cast<Boolean>(enc_slice[0]);
enc_slice.remove_prefix(kCharSize);
if (ShouldTraceReferencedKey(*record)) {
if (BlockCacheTraceHelper::ShouldTraceReferencedKey(record->block_type,
record->caller)) {
Slice referenced_key;
if (!GetLengthPrefixedSlice(&enc_slice, &referenced_key)) {
return Status::Incomplete(
"Incomplete access record: Failed to read the referenced key.");
}
record->referenced_key = referenced_key.ToString();
if (!GetFixed64(&enc_slice, &record->referenced_data_size)) {
return Status::Incomplete(
"Incomplete access record: Failed to read the referenced data size.");
}
if (!GetFixed64(&enc_slice, &record->num_keys_in_block)) {
return Status::Incomplete(
"Incomplete access record: Failed to read the number of keys in the "
@ -205,10 +218,9 @@ Status BlockCacheTraceReader::ReadAccess(BlockCacheTraceRecord* record) {
if (enc_slice.empty()) {
return Status::Incomplete(
"Incomplete access record: Failed to read "
"is_referenced_key_exist_in_block.");
"referenced_key_exist_in_block.");
}
record->is_referenced_key_exist_in_block =
static_cast<Boolean>(enc_slice[0]);
record->referenced_key_exist_in_block = static_cast<Boolean>(enc_slice[0]);
}
return Status::OK();
}
@ -239,7 +251,10 @@ void BlockCacheTracer::EndTrace() {
writer_.store(nullptr);
}
Status BlockCacheTracer::WriteBlockAccess(const BlockCacheTraceRecord& record) {
Status BlockCacheTracer::WriteBlockAccess(const BlockCacheTraceRecord& record,
const Slice& block_key,
const Slice& cf_name,
const Slice& referenced_key) {
if (!writer_.load() || !ShouldTrace(record, trace_options_)) {
return Status::OK();
}
@ -247,7 +262,8 @@ Status BlockCacheTracer::WriteBlockAccess(const BlockCacheTraceRecord& record) {
if (!writer_.load()) {
return Status::OK();
}
return writer_.load()->WriteBlockAccess(record);
return writer_.load()->WriteBlockAccess(record, block_key, cf_name,
referenced_key);
}
} // namespace rocksdb

@ -49,28 +49,80 @@ struct BlockCacheLookupContext {
BlockCacheLookupContext(const BlockCacheLookupCaller& _caller)
: caller(_caller) {}
const BlockCacheLookupCaller caller;
// These are populated when we perform lookup/insert on block cache. The block
// cache tracer uses this information when logging the block access at
// BlockBasedTable::Get and BlockBasedTable::MultiGet.
bool is_cache_hit = false;
bool no_insert = false;
TraceType block_type = TraceType::kTraceMax;
uint64_t block_size = 0;
std::string block_key;
uint64_t num_keys_in_block = 0;
// Records the outcome of a single block cache lookup/insert in this context
// so that the access can be logged later by the caller. Every field is
// overwritten; the block key is copied into the context.
void FillLookupContext(bool _is_cache_hit, bool _no_insert,
                       TraceType _block_type, uint64_t _block_size,
                       const std::string& _block_key,
                       uint64_t _num_keys_in_block) {
  // Identity and shape of the block that was accessed.
  block_key = _block_key;
  block_type = _block_type;
  block_size = _block_size;
  num_keys_in_block = _num_keys_in_block;
  // How the access interacted with the cache.
  is_cache_hit = _is_cache_hit;
  no_insert = _no_insert;
}
};
enum Boolean : char { kTrue = 1, kFalse = 0 };
struct BlockCacheTraceRecord {
// Required fields for all accesses.
uint64_t access_timestamp;
uint64_t access_timestamp = 0;
std::string block_key;
TraceType block_type;
uint64_t block_size;
uint32_t cf_id;
TraceType block_type = TraceType::kTraceMax;
uint64_t block_size = 0;
uint64_t cf_id = 0;
std::string cf_name;
uint32_t level;
uint32_t sst_fd_number;
BlockCacheLookupCaller caller;
Boolean is_cache_hit;
Boolean no_insert;
uint32_t level = 0;
uint64_t sst_fd_number = 0;
BlockCacheLookupCaller caller =
BlockCacheLookupCaller::kMaxBlockCacheLookupCaller;
Boolean is_cache_hit = Boolean::kFalse;
Boolean no_insert = Boolean::kFalse;
// Required fields for data block and user Get/Multi-Get only.
std::string referenced_key;
uint64_t referenced_data_size = 0;
uint64_t num_keys_in_block = 0;
Boolean is_referenced_key_exist_in_block = Boolean::kFalse;
Boolean referenced_key_exist_in_block = Boolean::kFalse;
BlockCacheTraceRecord() {}
// Builds a fully specified access record.
//
// The string parameters (_block_key, _cf_name, _referenced_key) are taken by
// value and copied into the record. The bool flags are converted to the
// single-byte Boolean enum used by the record. The last four parameters are
// defaulted; per the field comments of this struct they are only required
// for data block accesses issued by a user Get/MultiGet.
BlockCacheTraceRecord(uint64_t _access_timestamp, std::string _block_key,
TraceType _block_type, uint64_t _block_size,
uint64_t _cf_id, std::string _cf_name, uint32_t _level,
uint64_t _sst_fd_number, BlockCacheLookupCaller _caller,
bool _is_cache_hit, bool _no_insert,
std::string _referenced_key = "",
uint64_t _referenced_data_size = 0,
uint64_t _num_keys_in_block = 0,
bool _referenced_key_exist_in_block = false)
: access_timestamp(_access_timestamp),
block_key(_block_key),
block_type(_block_type),
block_size(_block_size),
cf_id(_cf_id),
cf_name(_cf_name),
level(_level),
sst_fd_number(_sst_fd_number),
caller(_caller),
// Map the bool flags onto the Boolean enum stored in the record.
is_cache_hit(_is_cache_hit ? Boolean::kTrue : Boolean::kFalse),
no_insert(_no_insert ? Boolean::kTrue : Boolean::kFalse),
referenced_key(_referenced_key),
referenced_data_size(_referenced_data_size),
num_keys_in_block(_num_keys_in_block),
referenced_key_exist_in_block(
_referenced_key_exist_in_block ? Boolean::kTrue : Boolean::kFalse) {
}
};
struct BlockCacheTraceHeader {
@ -79,7 +131,13 @@ struct BlockCacheTraceHeader {
uint32_t rocksdb_minor_version;
};
bool ShouldTraceReferencedKey(const BlockCacheTraceRecord& record);
// Static helpers shared by the block cache trace writer, the trace reader and
// the code that emits block cache accesses.
class BlockCacheTraceHelper {
public:
// Returns true if an access of the given block type by the given caller
// should carry the referenced-key fields, i.e. it is a data block access
// issued by a user Get or MultiGet.
static bool ShouldTraceReferencedKey(TraceType block_type,
BlockCacheLookupCaller caller);
// Column family name reported when the real name is not available.
static const std::string kUnknownColumnFamilyName;
};
// BlockCacheTraceWriter captures all RocksDB block cache accesses using a
// user-provided TraceWriter. Every RocksDB operation is written as a single
@ -96,7 +154,10 @@ class BlockCacheTraceWriter {
BlockCacheTraceWriter(BlockCacheTraceWriter&&) = delete;
BlockCacheTraceWriter& operator=(BlockCacheTraceWriter&&) = delete;
Status WriteBlockAccess(const BlockCacheTraceRecord& record);
// Pass Slice references to avoid copy.
Status WriteBlockAccess(const BlockCacheTraceRecord& record,
const Slice& block_key, const Slice& cf_name,
const Slice& referenced_key);
// Write a trace header at the beginning, typically on initiating a trace,
// with some metadata like a magic number and RocksDB version.
@ -148,7 +209,9 @@ class BlockCacheTracer {
// Stop writing block cache accesses to the trace_writer.
void EndTrace();
Status WriteBlockAccess(const BlockCacheTraceRecord& record);
Status WriteBlockAccess(const BlockCacheTraceRecord& record,
const Slice& block_key, const Slice& cf_name,
const Slice& referenced_key);
private:
TraceOptions trace_options_;

@ -20,6 +20,7 @@ const uint32_t kLevel = 1;
const uint64_t kSSTFDNumber = 100;
const std::string kRefKeyPrefix = "test-get-";
const uint64_t kNumKeysInBlock = 1024;
const uint64_t kReferencedDataSize = 10;
} // namespace
class BlockCacheTracerTest : public testing::Test {
@ -61,7 +62,7 @@ class BlockCacheTracerTest : public testing::Test {
BlockCacheTraceRecord record;
record.block_type = block_type;
record.block_size = kBlockSize + key_id;
record.block_key = kBlockKeyPrefix + std::to_string(key_id);
record.block_key = (kBlockKeyPrefix + std::to_string(key_id));
record.access_timestamp = env_->NowMicros();
record.cf_id = kCFId;
record.cf_name = kDefaultColumnFamilyName;
@ -73,10 +74,12 @@ class BlockCacheTracerTest : public testing::Test {
// Provide these fields for all block types.
// The writer should only write these fields for data blocks and the
// caller is either GET or MGET.
record.referenced_key = kRefKeyPrefix + std::to_string(key_id);
record.is_referenced_key_exist_in_block = Boolean::kTrue;
record.referenced_key = (kRefKeyPrefix + std::to_string(key_id));
record.referenced_key_exist_in_block = Boolean::kTrue;
record.num_keys_in_block = kNumKeysInBlock;
ASSERT_OK(writer->WriteBlockAccess(record));
record.referenced_data_size = kReferencedDataSize + key_id;
ASSERT_OK(writer->WriteBlockAccess(
record, record.block_key, record.cf_name, record.referenced_key));
}
}
@ -95,7 +98,7 @@ class BlockCacheTracerTest : public testing::Test {
record.is_cache_hit = Boolean::kFalse;
record.no_insert = Boolean::kFalse;
record.referenced_key = kRefKeyPrefix + std::to_string(key_id);
record.is_referenced_key_exist_in_block = Boolean::kTrue;
record.referenced_key_exist_in_block = Boolean::kTrue;
record.num_keys_in_block = kNumKeysInBlock;
return record;
}
@ -122,13 +125,15 @@ class BlockCacheTracerTest : public testing::Test {
record.caller == BlockCacheLookupCaller::kUserMGet)) {
ASSERT_EQ(kRefKeyPrefix + std::to_string(key_id),
record.referenced_key);
ASSERT_EQ(Boolean::kTrue, record.is_referenced_key_exist_in_block);
ASSERT_EQ(Boolean::kTrue, record.referenced_key_exist_in_block);
ASSERT_EQ(kNumKeysInBlock, record.num_keys_in_block);
ASSERT_EQ(kReferencedDataSize + key_id, record.referenced_data_size);
continue;
}
ASSERT_EQ("", record.referenced_key);
ASSERT_EQ(Boolean::kFalse, record.is_referenced_key_exist_in_block);
ASSERT_EQ(Boolean::kFalse, record.referenced_key_exist_in_block);
ASSERT_EQ(0, record.num_keys_in_block);
ASSERT_EQ(0, record.referenced_data_size);
}
}
@ -147,7 +152,8 @@ TEST_F(BlockCacheTracerTest, AtomicWriteBeforeStartTrace) {
BlockCacheTracer writer;
// The record should not be written to the trace_file since StartTrace is not
// called.
ASSERT_OK(writer.WriteBlockAccess(record));
ASSERT_OK(writer.WriteBlockAccess(record, record.block_key, record.cf_name,
record.referenced_key));
ASSERT_OK(env_->FileExists(trace_file_path_));
}
{
@ -170,7 +176,8 @@ TEST_F(BlockCacheTracerTest, AtomicWrite) {
&trace_writer));
BlockCacheTracer writer;
ASSERT_OK(writer.StartTrace(env_, trace_opt, std::move(trace_writer)));
ASSERT_OK(writer.WriteBlockAccess(record));
ASSERT_OK(writer.WriteBlockAccess(record, record.block_key, record.cf_name,
record.referenced_key));
ASSERT_OK(env_->FileExists(trace_file_path_));
}
{
@ -197,11 +204,13 @@ TEST_F(BlockCacheTracerTest, AtomicNoWriteAfterEndTrace) {
&trace_writer));
BlockCacheTracer writer;
ASSERT_OK(writer.StartTrace(env_, trace_opt, std::move(trace_writer)));
ASSERT_OK(writer.WriteBlockAccess(record));
ASSERT_OK(writer.WriteBlockAccess(record, record.block_key, record.cf_name,
record.referenced_key));
writer.EndTrace();
// Write the record again. This time the record should not be written since
// EndTrace is called.
ASSERT_OK(writer.WriteBlockAccess(record));
ASSERT_OK(writer.WriteBlockAccess(record, record.block_key, record.cf_name,
record.referenced_key));
ASSERT_OK(env_->FileExists(trace_file_path_));
}
{

Loading…
Cancel
Save