diff --git a/tools/block_cache_trace_analyzer.cc b/tools/block_cache_trace_analyzer.cc index 732094bf2..78753a216 100644 --- a/tools/block_cache_trace_analyzer.cc +++ b/tools/block_cache_trace_analyzer.cc @@ -11,7 +11,6 @@ #include #include #include -#include #include #include "monitoring/histogram.h" #include "util/gflags_compat.h" @@ -42,12 +41,70 @@ DEFINE_bool(print_data_block_access_count_stats, false, DEFINE_int32(cache_sim_warmup_seconds, 0, "The number of seconds to warmup simulated caches. The hit/miss " "counters are reset after the warmup completes."); -DEFINE_string(output_miss_ratio_curve_path, "", - "The output file to save the computed miss ratios. File format: " - "cache_name,num_shard_bits,capacity,miss_ratio,total_accesses"); +DEFINE_string( + block_cache_analysis_result_dir, "", + "The directory that saves block cache analysis results. It contains 1) a " + "mrc file that saves the computed miss ratios for simulated caches. Its " + "format is " + "cache_name,num_shard_bits,capacity,miss_ratio,total_accesses. 2) Several " + "\"label_access_timeline\" files that contain number of accesses per " + "second grouped by the label. File format: " + "time,label_1_access_per_second,label_2_access_per_second,...,label_N_" + "access_per_second where N is the number of unique labels found in the " + "trace. 3) Several \"label_reuse_distance\" and \"label_reuse_interval\" " + "csv files that contain the reuse distance/interval grouped by label. File " + "format: bucket,label_1,label_2,...,label_N. The first N buckets are " + "absolute values. The second N buckets are percentage values."); +DEFINE_string( + timeline_labels, "", + "Group the number of accesses per block per second using these labels. " + "Possible labels are a combination of the following: cf (column family), " + "sst, level, bt (block type), caller, block. For example, label \"cf_bt\" " + "means the number of acccess per second is grouped by unique pairs of " + "\"cf_bt\". 
A label \"all\" contains the aggregated number of accesses per "
+    "second across all possible labels.");
+DEFINE_string(reuse_distance_labels, "",
+              "Group the reuse distance of a block using these labels. Reuse "
+              "distance is defined as the cumulated size of unique blocks read "
+              "between two consecutive accesses on the same block.");
+DEFINE_string(
+    reuse_distance_buckets, "",
+    "Group blocks by their reuse distances given these buckets. For "
+    "example, if 'reuse_distance_buckets' is '1K,1M,1G', we will "
+    "create four buckets. The first three buckets contain the number of "
+    "blocks with reuse distance less than 1KB, between 1K and 1M, between 1M "
+    "and 1G, respectively. The last bucket contains the number of blocks with "
+    "reuse distance larger than 1G. ");
+DEFINE_string(
+    reuse_interval_labels, "",
+    "Group the reuse interval of a block using these labels. Reuse "
+    "interval is defined as the time between two consecutive accesses "
+    "on the same block.");
+DEFINE_string(
+    reuse_interval_buckets, "",
+    "Group blocks by their reuse interval given these buckets. For "
+    "example, if 'reuse_interval_buckets' is '1,10,100', we will "
+    "create four buckets. The first three buckets contain the number of "
+    "blocks with reuse interval less than 1 second, between 1 second and 10 "
+    "seconds, between 10 seconds and 100 seconds, respectively. 
The last " + "bucket contains the number of blocks with reuse interval longer than 100 " + "seconds."); namespace rocksdb { namespace { + +const std::string kMissRatioCurveFileName = "mrc"; +const std::string kGroupbyBlock = "block"; +const std::string kGroupbyColumnFamily = "cf"; +const std::string kGroupbySSTFile = "sst"; +const std::string kGroupbyBlockType = "bt"; +const std::string kGroupbyCaller = "caller"; +const std::string kGroupbyLevel = "level"; +const std::string kGroupbyAll = "all"; +const std::set kGroupbyLabels{ + kGroupbyBlock, kGroupbyColumnFamily, kGroupbySSTFile, kGroupbyLevel, + kGroupbyBlockType, kGroupbyCaller, kGroupbyAll}; + std::string block_type_to_string(TraceType type) { switch (type) { case kBlockTraceFilterBlock: @@ -146,8 +203,9 @@ void BlockCacheTraceSimulator::Access(const BlockCacheTraceRecord& access) { trace_start_time_ = access.access_timestamp; } // access.access_timestamp is in microseconds. - if (!warmup_complete_ && trace_start_time_ + warmup_seconds_ * 1000000 <= - access.access_timestamp) { + if (!warmup_complete_ && + trace_start_time_ + warmup_seconds_ * kMicrosInSecond <= + access.access_timestamp) { for (auto& sim_cache : sim_caches_) { sim_cache->reset_counter(); } @@ -162,14 +220,16 @@ void BlockCacheTraceSimulator::Access(const BlockCacheTraceRecord& access) { } } -void BlockCacheTraceAnalyzer::PrintMissRatioCurves() const { +void BlockCacheTraceAnalyzer::WriteMissRatioCurves() const { if (!cache_simulator_) { return; } - if (output_miss_ratio_curve_path_.empty()) { + if (output_dir_.empty()) { return; } - std::ofstream out(output_miss_ratio_curve_path_); + const std::string output_miss_ratio_curve_path = + output_dir_ + "/" + kMissRatioCurveFileName; + std::ofstream out(output_miss_ratio_curve_path); if (!out.is_open()) { return; } @@ -203,14 +263,345 @@ void BlockCacheTraceAnalyzer::PrintMissRatioCurves() const { out.close(); } +std::set BlockCacheTraceAnalyzer::ParseLabelStr( + const std::string& label_str) const 
{ + std::stringstream ss(label_str); + std::set labels; + // label_str is in the form of "label1_label2_label3", e.g., cf_bt. + while (ss.good()) { + std::string label_name; + getline(ss, label_name, '_'); + if (kGroupbyLabels.find(label_name) == kGroupbyLabels.end()) { + // Unknown label name. + fprintf(stderr, "Unknown label name %s, label string %s\n", + label_name.c_str(), label_str.c_str()); + return {}; + } + labels.insert(label_name); + } + return labels; +} + +std::string BlockCacheTraceAnalyzer::BuildLabel( + const std::set& labels, const std::string& cf_name, + uint64_t fd, uint32_t level, TraceType type, TableReaderCaller caller, + const std::string& block_key) const { + std::map label_value_map; + label_value_map[kGroupbyAll] = kGroupbyAll; + label_value_map[kGroupbyLevel] = std::to_string(level); + label_value_map[kGroupbyCaller] = caller_to_string(caller); + label_value_map[kGroupbySSTFile] = std::to_string(fd); + label_value_map[kGroupbyBlockType] = block_type_to_string(type); + label_value_map[kGroupbyColumnFamily] = cf_name; + label_value_map[kGroupbyBlock] = block_key; + // Concatenate the label values. + std::string label; + for (auto const& l : labels) { + label += label_value_map[l]; + label += "-"; + } + if (!label.empty()) { + label.pop_back(); + } + return label; +} + +void BlockCacheTraceAnalyzer::WriteAccessTimeline( + const std::string& label_str) const { + std::set labels = ParseLabelStr(label_str); + uint64_t start_time = port::kMaxUint64; + uint64_t end_time = 0; + std::map> label_access_timeline; + for (auto const& cf_aggregates : cf_aggregates_map_) { + // Stats per column family. + const std::string& cf_name = cf_aggregates.first; + for (auto const& file_aggregates : cf_aggregates.second.fd_aggregates_map) { + // Stats per SST file. 
+ const uint64_t fd = file_aggregates.first; + const uint32_t level = file_aggregates.second.level; + for (auto const& block_type_aggregates : + file_aggregates.second.block_type_aggregates_map) { + // Stats per block type. + const TraceType type = block_type_aggregates.first; + for (auto const& block_access_info : + block_type_aggregates.second.block_access_info_map) { + // Stats per block. + for (auto const& timeline : + block_access_info.second.caller_num_accesses_timeline) { + const TableReaderCaller caller = timeline.first; + const std::string& block_key = block_access_info.first; + const std::string label = + BuildLabel(labels, cf_name, fd, level, type, caller, block_key); + for (auto const& naccess : timeline.second) { + const uint64_t timestamp = naccess.first; + const uint64_t num = naccess.second; + label_access_timeline[label][timestamp] += num; + start_time = std::min(start_time, timestamp); + end_time = std::max(end_time, timestamp); + } + } + } + } + } + } + + // We have label_access_timeline now. Write them into a file. 
+ const std::string output_path = + output_dir_ + "/" + label_str + "_access_timeline"; + std::ofstream out(output_path); + if (!out.is_open()) { + return; + } + std::string header("time"); + for (auto const& label : label_access_timeline) { + header += ","; + header += label.first; + } + out << header << std::endl; + std::string row; + for (uint64_t now = start_time; now <= end_time; now++) { + row = std::to_string(now); + for (auto const& label : label_access_timeline) { + auto it = label.second.find(now); + row += ","; + if (it != label.second.end()) { + row += std::to_string(it->second); + } else { + row += "0"; + } + } + out << row << std::endl; + } + out.close(); +} + +void BlockCacheTraceAnalyzer::WriteReuseDistance( + const std::string& label_str, + const std::set& distance_buckets) const { + std::set labels = ParseLabelStr(label_str); + std::map> label_distance_num_reuses; + uint64_t total_num_reuses = 0; + for (auto const& cf_aggregates : cf_aggregates_map_) { + // Stats per column family. + const std::string& cf_name = cf_aggregates.first; + for (auto const& file_aggregates : cf_aggregates.second.fd_aggregates_map) { + // Stats per SST file. + const uint64_t fd = file_aggregates.first; + const uint32_t level = file_aggregates.second.level; + for (auto const& block_type_aggregates : + file_aggregates.second.block_type_aggregates_map) { + // Stats per block type. + const TraceType type = block_type_aggregates.first; + for (auto const& block_access_info : + block_type_aggregates.second.block_access_info_map) { + // Stats per block. + const std::string& block_key = block_access_info.first; + const std::string label = BuildLabel( + labels, cf_name, fd, level, type, + TableReaderCaller::kMaxBlockCacheLookupCaller, block_key); + if (label_distance_num_reuses.find(label) == + label_distance_num_reuses.end()) { + // The first time we encounter this label. 
+            for (auto const& distance_bucket : distance_buckets) {
+              label_distance_num_reuses[label][distance_bucket] = 0;
+            }
+          }
+          for (auto const& reuse_distance :
+               block_access_info.second.reuse_distance_count) {
+            label_distance_num_reuses[label]
+                .upper_bound(reuse_distance.first)
+                ->second += reuse_distance.second;
+            total_num_reuses += reuse_distance.second;
+          }
+        }
+      }
+    }
+  }
+
+  // We have label_naccesses and label_distance_num_reuses now. Write them into
+  // a file.
+  const std::string output_path =
+      output_dir_ + "/" + label_str + "_reuse_distance";
+  std::ofstream out(output_path);
+  if (!out.is_open()) {
+    return;
+  }
+  std::string header("bucket");
+  for (auto const& label_it : label_distance_num_reuses) {
+    header += ",";
+    header += label_it.first;
+  }
+  out << header << std::endl;
+  // Absolute values.
+  for (auto const& bucket : distance_buckets) {
+    std::string row(std::to_string(bucket));
+    for (auto const& label_it : label_distance_num_reuses) {
+      auto const& it = label_it.second.find(bucket);
+      assert(it != label_it.second.end());
+      row += ",";
+      row += std::to_string(it->second);
+    }
+    out << row << std::endl;
+  }
+  // Percentage values.
+  for (auto const& bucket : distance_buckets) {
+    std::string row(std::to_string(bucket));
+    for (auto const& label_it : label_distance_num_reuses) {
+      auto const& it = label_it.second.find(bucket);
+      assert(it != label_it.second.end());
+      row += ",";
+      row += std::to_string(percent(it->second, total_num_reuses));
+    }
+    out << row << std::endl;
+  }
+  out.close();
+}
+
+// Folds the reuse intervals observed in 'timeline' into the per-bucket
+// counters of 'label' and into '*total_num_reuses'.
+//
+// 'timeline' maps a timestamp to the number of accesses recorded at that
+// timestamp (the callers in this file pass second-granularity timelines).
+// Every access after the first one at the same timestamp is a reuse with an
+// interval of zero; the first access at a later timestamp is a reuse whose
+// interval is the distance to the previous timestamp. 'time_buckets' holds
+// the bucket boundaries: a reuse is counted in the first bucket that is
+// strictly greater than its interval, so the largest bucket must exceed every
+// possible interval.
+void BlockCacheTraceAnalyzer::UpdateReuseIntervalStats(
+    const std::string& label, const std::set<uint64_t>& time_buckets,
+    const std::map<uint64_t, uint64_t> timeline,
+    std::map<std::string, std::map<uint64_t, uint64_t>>* label_time_num_reuses,
+    uint64_t* total_num_reuses) const {
+  assert(label_time_num_reuses);
+  assert(total_num_reuses);
+  if (label_time_num_reuses->find(label) == label_time_num_reuses->end()) {
+    // The first time we encounter this label.
+    for (auto const& time_bucket : time_buckets) {
+      (*label_time_num_reuses)[label][time_bucket] = 0;
+    }
+  }
+  if (timeline.empty()) {
+    // No access was recorded, so there is nothing to reuse. Dereferencing
+    // begin() below would be undefined behavior.
+    return;
+  }
+  std::map<uint64_t, uint64_t>& bucket_num_reuses =
+      (*label_time_num_reuses)[label];
+  auto it = timeline.begin();
+  uint64_t prev_timestamp = it->first;
+  // Reused within one second: the interval is zero, which belongs to the
+  // first bucket greater than zero (not the first bucket greater than one).
+  if (it->second > 1) {
+    bucket_num_reuses.upper_bound(0)->second += it->second - 1;
+    *total_num_reuses += it->second - 1;
+  }
+  for (++it; it != timeline.end(); ++it) {
+    const uint64_t timestamp = it->first;
+    const uint64_t num = it->second;
+    // Only the first access at this timestamp is 'timestamp - prev_timestamp'
+    // away from the previous access.
+    bucket_num_reuses.upper_bound(timestamp - prev_timestamp)->second += 1;
+    if (num > 1) {
+      // The remaining accesses share the timestamp of the access before them.
+      bucket_num_reuses.upper_bound(0)->second += num - 1;
+    }
+    *total_num_reuses += num;
+    // Intervals are measured between consecutive timestamps, so move the
+    // reference point forward before visiting the next entry.
+    prev_timestamp = timestamp;
+  }
+}
+
+void BlockCacheTraceAnalyzer::WriteReuseInterval(
+    const std::string& label_str,
+    const std::set<uint64_t>& time_buckets) const {
+  std::set<std::string> labels = ParseLabelStr(label_str);
+  std::map<std::string, std::map<uint64_t, uint64_t>> label_time_num_reuses;
+  uint64_t total_num_reuses = 0;
+  for (auto const& cf_aggregates : cf_aggregates_map_) {
+    // Stats per column family.
+    const std::string& cf_name = cf_aggregates.first;
+    for (auto const& file_aggregates : cf_aggregates.second.fd_aggregates_map) {
+      // Stats per SST file.
+      const uint64_t fd = file_aggregates.first;
+      const uint32_t level = file_aggregates.second.level;
+      for (auto const& block_type_aggregates :
+           file_aggregates.second.block_type_aggregates_map) {
+        // Stats per block type.
+        const TraceType type = block_type_aggregates.first;
+        for (auto const& block_access_info :
+             block_type_aggregates.second.block_access_info_map) {
+          // Stats per block.
+ const std::string& block_key = block_access_info.first; + if (labels.find(kGroupbyCaller) != labels.end()) { + for (auto const& timeline : + block_access_info.second.caller_num_accesses_timeline) { + const TableReaderCaller caller = timeline.first; + const std::string label = BuildLabel(labels, cf_name, fd, level, + type, caller, block_key); + UpdateReuseIntervalStats(label, time_buckets, timeline.second, + &label_time_num_reuses, + &total_num_reuses); + } + continue; + } + // Does not group by caller so we need to flatten the access timeline. + const std::string label = BuildLabel( + labels, cf_name, fd, level, type, + TableReaderCaller::kMaxBlockCacheLookupCaller, block_key); + std::map timeline; + for (auto const& caller_timeline : + block_access_info.second.caller_num_accesses_timeline) { + for (auto const& time_naccess : caller_timeline.second) { + timeline[time_naccess.first] += time_naccess.second; + } + } + UpdateReuseIntervalStats(label, time_buckets, timeline, + &label_time_num_reuses, &total_num_reuses); + } + } + } + } + + // We have label_naccesses and label_interval_num_reuses now. Write them into + // a file. + const std::string output_path = + output_dir_ + "/" + label_str + "_reuse_interval"; + std::ofstream out(output_path); + if (!out.is_open()) { + return; + } + std::string header("bucket"); + for (auto const& label_it : label_time_num_reuses) { + header += ","; + header += label_it.first; + } + out << header << std::endl; + // Absolute values. + for (auto const& bucket : time_buckets) { + std::string row(std::to_string(bucket)); + for (auto const& label_it : label_time_num_reuses) { + auto const& it = label_it.second.find(bucket); + assert(it != label_it.second.end()); + row += ","; + row += std::to_string(it->second); + } + out << row << std::endl; + } + // Percentage values. 
+ for (auto const& bucket : time_buckets) { + std::string row(std::to_string(bucket)); + for (auto const& label_it : label_time_num_reuses) { + auto const& it = label_it.second.find(bucket); + assert(it != label_it.second.end()); + row += ","; + row += std::to_string(percent(it->second, total_num_reuses)); + } + out << row << std::endl; + } + out.close(); +} + BlockCacheTraceAnalyzer::BlockCacheTraceAnalyzer( - const std::string& trace_file_path, - const std::string& output_miss_ratio_curve_path, + const std::string& trace_file_path, const std::string& output_dir, std::unique_ptr&& cache_simulator) - : trace_file_path_(trace_file_path), - output_miss_ratio_curve_path_(output_miss_ratio_curve_path), - cache_simulator_(std::move(cache_simulator)) { - env_ = rocksdb::Env::Default(); + : env_(rocksdb::Env::Default()), + trace_file_path_(trace_file_path), + output_dir_(output_dir), + cache_simulator_(std::move(cache_simulator)) {} + +void BlockCacheTraceAnalyzer::ComputeReuseDistance( + BlockAccessInfo* info) const { + assert(info); + if (info->num_accesses == 0) { + return; + } + uint64_t reuse_distance = 0; + for (auto const& block_key : info->unique_blocks_since_last_access) { + auto const& it = block_info_map_.find(block_key); + // This block must exist. + assert(it != block_info_map_.end()); + reuse_distance += it->second->block_size; + } + info->reuse_distance_count[reuse_distance] += 1; + // We clear this hash set since this is the second access on this block. + info->unique_blocks_since_last_access.clear(); } void BlockCacheTraceAnalyzer::RecordAccess( @@ -223,7 +614,23 @@ void BlockCacheTraceAnalyzer::RecordAccess( file_aggr.block_type_aggregates_map[access.block_type]; BlockAccessInfo& block_access_info = block_type_aggr.block_access_info_map[access.block_key]; + ComputeReuseDistance(&block_access_info); block_access_info.AddAccess(access); + block_info_map_[access.block_key] = &block_access_info; + + // Add this block to all existing blocks. 
+ for (auto& cf_aggregates : cf_aggregates_map_) { + for (auto& file_aggregates : cf_aggregates.second.fd_aggregates_map) { + for (auto& block_type_aggregates : + file_aggregates.second.block_type_aggregates_map) { + for (auto& existing_block : + block_type_aggregates.second.block_access_info_map) { + existing_block.second.unique_blocks_since_last_access.insert( + access.block_key); + } + } + } + } } Status BlockCacheTraceAnalyzer::Analyze() { @@ -659,6 +1066,18 @@ std::vector parse_cache_config_file( return configs; } +std::set parse_buckets(const std::string& bucket_str) { + std::set buckets; + std::stringstream ss(bucket_str); + while (ss.good()) { + std::string bucket; + getline(ss, bucket, ','); + buckets.insert(ParseUint64(bucket)); + } + buckets.insert(port::kMaxUint64); + return buckets; +} + int block_cache_trace_analyzer_tool(int argc, char** argv) { ParseCommandLineFlags(&argc, &argv, true); if (FLAGS_block_cache_trace_path.empty()) { @@ -678,7 +1097,7 @@ int block_cache_trace_analyzer_tool(int argc, char** argv) { warmup_seconds, downsample_ratio, cache_configs)); } BlockCacheTraceAnalyzer analyzer(FLAGS_block_cache_trace_path, - FLAGS_output_miss_ratio_curve_path, + FLAGS_block_cache_analysis_result_dir, std::move(cache_simulator)); Status s = analyzer.Analyze(); if (!s.IsIncomplete()) { @@ -701,7 +1120,38 @@ int block_cache_trace_analyzer_tool(int argc, char** argv) { analyzer.PrintDataBlockAccessStats(); } print_break_lines(/*num_break_lines=*/3); - analyzer.PrintMissRatioCurves(); + analyzer.WriteMissRatioCurves(); + + if (!FLAGS_timeline_labels.empty()) { + std::stringstream ss(FLAGS_timeline_labels); + while (ss.good()) { + std::string label; + getline(ss, label, ','); + analyzer.WriteAccessTimeline(label); + } + } + + if (!FLAGS_reuse_distance_labels.empty() && + !FLAGS_reuse_distance_buckets.empty()) { + std::set buckets = parse_buckets(FLAGS_reuse_distance_buckets); + std::stringstream ss(FLAGS_reuse_distance_labels); + while (ss.good()) { + 
std::string label; + getline(ss, label, ','); + analyzer.WriteReuseDistance(label, buckets); + } + } + + if (!FLAGS_reuse_interval_labels.empty() && + !FLAGS_reuse_interval_buckets.empty()) { + std::set buckets = parse_buckets(FLAGS_reuse_interval_buckets); + std::stringstream ss(FLAGS_reuse_interval_labels); + while (ss.good()) { + std::string label; + getline(ss, label, ','); + analyzer.WriteReuseInterval(label, buckets); + } + } return 0; } diff --git a/tools/block_cache_trace_analyzer.h b/tools/block_cache_trace_analyzer.h index c953ecf21..21a99f7db 100644 --- a/tools/block_cache_trace_analyzer.h +++ b/tools/block_cache_trace_analyzer.h @@ -6,6 +6,7 @@ #pragma once #include +#include #include #include "rocksdb/env.h" @@ -14,6 +15,8 @@ namespace rocksdb { +const uint64_t kMicrosInSecond = 1000000; + class BlockCacheTraceAnalyzer; // A cache configuration provided by user. @@ -73,6 +76,14 @@ struct BlockAccessInfo { non_exist_key_num_access_map; // for keys do not exist in this block. uint64_t num_referenced_key_exist_in_block = 0; std::map caller_num_access_map; + // caller:timestamp:number_of_accesses. The granularity of the timestamp is + // seconds. + std::map> + caller_num_accesses_timeline; + // Unique blocks since the last access. + std::set unique_blocks_since_last_access; + // Number of reuses grouped by reuse distance. + std::map reuse_distance_count; void AddAccess(const BlockCacheTraceRecord& access) { if (first_access_time == 0) { @@ -82,10 +93,13 @@ struct BlockAccessInfo { block_size = access.block_size; caller_num_access_map[access.caller]++; num_accesses++; + // access.access_timestamp is in microsecond. 
+ const uint64_t timestamp_in_seconds = + access.access_timestamp / kMicrosInSecond; + caller_num_accesses_timeline[access.caller][timestamp_in_seconds] += 1; if (BlockCacheTraceHelper::ShouldTraceReferencedKey(access.block_type, access.caller)) { num_keys = access.num_keys_in_block; - if (access.referenced_key_exist_in_block == Boolean::kTrue) { key_num_access_map[access.referenced_key]++; num_referenced_key_exist_in_block++; @@ -115,8 +129,7 @@ struct ColumnFamilyAccessInfoAggregate { class BlockCacheTraceAnalyzer { public: BlockCacheTraceAnalyzer( - const std::string& trace_file_path, - const std::string& output_miss_ratio_curve_path, + const std::string& trace_file_path, const std::string& output_dir, std::unique_ptr&& cache_simulator); ~BlockCacheTraceAnalyzer() = default; // No copy and move. @@ -165,7 +178,24 @@ class BlockCacheTraceAnalyzer { // accesses on keys exist in a data block and its break down by column family. void PrintDataBlockAccessStats() const; - void PrintMissRatioCurves() const; + // Write miss ratio curves of simulated cache configurations into a csv file + // saved in 'output_dir'. + void WriteMissRatioCurves() const; + + // Write the access timeline into a csv file saved in 'output_dir'. + void WriteAccessTimeline(const std::string& label) const; + + // Write the reuse distance into a csv file saved in 'output_dir'. Reuse + // distance is defined as the cumulated size of unique blocks read between two + // consecutive accesses on the same block. + void WriteReuseDistance(const std::string& label_str, + const std::set& distance_buckets) const; + + // Write the reuse interval into a csv file saved in 'output_dir'. Reuse + // interval is defined as the time between two consecutive accesses on the + // same block.
+ void WriteReuseInterval(const std::string& label_str, + const std::set& time_buckets) const; const std::map& TEST_cf_aggregates_map() const { @@ -173,15 +203,33 @@ class BlockCacheTraceAnalyzer { } private: + std::set ParseLabelStr(const std::string& label_str) const; + + std::string BuildLabel(const std::set& labels, + const std::string& cf_name, uint64_t fd, + uint32_t level, TraceType type, + TableReaderCaller caller, + const std::string& block_key) const; + + void ComputeReuseDistance(BlockAccessInfo* info) const; + void RecordAccess(const BlockCacheTraceRecord& access); + void UpdateReuseIntervalStats( + const std::string& label, const std::set& time_buckets, + const std::map timeline, + std::map>* + label_time_num_reuses, + uint64_t* total_num_reuses) const; + rocksdb::Env* env_; const std::string trace_file_path_; - const std::string output_miss_ratio_curve_path_; + const std::string output_dir_; BlockCacheTraceHeader header_; std::unique_ptr cache_simulator_; std::map cf_aggregates_map_; + std::map block_info_map_; }; int block_cache_trace_analyzer_tool(int argc, char** argv); diff --git a/tools/block_cache_trace_analyzer_test.cc b/tools/block_cache_trace_analyzer_test.cc index c361ba054..80734565a 100644 --- a/tools/block_cache_trace_analyzer_test.cc +++ b/tools/block_cache_trace_analyzer_test.cc @@ -49,7 +49,13 @@ class BlockCacheTracerTest : public testing::Test { EXPECT_OK(env_->CreateDir(test_path_)); trace_file_path_ = test_path_ + "/block_cache_trace"; block_cache_sim_config_path_ = test_path_ + "/block_cache_sim_config"; - output_miss_ratio_curve_path_ = test_path_ + "/out_miss_ratio_curve"; + timeline_labels_ = + "block,all,cf,sst,level,bt,caller,cf_sst,cf_level,cf_bt,cf_caller"; + reuse_distance_labels_ = + "block,all,cf,sst,level,bt,caller,cf_sst,cf_level,cf_bt,cf_caller"; + reuse_distance_buckets_ = "1,1K,1M,1G"; + reuse_interval_labels_ = "block,all,cf,sst,level,bt,cf_sst,cf_level,cf_bt"; + reuse_interval_buckets_ = "1,10,100,1000"; } 
~BlockCacheTracerTest() override { @@ -85,11 +91,12 @@ class BlockCacheTracerTest : public testing::Test { assert(writer); for (uint32_t i = 0; i < nblocks; i++) { uint32_t key_id = from_key_id + i; + uint32_t timestamp = (key_id + 1) * kMicrosInSecond; BlockCacheTraceRecord record; record.block_type = block_type; record.block_size = kBlockSize + key_id; record.block_key = kBlockKeyPrefix + std::to_string(key_id); - record.access_timestamp = env_->NowMicros(); + record.access_timestamp = timestamp; record.cf_id = kCFId; record.cf_name = kDefaultColumnFamilyName; record.caller = GetCaller(key_id); @@ -146,11 +153,17 @@ class BlockCacheTracerTest : public testing::Test { "./block_cache_trace_analyzer", "-block_cache_trace_path=" + trace_file_path_, "-block_cache_sim_config_path=" + block_cache_sim_config_path_, - "-output_miss_ratio_curve_path=" + output_miss_ratio_curve_path_, + "-block_cache_analysis_result_dir=" + test_path_, "-print_block_size_stats", "-print_access_count_stats", "-print_data_block_access_count_stats", - "-cache_sim_warmup_seconds=0"}; + "-cache_sim_warmup_seconds=0", + "-timeline_labels=" + timeline_labels_, + "-reuse_distance_labels=" + reuse_distance_labels_, + "-reuse_distance_buckets=" + reuse_distance_buckets_, + "-reuse_interval_labels=" + reuse_interval_labels_, + "-reuse_interval_buckets=" + reuse_interval_buckets_, + }; char arg_buffer[kArgBufferSize]; char* argv[kMaxArgCount]; int argc = 0; @@ -168,10 +181,14 @@ class BlockCacheTracerTest : public testing::Test { Env* env_; EnvOptions env_options_; - std::string output_miss_ratio_curve_path_; std::string block_cache_sim_config_path_; std::string trace_file_path_; std::string test_path_; + std::string timeline_labels_; + std::string reuse_distance_labels_; + std::string reuse_distance_buckets_; + std::string reuse_interval_labels_; + std::string reuse_interval_buckets_; }; TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) { @@ -199,7 +216,8 @@ TEST_F(BlockCacheTracerTest, 
BlockCacheAnalyzer) { // Validate the cache miss ratios. const std::vector expected_capacities{1024, 1024 * 1024, 1024 * 1024 * 1024}; - std::ifstream infile(output_miss_ratio_curve_path_); + const std::string mrc_path = test_path_ + "/mrc"; + std::ifstream infile(mrc_path); uint32_t config_index = 0; std::string line; // Read header. @@ -224,8 +242,91 @@ TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) { } ASSERT_EQ(expected_capacities.size(), config_index); infile.close(); + ASSERT_OK(env_->DeleteFile(mrc_path)); + } + { + // Validate the timeline csv files. + const uint32_t expected_num_lines = 50; + std::stringstream ss(timeline_labels_); + while (ss.good()) { + std::string l; + ASSERT_TRUE(getline(ss, l, ',')); + const std::string timeline_file = + test_path_ + "/" + l + "_access_timeline"; + std::ifstream infile(timeline_file); + std::string line; + uint32_t nlines = 0; + ASSERT_TRUE(getline(infile, line)); + uint64_t expected_time = 1; + while (getline(infile, line)) { + std::stringstream ss_naccess(line); + uint32_t naccesses = 0; + std::string substr; + uint32_t time = 0; + while (ss_naccess.good()) { + ASSERT_TRUE(getline(ss_naccess, substr, ',')); + if (time == 0) { + time = ParseUint32(substr); + continue; + } + naccesses += ParseUint32(substr); + } + nlines++; + ASSERT_EQ(1, naccesses); + ASSERT_EQ(expected_time, time); + expected_time += 1; + } + ASSERT_EQ(expected_num_lines, nlines); + ASSERT_OK(env_->DeleteFile(timeline_file)); + } + } + { + // Validate the reuse_interval and reuse_distance csv files. 
+    // Each reuse csv file holds a header row followed by one row per bucket
+    // with absolute values and then one row per bucket with percentages.
+    std::map<std::string, std::string> test_reuse_csv_files;
+    test_reuse_csv_files["_reuse_interval"] = reuse_interval_labels_;
+    test_reuse_csv_files["_reuse_distance"] = reuse_distance_labels_;
+    for (auto const& test : test_reuse_csv_files) {
+      const std::string& file_suffix = test.first;
+      const std::string& labels = test.second;
+      const uint32_t expected_num_rows = 10;
+      const uint32_t expected_num_rows_absolute_values = 5;
+      const uint32_t expected_reused_blocks = 0;
+      std::stringstream ss(labels);
+      while (ss.good()) {
+        std::string l;
+        ASSERT_TRUE(getline(ss, l, ','));
+        const std::string reuse_csv_file = test_path_ + "/" + l + file_suffix;
+        std::ifstream infile(reuse_csv_file);
+        std::string line;
+        // Skip the header row.
+        ASSERT_TRUE(getline(infile, line));
+        uint32_t nblocks = 0;
+        double npercentage = 0;
+        uint32_t nrows = 0;
+        while (getline(infile, line)) {
+          std::stringstream ss_naccess(line);
+          bool label_read = false;
+          nrows++;
+          while (ss_naccess.good()) {
+            std::string substr;
+            ASSERT_TRUE(getline(ss_naccess, substr, ','));
+            if (!label_read) {
+              // The first column is the bucket, not a value.
+              label_read = true;
+              continue;
+            }
+            // 'nrows' was already incremented for this row, so it is 1-based:
+            // rows 1..expected_num_rows_absolute_values are absolute values.
+            if (nrows <= expected_num_rows_absolute_values) {
+              nblocks += ParseUint32(substr);
+            } else {
+              npercentage += ParseDouble(substr);
+            }
+          }
+        }
+        ASSERT_EQ(expected_num_rows, nrows);
+        ASSERT_EQ(expected_reused_blocks, nblocks);
+        // NOTE(review): presumably percent() reports a negative value when
+        // there are no reuses at all - confirm against its definition.
+        ASSERT_LT(npercentage, 0);
+        ASSERT_OK(env_->DeleteFile(reuse_csv_file));
+      }
+    }
   }
-  ASSERT_OK(env_->DeleteFile(output_miss_ratio_curve_path_));
   ASSERT_OK(env_->DeleteFile(block_cache_sim_config_path_));
 }