From b632ed0c67015f5df6dd9cc5173b38b58d162ceb Mon Sep 17 00:00:00 2001 From: Zhichao Cao Date: Thu, 7 Oct 2021 14:57:02 -0700 Subject: [PATCH] Add file temperature related counter and bytes stats to and io_stats (#8710) Summary: For tiered storage project, we need to know the block read count and read bytes of files with different temperature. Add FileIOByTemperature to IOStatsContext and collect the bytes read and read count from different temperature files through the RandomAccessFileReader. Pull Request resolved: https://github.com/facebook/rocksdb/pull/8710 Test Plan: make check, add the testing cases Reviewed By: siying Differential Revision: D30582400 Pulled By: zhichao-cao fbshipit-source-id: d83173de594374fc8404af5ce93a6a9be72c7141 --- db/db_test2.cc | 37 +++++++++++++++++++++++++ db/table_cache.cc | 37 ++++++++++++------------- db/table_cache.h | 6 +++-- db/version_builder.cc | 3 ++- file/random_access_file_reader.cc | 45 +++++++++++++++++++++++++++++++ file/random_access_file_reader.h | 7 +++-- include/rocksdb/iostats_context.h | 28 +++++++++++++++++++ monitoring/iostats_context.cc | 8 +++++- monitoring/perf_context_imp.h | 27 +++++++++---------- 9 files changed, 160 insertions(+), 38 deletions(-) diff --git a/db/db_test2.cc b/db/db_test2.cc index 7d10ec673..d2addb4eb 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -17,6 +17,7 @@ #include "options/options_helper.h" #include "port/port.h" #include "port/stack_trace.h" +#include "rocksdb/iostats_context.h" #include "rocksdb/persistent_cache.h" #include "rocksdb/trace_record.h" #include "rocksdb/trace_record_result.h" @@ -6522,6 +6523,9 @@ TEST_F(DBTest2, BottommostTemperature) { ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); + get_iostats_context()->Reset(); + IOStatsContext* iostats = get_iostats_context(); + ColumnFamilyMetaData metadata; db_->GetColumnFamilyMetaData(&metadata); ASSERT_EQ(1, metadata.file_count); @@ -6530,11 +6534,31 @@ TEST_F(DBTest2, BottommostTemperature) { ASSERT_EQ(size, 0); size = GetSstSizeHelper(Temperature::kWarm); ASSERT_GT(size, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + + ASSERT_EQ("bar", Get("foo")); + + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 1); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0); + ASSERT_GT(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0); // non-bottommost file still has unknown temperature ASSERT_OK(Put("foo", "bar")); ASSERT_OK(Put("bar", "bar")); ASSERT_OK(Flush()); + ASSERT_EQ("bar", Get("bar")); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 1); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0); + ASSERT_GT(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0); + db_->GetColumnFamilyMetaData(&metadata); ASSERT_EQ(2, metadata.file_count); ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature); @@ -6583,6 +6607,8 @@ TEST_F(DBTest2, BottommostTemperatureUniversal) { ASSERT_EQ(size, 0); size = GetSstSizeHelper(Temperature::kHot); ASSERT_EQ(size, 0); + get_iostats_context()->Reset(); + IOStatsContext* iostats = get_iostats_context(); for (int i = 0; i < kTriggerNum; i++) { ASSERT_OK(Put("foo", "bar")); @@ -6600,6 +6626,17 @@ TEST_F(DBTest2, BottommostTemperatureUniversal) { ASSERT_GT(size, 0); size = GetSstSizeHelper(Temperature::kWarm); ASSERT_EQ(size, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + ASSERT_EQ("bar", Get("foo")); + + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0); ASSERT_OK(Put("foo", "bar")); ASSERT_OK(Put("bar", "bar")); diff --git a/db/table_cache.cc b/db/table_cache.cc index 3acd80627..fe549329a 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -103,7 +103,7 @@ Status TableCache::GetTableReader( std::unique_ptr* table_reader, const SliceTransform* prefix_extractor, bool skip_filters, int level, bool prefetch_index_and_filter_in_cache, - size_t max_file_size_for_l0_meta_pin) { + size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) { std::string fname = TableFileName(ioptions_.cf_paths, fd.GetNumber(), fd.GetPathId()); std::unique_ptr file; @@ -132,7 +132,8 @@ Status TableCache::GetTableReader( new RandomAccessFileReader( std::move(file), fname, ioptions_.clock, io_tracer_, record_read_stats ? ioptions_.stats : nullptr, SST_READ_MICROS, - file_read_hist, ioptions_.rate_limiter.get(), ioptions_.listeners)); + file_read_hist, ioptions_.rate_limiter.get(), ioptions_.listeners, + file_temperature)); s = ioptions_.table_factory->NewTableReader( ro, TableReaderOptions( @@ -154,15 +155,13 @@ void TableCache::EraseHandle(const FileDescriptor& fd, Cache::Handle* handle) { cache_->Erase(key); } -Status TableCache::FindTable(const ReadOptions& ro, - const FileOptions& file_options, - const InternalKeyComparator& internal_comparator, - const FileDescriptor& fd, Cache::Handle** handle, - const SliceTransform* prefix_extractor, - const bool no_io, bool record_read_stats, - HistogramImpl* file_read_hist, bool skip_filters, - int level, bool prefetch_index_and_filter_in_cache, - size_t max_file_size_for_l0_meta_pin) { +Status TableCache::FindTable( + const ReadOptions& ro, const FileOptions& file_options, + const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, + Cache::Handle** handle, const SliceTransform* prefix_extractor, + const bool no_io, bool record_read_stats, HistogramImpl* file_read_hist, + bool skip_filters, int level, bool prefetch_index_and_filter_in_cache, + size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) { PERF_TIMER_GUARD_WITH_CLOCK(find_table_nanos, ioptions_.clock); uint64_t number = fd.GetNumber(); Slice key = GetSliceForFileNumber(&number); @@ -186,7 +185,7 @@ Status TableCache::FindTable(const ReadOptions& ro, ro, file_options, internal_comparator, fd, false /* sequential mode */, record_read_stats, file_read_hist, &table_reader, prefix_extractor, skip_filters, level, prefetch_index_and_filter_in_cache, - max_file_size_for_l0_meta_pin); + max_file_size_for_l0_meta_pin, file_temperature); if (!s.ok()) { assert(table_reader == nullptr); RecordTick(ioptions_.stats, NO_FILE_ERRORS); @@ -231,7 +230,7 @@ InternalIterator* TableCache::NewIterator( options.read_tier == kBlockCacheTier /* no_io */, !for_compaction /* record_read_stats */, file_read_hist, skip_filters, level, true /* prefetch_index_and_filter_in_cache */, - max_file_size_for_l0_meta_pin); + max_file_size_for_l0_meta_pin, file_meta.temperature); if (s.ok()) { table_reader = GetTableReaderFromHandle(handle); } @@ -431,7 +430,7 @@ Status TableCache::Get(const ReadOptions& options, options.read_tier == kBlockCacheTier /* no_io */, true /* record_read_stats */, file_read_hist, skip_filters, level, true /* prefetch_index_and_filter_in_cache */, - max_file_size_for_l0_meta_pin); + max_file_size_for_l0_meta_pin, file_meta.temperature); if (s.ok()) { t = GetTableReaderFromHandle(handle); } @@ -531,10 +530,12 @@ Status TableCache::MultiGet(const ReadOptions& options, // found in the row cache and thus the range may now be empty if (s.ok() && !table_range.empty()) { if (t == nullptr) { - s = FindTable( - options, file_options_, internal_comparator, fd, &handle, - prefix_extractor, options.read_tier == kBlockCacheTier /* no_io */, - true /* record_read_stats */, file_read_hist, skip_filters, level); + s = FindTable(options, file_options_, internal_comparator, fd, &handle, + prefix_extractor, + options.read_tier == kBlockCacheTier /* no_io */, + true /* record_read_stats */, file_read_hist, skip_filters, + level, true /* prefetch_index_and_filter_in_cache */, + 0 /*max_file_size_for_l0_meta_pin*/, file_meta.temperature); TEST_SYNC_POINT_CALLBACK("TableCache::MultiGet:FindTable", &s); if (s.ok()) { t = GetTableReaderFromHandle(handle); diff --git a/db/table_cache.h b/db/table_cache.h index 2138eb4a2..e8144dcd0 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -140,7 +140,8 @@ class TableCache { HistogramImpl* file_read_hist = nullptr, bool skip_filters = false, int level = -1, bool prefetch_index_and_filter_in_cache = true, - size_t max_file_size_for_l0_meta_pin = 0); + size_t max_file_size_for_l0_meta_pin = 0, + Temperature file_temperature = Temperature::kUnknown); // Get TableReader from a cache handle. TableReader* GetTableReaderFromHandle(Cache::Handle* handle); @@ -206,7 +207,8 @@ class TableCache { const SliceTransform* prefix_extractor = nullptr, bool skip_filters = false, int level = -1, bool prefetch_index_and_filter_in_cache = true, - size_t max_file_size_for_l0_meta_pin = 0); + size_t max_file_size_for_l0_meta_pin = 0, + Temperature file_temperature = Temperature::kUnknown); // Create a key prefix for looking up the row cache. The prefix is of the // format row_cache_id + fd_number + seq_no. Later, the user key can be diff --git a/db/version_builder.cc b/db/version_builder.cc index a25f444fc..c90187a18 100644 --- a/db/version_builder.cc +++ b/db/version_builder.cc @@ -1017,7 +1017,8 @@ class VersionBuilder::Rep { &file_meta->table_reader_handle, prefix_extractor, false /*no_io */, true /* record_read_stats */, internal_stats->GetFileReadHist(level), false, level, - prefetch_index_and_filter_in_cache, max_file_size_for_l0_meta_pin); + prefetch_index_and_filter_in_cache, max_file_size_for_l0_meta_pin, + file_meta->temperature); if (file_meta->table_reader_handle != nullptr) { // Load table_reader file_meta->fd.table_reader = table_cache_->GetTableReaderFromHandle( diff --git a/file/random_access_file_reader.cc b/file/random_access_file_reader.cc index 6378155e3..4ffcce25c 100644 --- a/file/random_access_file_reader.cc +++ b/file/random_access_file_reader.cc @@ -22,6 +22,46 @@ #include "util/rate_limiter.h" namespace ROCKSDB_NAMESPACE { +inline void IOStatsAddBytesByTemperature(Temperature file_temperature, + size_t value) { + if (file_temperature == Temperature::kUnknown) { + return; + } + switch (file_temperature) { + case Temperature::kHot: + IOSTATS_ADD(file_io_stats_by_temperature.hot_file_bytes_read, value); + break; + case Temperature::kWarm: + IOSTATS_ADD(file_io_stats_by_temperature.warm_file_bytes_read, value); + break; + case Temperature::kCold: + IOSTATS_ADD(file_io_stats_by_temperature.cold_file_bytes_read, value); + break; + default: + break; + } +} + +inline void IOStatsAddCountByTemperature(Temperature file_temperature, + size_t value) { + if (file_temperature == Temperature::kUnknown) { + return; + } + switch (file_temperature) { + case Temperature::kHot: + IOSTATS_ADD(file_io_stats_by_temperature.hot_file_read_count, value); + break; + case Temperature::kWarm: + IOSTATS_ADD(file_io_stats_by_temperature.warm_file_read_count, value); + break; + case Temperature::kCold: + IOSTATS_ADD(file_io_stats_by_temperature.cold_file_read_count, value); + break; + default: + break; + } +} + IOStatus RandomAccessFileReader::Create( const std::shared_ptr& fs, const std::string& fname, const FileOptions& file_opts, @@ -182,6 +222,8 @@ IOStatus RandomAccessFileReader::Read(const IOOptions& opts, uint64_t offset, *result = Slice(res_scratch, io_s.ok() ? pos : 0); } IOSTATS_ADD(bytes_read, result->size()); + IOStatsAddBytesByTemperature(file_temperature_, result->size()); + IOStatsAddCountByTemperature(file_temperature_, 1); SetPerfLevel(prev_perf_level); } if (stats_ != nullptr && file_read_hist_ != nullptr) { @@ -347,6 +389,9 @@ IOStatus RandomAccessFileReader::MultiRead(const IOOptions& opts, } #endif // ROCKSDB_LITE IOSTATS_ADD(bytes_read, read_reqs[i].result.size()); + IOStatsAddBytesByTemperature(file_temperature_, + read_reqs[i].result.size()); + IOStatsAddCountByTemperature(file_temperature_, 1); } SetPerfLevel(prev_perf_level); } diff --git a/file/random_access_file_reader.h b/file/random_access_file_reader.h index 181f4dd02..7706ca4af 100644 --- a/file/random_access_file_reader.h +++ b/file/random_access_file_reader.h @@ -73,6 +73,7 @@ class RandomAccessFileReader { HistogramImpl* file_read_hist_; RateLimiter* rate_limiter_; std::vector> listeners_; + Temperature file_temperature_; public: explicit RandomAccessFileReader( @@ -82,7 +83,8 @@ class RandomAccessFileReader { Statistics* stats = nullptr, uint32_t hist_type = 0, HistogramImpl* file_read_hist = nullptr, RateLimiter* rate_limiter = nullptr, - const std::vector>& listeners = {}) + const std::vector>& listeners = {}, + Temperature file_temperature = Temperature::kUnknown) : file_(std::move(raf), io_tracer, _file_name), file_name_(std::move(_file_name)), clock_(clock), @@ -90,7 +92,8 @@ class RandomAccessFileReader { hist_type_(hist_type), file_read_hist_(file_read_hist), rate_limiter_(rate_limiter), - listeners_() { + listeners_(), + file_temperature_(file_temperature) { #ifndef ROCKSDB_LITE std::for_each(listeners.begin(), listeners.end(), [this](const std::shared_ptr& e) { diff --git a/include/rocksdb/iostats_context.h b/include/rocksdb/iostats_context.h index 0f6ab692e..85937187b 100644 --- a/include/rocksdb/iostats_context.h +++ b/include/rocksdb/iostats_context.h @@ -14,6 +14,32 @@ namespace ROCKSDB_NAMESPACE { +// EXPERIMENTAL: the IO statistics for tiered storage. It matches with each +// item in Temperature class. +struct FileIOByTemperature { + // the number of bytes read to Temperature::kHot file + uint64_t hot_file_bytes_read; + // the number of bytes read to Temperature::kWarm file + uint64_t warm_file_bytes_read; + // the number of bytes read to Temperature::kCold file + uint64_t cold_file_bytes_read; + // total number of reads to Temperature::kHot file + uint64_t hot_file_read_count; + // total number of reads to Temperature::kWarm file + uint64_t warm_file_read_count; + // total number of reads to Temperature::kCold file + uint64_t cold_file_read_count; + // reset all the statistics to 0. + void Reset() { + hot_file_bytes_read = 0; + warm_file_bytes_read = 0; + cold_file_bytes_read = 0; + hot_file_read_count = 0; + warm_file_read_count = 0; + cold_file_read_count = 0; + } +}; + struct IOStatsContext { // reset all io-stats counter to zero void Reset(); @@ -48,6 +74,8 @@ struct IOStatsContext { uint64_t cpu_write_nanos; // CPU time spent in read() and pread() uint64_t cpu_read_nanos; + + FileIOByTemperature file_io_stats_by_temperature; }; // If RocksDB is compiled with -DNIOSTATS_CONTEXT, then a pointer to a global, diff --git a/monitoring/iostats_context.cc b/monitoring/iostats_context.cc index 23bf3a694..b86951b4b 100644 --- a/monitoring/iostats_context.cc +++ b/monitoring/iostats_context.cc @@ -39,6 +39,7 @@ void IOStatsContext::Reset() { logger_nanos = 0; cpu_write_nanos = 0; cpu_read_nanos = 0; + file_io_stats_by_temperature.Reset(); #endif //! NIOSTATS_CONTEXT } @@ -66,7 +67,12 @@ std::string IOStatsContext::ToString(bool exclude_zero_counters) const { IOSTATS_CONTEXT_OUTPUT(logger_nanos); IOSTATS_CONTEXT_OUTPUT(cpu_write_nanos); IOSTATS_CONTEXT_OUTPUT(cpu_read_nanos); - + IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.hot_file_bytes_read); + IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.warm_file_bytes_read); + IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.cold_file_bytes_read); + IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.hot_file_read_count); + IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.warm_file_read_count); + IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.cold_file_read_count); std::string str = ss.str(); str.erase(str.find_last_not_of(", ") + 1); return str; diff --git a/monitoring/perf_context_imp.h b/monitoring/perf_context_imp.h index b7a56adef..d1804067c 100644 --- a/monitoring/perf_context_imp.h +++ b/monitoring/perf_context_imp.h @@ -77,20 +77,19 @@ extern thread_local PerfContext perf_context; } // Increase metric value -#define PERF_COUNTER_BY_LEVEL_ADD(metric, value, level) \ - if (perf_level >= PerfLevel::kEnableCount && \ - perf_context.per_level_perf_context_enabled && \ - perf_context.level_to_perf_context) { \ - if ((*(perf_context.level_to_perf_context)).find(level) != \ - (*(perf_context.level_to_perf_context)).end()) { \ - (*(perf_context.level_to_perf_context))[level].metric += value; \ - } \ - else { \ - PerfContextByLevel empty_context; \ - (*(perf_context.level_to_perf_context))[level] = empty_context; \ - (*(perf_context.level_to_perf_context))[level].metric += value; \ - } \ - } \ +#define PERF_COUNTER_BY_LEVEL_ADD(metric, value, level) \ + if (perf_level >= PerfLevel::kEnableCount && \ + perf_context.per_level_perf_context_enabled && \ + perf_context.level_to_perf_context) { \ + if ((*(perf_context.level_to_perf_context)).find(level) != \ + (*(perf_context.level_to_perf_context)).end()) { \ + (*(perf_context.level_to_perf_context))[level].metric += value; \ + } else { \ + PerfContextByLevel empty_context; \ + (*(perf_context.level_to_perf_context))[level] = empty_context; \ + (*(perf_context.level_to_perf_context))[level].metric += value; \ + } \ + } #endif