diff --git a/db/db_test2.cc b/db/db_test2.cc index 7d10ec673..d2addb4eb 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -17,6 +17,7 @@ #include "options/options_helper.h" #include "port/port.h" #include "port/stack_trace.h" +#include "rocksdb/iostats_context.h" #include "rocksdb/persistent_cache.h" #include "rocksdb/trace_record.h" #include "rocksdb/trace_record_result.h" @@ -6522,6 +6523,9 @@ TEST_F(DBTest2, BottommostTemperature) { ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); + get_iostats_context()->Reset(); + IOStatsContext* iostats = get_iostats_context(); + ColumnFamilyMetaData metadata; db_->GetColumnFamilyMetaData(&metadata); ASSERT_EQ(1, metadata.file_count); @@ -6530,11 +6534,31 @@ TEST_F(DBTest2, BottommostTemperature) { ASSERT_EQ(size, 0); size = GetSstSizeHelper(Temperature::kWarm); ASSERT_GT(size, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + + ASSERT_EQ("bar", Get("foo")); + + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 1); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0); + ASSERT_GT(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0); // non-bottommost file still has unknown temperature ASSERT_OK(Put("foo", "bar")); ASSERT_OK(Put("bar", "bar")); ASSERT_OK(Flush()); + ASSERT_EQ("bar", Get("bar")); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 1); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0); + ASSERT_GT(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0); + db_->GetColumnFamilyMetaData(&metadata); ASSERT_EQ(2, metadata.file_count); ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature); @@ -6583,6 +6607,8 @@ TEST_F(DBTest2, BottommostTemperatureUniversal) { ASSERT_EQ(size, 0); size = GetSstSizeHelper(Temperature::kHot); ASSERT_EQ(size, 0); + get_iostats_context()->Reset(); + IOStatsContext* iostats = get_iostats_context(); for (int i = 0; i < kTriggerNum; i++) { ASSERT_OK(Put("foo", "bar")); @@ -6600,6 +6626,17 @@ TEST_F(DBTest2, BottommostTemperatureUniversal) { ASSERT_GT(size, 0); size = GetSstSizeHelper(Temperature::kWarm); ASSERT_EQ(size, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + ASSERT_EQ("bar", Get("foo")); + + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0); ASSERT_OK(Put("foo", "bar")); ASSERT_OK(Put("bar", "bar")); diff --git a/db/table_cache.cc b/db/table_cache.cc index 3acd80627..fe549329a 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -103,7 +103,7 @@ Status TableCache::GetTableReader( std::unique_ptr* table_reader, const SliceTransform* prefix_extractor, bool skip_filters, int level, bool prefetch_index_and_filter_in_cache, - size_t max_file_size_for_l0_meta_pin) { + size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) { std::string fname = TableFileName(ioptions_.cf_paths, fd.GetNumber(), fd.GetPathId()); std::unique_ptr file; @@ -132,7 +132,8 @@ Status TableCache::GetTableReader( new RandomAccessFileReader( std::move(file), fname, ioptions_.clock, io_tracer_, record_read_stats ? ioptions_.stats : nullptr, SST_READ_MICROS, - file_read_hist, ioptions_.rate_limiter.get(), ioptions_.listeners)); + file_read_hist, ioptions_.rate_limiter.get(), ioptions_.listeners, + file_temperature)); s = ioptions_.table_factory->NewTableReader( ro, TableReaderOptions( @@ -154,15 +155,13 @@ void TableCache::EraseHandle(const FileDescriptor& fd, Cache::Handle* handle) { cache_->Erase(key); } -Status TableCache::FindTable(const ReadOptions& ro, - const FileOptions& file_options, - const InternalKeyComparator& internal_comparator, - const FileDescriptor& fd, Cache::Handle** handle, - const SliceTransform* prefix_extractor, - const bool no_io, bool record_read_stats, - HistogramImpl* file_read_hist, bool skip_filters, - int level, bool prefetch_index_and_filter_in_cache, - size_t max_file_size_for_l0_meta_pin) { +Status TableCache::FindTable( + const ReadOptions& ro, const FileOptions& file_options, + const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, + Cache::Handle** handle, const SliceTransform* prefix_extractor, + const bool no_io, bool record_read_stats, HistogramImpl* file_read_hist, + bool skip_filters, int level, bool prefetch_index_and_filter_in_cache, + size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) { PERF_TIMER_GUARD_WITH_CLOCK(find_table_nanos, ioptions_.clock); uint64_t number = fd.GetNumber(); Slice key = GetSliceForFileNumber(&number); @@ -186,7 +185,7 @@ Status TableCache::FindTable(const ReadOptions& ro, ro, file_options, internal_comparator, fd, false /* sequential mode */, record_read_stats, file_read_hist, &table_reader, prefix_extractor, skip_filters, level, prefetch_index_and_filter_in_cache, - max_file_size_for_l0_meta_pin); + max_file_size_for_l0_meta_pin, file_temperature); if (!s.ok()) { assert(table_reader == nullptr); RecordTick(ioptions_.stats, NO_FILE_ERRORS); @@ -231,7 +230,7 @@ InternalIterator* TableCache::NewIterator( options.read_tier == kBlockCacheTier /* no_io */, !for_compaction /* record_read_stats */, file_read_hist, skip_filters, level, true /* prefetch_index_and_filter_in_cache */, - max_file_size_for_l0_meta_pin); + max_file_size_for_l0_meta_pin, file_meta.temperature); if (s.ok()) { table_reader = GetTableReaderFromHandle(handle); } @@ -431,7 +430,7 @@ Status TableCache::Get(const ReadOptions& options, options.read_tier == kBlockCacheTier /* no_io */, true /* record_read_stats */, file_read_hist, skip_filters, level, true /* prefetch_index_and_filter_in_cache */, - max_file_size_for_l0_meta_pin); + max_file_size_for_l0_meta_pin, file_meta.temperature); if (s.ok()) { t = GetTableReaderFromHandle(handle); } @@ -531,10 +530,12 @@ Status TableCache::MultiGet(const ReadOptions& options, // found in the row cache and thus the range may now be empty if (s.ok() && !table_range.empty()) { if (t == nullptr) { - s = FindTable( - options, file_options_, internal_comparator, fd, &handle, - prefix_extractor, options.read_tier == kBlockCacheTier /* no_io */, - true /* record_read_stats */, file_read_hist, skip_filters, level); + s = FindTable(options, file_options_, internal_comparator, fd, &handle, + prefix_extractor, + options.read_tier == kBlockCacheTier /* no_io */, + true /* record_read_stats */, file_read_hist, skip_filters, + level, true /* prefetch_index_and_filter_in_cache */, + 0 /*max_file_size_for_l0_meta_pin*/, file_meta.temperature); TEST_SYNC_POINT_CALLBACK("TableCache::MultiGet:FindTable", &s); if (s.ok()) { t = GetTableReaderFromHandle(handle); diff --git a/db/table_cache.h b/db/table_cache.h index 2138eb4a2..e8144dcd0 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -140,7 +140,8 @@ class TableCache { HistogramImpl* file_read_hist = nullptr, bool skip_filters = false, int level = -1, bool prefetch_index_and_filter_in_cache = true, - size_t max_file_size_for_l0_meta_pin = 0); + size_t max_file_size_for_l0_meta_pin = 0, + Temperature file_temperature = Temperature::kUnknown); // Get TableReader from a cache handle. TableReader* GetTableReaderFromHandle(Cache::Handle* handle); @@ -206,7 +207,8 @@ class TableCache { const SliceTransform* prefix_extractor = nullptr, bool skip_filters = false, int level = -1, bool prefetch_index_and_filter_in_cache = true, - size_t max_file_size_for_l0_meta_pin = 0); + size_t max_file_size_for_l0_meta_pin = 0, + Temperature file_temperature = Temperature::kUnknown); // Create a key prefix for looking up the row cache. The prefix is of the // format row_cache_id + fd_number + seq_no. Later, the user key can be diff --git a/db/version_builder.cc b/db/version_builder.cc index a25f444fc..c90187a18 100644 --- a/db/version_builder.cc +++ b/db/version_builder.cc @@ -1017,7 +1017,8 @@ class VersionBuilder::Rep { &file_meta->table_reader_handle, prefix_extractor, false /*no_io */, true /* record_read_stats */, internal_stats->GetFileReadHist(level), false, level, - prefetch_index_and_filter_in_cache, max_file_size_for_l0_meta_pin); + prefetch_index_and_filter_in_cache, max_file_size_for_l0_meta_pin, + file_meta->temperature); if (file_meta->table_reader_handle != nullptr) { // Load table_reader file_meta->fd.table_reader = table_cache_->GetTableReaderFromHandle( diff --git a/file/random_access_file_reader.cc b/file/random_access_file_reader.cc index 6378155e3..4ffcce25c 100644 --- a/file/random_access_file_reader.cc +++ b/file/random_access_file_reader.cc @@ -22,6 +22,46 @@ #include "util/rate_limiter.h" namespace ROCKSDB_NAMESPACE { +inline void IOStatsAddBytesByTemperature(Temperature file_temperature, + size_t value) { + if (file_temperature == Temperature::kUnknown) { + return; + } + switch (file_temperature) { + case Temperature::kHot: + IOSTATS_ADD(file_io_stats_by_temperature.hot_file_bytes_read, value); + break; + case Temperature::kWarm: + IOSTATS_ADD(file_io_stats_by_temperature.warm_file_bytes_read, value); + break; + case Temperature::kCold: + IOSTATS_ADD(file_io_stats_by_temperature.cold_file_bytes_read, value); + break; + default: + break; + } +} + +inline void IOStatsAddCountByTemperature(Temperature file_temperature, + size_t value) { + if (file_temperature == Temperature::kUnknown) { + return; + } + switch (file_temperature) { + case Temperature::kHot: + IOSTATS_ADD(file_io_stats_by_temperature.hot_file_read_count, value); + break; + case Temperature::kWarm: + IOSTATS_ADD(file_io_stats_by_temperature.warm_file_read_count, value); + break; + case Temperature::kCold: + IOSTATS_ADD(file_io_stats_by_temperature.cold_file_read_count, value); + break; + default: + break; + } +} + IOStatus RandomAccessFileReader::Create( const std::shared_ptr& fs, const std::string& fname, const FileOptions& file_opts, @@ -182,6 +222,8 @@ IOStatus RandomAccessFileReader::Read(const IOOptions& opts, uint64_t offset, *result = Slice(res_scratch, io_s.ok() ? pos : 0); } IOSTATS_ADD(bytes_read, result->size()); + IOStatsAddBytesByTemperature(file_temperature_, result->size()); + IOStatsAddCountByTemperature(file_temperature_, 1); SetPerfLevel(prev_perf_level); } if (stats_ != nullptr && file_read_hist_ != nullptr) { @@ -347,6 +389,9 @@ IOStatus RandomAccessFileReader::MultiRead(const IOOptions& opts, } #endif // ROCKSDB_LITE IOSTATS_ADD(bytes_read, read_reqs[i].result.size()); + IOStatsAddBytesByTemperature(file_temperature_, + read_reqs[i].result.size()); + IOStatsAddCountByTemperature(file_temperature_, 1); } SetPerfLevel(prev_perf_level); } diff --git a/file/random_access_file_reader.h b/file/random_access_file_reader.h index 181f4dd02..7706ca4af 100644 --- a/file/random_access_file_reader.h +++ b/file/random_access_file_reader.h @@ -73,6 +73,7 @@ class RandomAccessFileReader { HistogramImpl* file_read_hist_; RateLimiter* rate_limiter_; std::vector> listeners_; + Temperature file_temperature_; public: explicit RandomAccessFileReader( @@ -82,7 +83,8 @@ class RandomAccessFileReader { Statistics* stats = nullptr, uint32_t hist_type = 0, HistogramImpl* file_read_hist = nullptr, RateLimiter* rate_limiter = nullptr, - const std::vector>& listeners = {}) + const std::vector>& listeners = {}, + Temperature file_temperature = Temperature::kUnknown) : file_(std::move(raf), io_tracer, _file_name), file_name_(std::move(_file_name)), clock_(clock), @@ -90,7 +92,8 @@ class RandomAccessFileReader { hist_type_(hist_type), file_read_hist_(file_read_hist), rate_limiter_(rate_limiter), - listeners_() { + listeners_(), + file_temperature_(file_temperature) { #ifndef ROCKSDB_LITE std::for_each(listeners.begin(), listeners.end(), [this](const std::shared_ptr& e) { diff --git a/include/rocksdb/iostats_context.h b/include/rocksdb/iostats_context.h index 0f6ab692e..85937187b 100644 --- a/include/rocksdb/iostats_context.h +++ b/include/rocksdb/iostats_context.h @@ -14,6 +14,32 @@ namespace ROCKSDB_NAMESPACE { +// EXPERIMENTAL: the IO statistics for tiered storage. It matches with each +// item in Temperature class. +struct FileIOByTemperature { + // the number of bytes read to Temperature::kHot file + uint64_t hot_file_bytes_read; + // the number of bytes read to Temperature::kWarm file + uint64_t warm_file_bytes_read; + // the number of bytes read to Temperature::kCold file + uint64_t cold_file_bytes_read; + // total number of reads to Temperature::kHot file + uint64_t hot_file_read_count; + // total number of reads to Temperature::kWarm file + uint64_t warm_file_read_count; + // total number of reads to Temperature::kCold file + uint64_t cold_file_read_count; + // reset all the statistics to 0. + void Reset() { + hot_file_bytes_read = 0; + warm_file_bytes_read = 0; + cold_file_bytes_read = 0; + hot_file_read_count = 0; + warm_file_read_count = 0; + cold_file_read_count = 0; + } +}; + struct IOStatsContext { // reset all io-stats counter to zero void Reset(); @@ -48,6 +74,8 @@ struct IOStatsContext { uint64_t cpu_write_nanos; // CPU time spent in read() and pread() uint64_t cpu_read_nanos; + + FileIOByTemperature file_io_stats_by_temperature; }; // If RocksDB is compiled with -DNIOSTATS_CONTEXT, then a pointer to a global, diff --git a/monitoring/iostats_context.cc b/monitoring/iostats_context.cc index 23bf3a694..b86951b4b 100644 --- a/monitoring/iostats_context.cc +++ b/monitoring/iostats_context.cc @@ -39,6 +39,7 @@ void IOStatsContext::Reset() { logger_nanos = 0; cpu_write_nanos = 0; cpu_read_nanos = 0; + file_io_stats_by_temperature.Reset(); #endif //! NIOSTATS_CONTEXT } @@ -66,7 +67,12 @@ std::string IOStatsContext::ToString(bool exclude_zero_counters) const { IOSTATS_CONTEXT_OUTPUT(logger_nanos); IOSTATS_CONTEXT_OUTPUT(cpu_write_nanos); IOSTATS_CONTEXT_OUTPUT(cpu_read_nanos); - + IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.hot_file_bytes_read); + IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.warm_file_bytes_read); + IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.cold_file_bytes_read); + IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.hot_file_read_count); + IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.warm_file_read_count); + IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.cold_file_read_count); std::string str = ss.str(); str.erase(str.find_last_not_of(", ") + 1); return str; diff --git a/monitoring/perf_context_imp.h b/monitoring/perf_context_imp.h index b7a56adef..d1804067c 100644 --- a/monitoring/perf_context_imp.h +++ b/monitoring/perf_context_imp.h @@ -77,20 +77,19 @@ extern thread_local PerfContext perf_context; } // Increase metric value -#define PERF_COUNTER_BY_LEVEL_ADD(metric, value, level) \ - if (perf_level >= PerfLevel::kEnableCount && \ - perf_context.per_level_perf_context_enabled && \ - perf_context.level_to_perf_context) { \ - if ((*(perf_context.level_to_perf_context)).find(level) != \ - (*(perf_context.level_to_perf_context)).end()) { \ - (*(perf_context.level_to_perf_context))[level].metric += value; \ - } \ - else { \ - PerfContextByLevel empty_context; \ - (*(perf_context.level_to_perf_context))[level] = empty_context; \ - (*(perf_context.level_to_perf_context))[level].metric += value; \ - } \ - } \ +#define PERF_COUNTER_BY_LEVEL_ADD(metric, value, level) \ + if (perf_level >= PerfLevel::kEnableCount && \ + perf_context.per_level_perf_context_enabled && \ + perf_context.level_to_perf_context) { \ + if ((*(perf_context.level_to_perf_context)).find(level) != \ + (*(perf_context.level_to_perf_context)).end()) { \ + (*(perf_context.level_to_perf_context))[level].metric += value; \ + } else { \ + PerfContextByLevel empty_context; \ + (*(perf_context.level_to_perf_context))[level] = empty_context; \ + (*(perf_context.level_to_perf_context))[level].metric += value; \ + } \ + } #endif