diff --git a/HISTORY.md b/HISTORY.md index 2e6a3f596..7d70c6433 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -83,6 +83,7 @@ * Improved the SstDumpTool to read the comparator from table properties and use it to read the SST File. * Extended the column family statistics in the info log so the total amount of garbage in the blob files and the blob file space amplification factor are also logged. Also exposed the blob file space amp via the `rocksdb.blob-stats` DB property. * Introduced the API rocksdb_create_dir_if_missing in c.h that calls underlying file system's CreateDirIfMissing API to create the directory. +* Added last level and non-last level read statistics: `LAST_LEVEL_READ_*`, `NON_LAST_LEVEL_READ_*`. ## 6.29.0 (01/21/2022) Note: The next release will be major release 7.0. See https://github.com/facebook/rocksdb/issues/9390 for more info. diff --git a/db/db_test2.cc b/db/db_test2.cc index 3a6bcb677..64d4eaf22 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -6847,6 +6847,58 @@ TEST_F(DBTest2, BottommostTemperatureUniversal) { size = GetSstSizeHelper(Temperature::kCold); ASSERT_GT(size, 0); } + +TEST_F(DBTest2, LastLevelStatistics) { + Options options = CurrentOptions(); + options.bottommost_temperature = Temperature::kWarm; + options.level0_file_num_compaction_trigger = 2; + options.level_compaction_dynamic_level_bytes = true; + options.statistics = CreateDBStatistics(); + Reopen(options); + + // generate 1 sst on level 0 + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("bar", "bar")); + ASSERT_OK(Flush()); + ASSERT_EQ("bar", Get("bar")); + + ASSERT_GT(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES), 0); + ASSERT_GT(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT), 0); + ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_BYTES), 0); + ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT), 0); + + // 2nd flush to trigger compaction + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("bar", "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ("bar", Get("bar")); + + ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_BYTES), + options.statistics->getTickerCount(WARM_FILE_READ_BYTES)); + ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT), + options.statistics->getTickerCount(WARM_FILE_READ_COUNT)); + + auto pre_bytes = + options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES); + auto pre_count = + options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT); + + // 3rd flush to generate 1 sst on level 0 + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("bar", "bar")); + ASSERT_OK(Flush()); + ASSERT_EQ("bar", Get("bar")); + + ASSERT_GT(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES), + pre_bytes); + ASSERT_GT(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT), + pre_count); + ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_BYTES), + options.statistics->getTickerCount(WARM_FILE_READ_BYTES)); + ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT), + options.statistics->getTickerCount(WARM_FILE_READ_COUNT)); +} #endif // ROCKSDB_LITE // WAL recovery mode is WALRecoveryMode::kPointInTimeRecovery. diff --git a/db/table_cache.cc b/db/table_cache.cc index 78e1eaad2..c86f60750 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -135,7 +135,7 @@ Status TableCache::GetTableReader( std::move(file), fname, ioptions_.clock, io_tracer_, record_read_stats ? ioptions_.stats : nullptr, SST_READ_MICROS, file_read_hist, ioptions_.rate_limiter.get(), ioptions_.listeners, - file_temperature)); + file_temperature, level == ioptions_.num_levels - 1)); s = ioptions_.table_factory->NewTableReader( ro, TableReaderOptions( diff --git a/file/random_access_file_reader.cc b/file/random_access_file_reader.cc index dc74eb0fb..976f64ec6 100644 --- a/file/random_access_file_reader.cc +++ b/file/random_access_file_reader.cc @@ -22,85 +22,43 @@ #include "util/rate_limiter.h" namespace ROCKSDB_NAMESPACE { -inline void IOStatsAddBytesByTemperature(Temperature file_temperature, - size_t value) { - if (file_temperature == Temperature::kUnknown) { - return; - } - switch (file_temperature) { - case Temperature::kHot: - IOSTATS_ADD(file_io_stats_by_temperature.hot_file_bytes_read, value); - break; - case Temperature::kWarm: - IOSTATS_ADD(file_io_stats_by_temperature.warm_file_bytes_read, value); - break; - case Temperature::kCold: - IOSTATS_ADD(file_io_stats_by_temperature.cold_file_bytes_read, value); - break; - default: - break; - } -} - -inline void IOStatsAddCountByTemperature(Temperature file_temperature, - size_t value) { - if (file_temperature == Temperature::kUnknown) { - return; - } - switch (file_temperature) { - case Temperature::kHot: - IOSTATS_ADD(file_io_stats_by_temperature.hot_file_read_count, value); - break; - case Temperature::kWarm: - IOSTATS_ADD(file_io_stats_by_temperature.warm_file_read_count, value); - break; - case Temperature::kCold: - IOSTATS_ADD(file_io_stats_by_temperature.cold_file_read_count, value); - break; - default: - break; - } -} -inline void StatisticAddBytesByTemperature(Statistics* stats, - Temperature file_temperature, - size_t value) { - if (stats == nullptr || file_temperature == Temperature::kUnknown) { - return; - } - switch (file_temperature) { - case Temperature::kHot: - RecordTick(stats, HOT_FILE_READ_BYTES, value); - break; - case Temperature::kWarm: - RecordTick(stats, WARM_FILE_READ_BYTES, value); - break; - case Temperature::kCold: - RecordTick(stats, COLD_FILE_READ_BYTES, value); - break; - default: - break; +inline void RecordIOStats(Statistics* stats, Temperature file_temperature, + bool is_last_level, size_t size) { + IOSTATS_ADD(bytes_read, size); + // record for last/non-last level + if (is_last_level) { + RecordTick(stats, LAST_LEVEL_READ_BYTES, size); + RecordTick(stats, LAST_LEVEL_READ_COUNT, 1); + } else { + RecordTick(stats, NON_LAST_LEVEL_READ_BYTES, size); + RecordTick(stats, NON_LAST_LEVEL_READ_COUNT, 1); } -} -inline void StatisticAddCountByTemperature(Statistics* stats, - Temperature file_temperature, - size_t value) { - if (stats == nullptr || file_temperature == Temperature::kUnknown) { - return; - } - switch (file_temperature) { - case Temperature::kHot: - RecordTick(stats, HOT_FILE_READ_COUNT, value); - break; - case Temperature::kWarm: - RecordTick(stats, WARM_FILE_READ_COUNT, value); - break; - case Temperature::kCold: - RecordTick(stats, COLD_FILE_READ_COUNT, value); - break; - default: - break; + // record for temperature file + if (file_temperature != Temperature::kUnknown) { + switch (file_temperature) { + case Temperature::kHot: + IOSTATS_ADD(file_io_stats_by_temperature.hot_file_bytes_read, size); + IOSTATS_ADD(file_io_stats_by_temperature.hot_file_read_count, 1); + RecordTick(stats, HOT_FILE_READ_BYTES, size); + RecordTick(stats, HOT_FILE_READ_COUNT, 1); + break; + case Temperature::kWarm: + IOSTATS_ADD(file_io_stats_by_temperature.warm_file_bytes_read, size); + IOSTATS_ADD(file_io_stats_by_temperature.warm_file_read_count, 1); + RecordTick(stats, WARM_FILE_READ_BYTES, size); + RecordTick(stats, WARM_FILE_READ_COUNT, 1); + break; + case Temperature::kCold: + IOSTATS_ADD(file_io_stats_by_temperature.cold_file_bytes_read, size); + IOSTATS_ADD(file_io_stats_by_temperature.cold_file_read_count, 1); + RecordTick(stats, COLD_FILE_READ_BYTES, size); + RecordTick(stats, COLD_FILE_READ_COUNT, 1); + break; + default: + break; + } } } @@ -273,11 +231,7 @@ IOStatus RandomAccessFileReader::Read( } *result = Slice(res_scratch, io_s.ok() ? pos : 0); } - IOSTATS_ADD(bytes_read, result->size()); - IOStatsAddBytesByTemperature(file_temperature_, result->size()); - IOStatsAddCountByTemperature(file_temperature_, 1); - StatisticAddBytesByTemperature(stats_, file_temperature_, result->size()); - StatisticAddCountByTemperature(stats_, file_temperature_, 1); + RecordIOStats(stats_, file_temperature_, is_last_level_, result->size()); SetPerfLevel(prev_perf_level); } if (stats_ != nullptr && file_read_hist_ != nullptr) { @@ -450,13 +404,8 @@ IOStatus RandomAccessFileReader::MultiRead( } #endif // ROCKSDB_LITE - IOSTATS_ADD(bytes_read, read_reqs[i].result.size()); - IOStatsAddBytesByTemperature(file_temperature_, - read_reqs[i].result.size()); - IOStatsAddCountByTemperature(file_temperature_, 1); - StatisticAddBytesByTemperature(stats_, file_temperature_, - read_reqs[i].result.size()); - StatisticAddCountByTemperature(stats_, file_temperature_, 1); + RecordIOStats(stats_, file_temperature_, is_last_level_, + read_reqs[i].result.size()); } SetPerfLevel(prev_perf_level); } diff --git a/file/random_access_file_reader.h b/file/random_access_file_reader.h index 7fcf7f1ba..8f1e179f4 100644 --- a/file/random_access_file_reader.h +++ b/file/random_access_file_reader.h @@ -89,7 +89,8 @@ class RandomAccessFileReader { HistogramImpl* file_read_hist_; RateLimiter* rate_limiter_; std::vector> listeners_; - Temperature file_temperature_; + const Temperature file_temperature_; + const bool is_last_level_; public: explicit RandomAccessFileReader( @@ -100,7 +101,8 @@ class RandomAccessFileReader { HistogramImpl* file_read_hist = nullptr, RateLimiter* rate_limiter = nullptr, const std::vector>& listeners = {}, - Temperature file_temperature = Temperature::kUnknown) + Temperature file_temperature = Temperature::kUnknown, + bool is_last_level = false) : file_(std::move(raf), io_tracer, _file_name), file_name_(std::move(_file_name)), clock_(clock), @@ -109,7 +111,8 @@ class RandomAccessFileReader { file_read_hist_(file_read_hist), rate_limiter_(rate_limiter), listeners_(), - file_temperature_(file_temperature) { + file_temperature_(file_temperature), + is_last_level_(is_last_level) { #ifndef ROCKSDB_LITE std::for_each(listeners.begin(), listeners.end(), [this](const std::shared_ptr& e) { diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index aae4954a2..9d3e89009 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -425,6 +425,12 @@ enum Tickers : uint32_t { WARM_FILE_READ_COUNT, COLD_FILE_READ_COUNT, + // Last level and non-last level read statistics + LAST_LEVEL_READ_BYTES, + LAST_LEVEL_READ_COUNT, + NON_LAST_LEVEL_READ_BYTES, + NON_LAST_LEVEL_READ_COUNT, + TICKER_ENUM_MAX }; diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 517d18690..61113d801 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -5045,6 +5045,14 @@ class TickerTypeJni { return -0x28; case ROCKSDB_NAMESPACE::Tickers::COLD_FILE_READ_COUNT: return -0x29; + case ROCKSDB_NAMESPACE::Tickers::LAST_LEVEL_READ_BYTES: + return -0x2A; + case ROCKSDB_NAMESPACE::Tickers::LAST_LEVEL_READ_COUNT: + return -0x2B; + case ROCKSDB_NAMESPACE::Tickers::NON_LAST_LEVEL_READ_BYTES: + return -0x2C; + case ROCKSDB_NAMESPACE::Tickers::NON_LAST_LEVEL_READ_COUNT: + return -0x2D; case ROCKSDB_NAMESPACE::Tickers::TICKER_ENUM_MAX: // 0x5F was the max value in the initial copy of tickers to Java. // Since these values are exposed directly to Java clients, we keep @@ -5406,6 +5414,14 @@ class TickerTypeJni { return ROCKSDB_NAMESPACE::Tickers::WARM_FILE_READ_COUNT; case -0x29: return ROCKSDB_NAMESPACE::Tickers::COLD_FILE_READ_COUNT; + case -0x2A: + return ROCKSDB_NAMESPACE::Tickers::LAST_LEVEL_READ_BYTES; + case -0x2B: + return ROCKSDB_NAMESPACE::Tickers::LAST_LEVEL_READ_COUNT; + case -0x2C: + return ROCKSDB_NAMESPACE::Tickers::NON_LAST_LEVEL_READ_BYTES; + case -0x2D: + return ROCKSDB_NAMESPACE::Tickers::NON_LAST_LEVEL_READ_COUNT; case 0x5F: // 0x5F was the max value in the initial copy of tickers to Java. // Since these values are exposed directly to Java clients, we keep diff --git a/java/src/main/java/org/rocksdb/TickerType.java b/java/src/main/java/org/rocksdb/TickerType.java index 237b4cb88..4f8ad1e40 100644 --- a/java/src/main/java/org/rocksdb/TickerType.java +++ b/java/src/main/java/org/rocksdb/TickerType.java @@ -796,6 +796,14 @@ public enum TickerType { WARM_FILE_READ_COUNT((byte) -0x28), COLD_FILE_READ_COUNT((byte) -0x29), + /** + * (non-)last level read statistics + */ + LAST_LEVEL_READ_BYTES((byte) -0x2A), + LAST_LEVEL_READ_COUNT((byte) -0x2B), + NON_LAST_LEVEL_READ_BYTES((byte) -0x2C), + NON_LAST_LEVEL_READ_COUNT((byte) -0x2D), + TICKER_ENUM_MAX((byte) 0x5F); private final byte value; diff --git a/microbench/db_basic_bench.cc b/microbench/db_basic_bench.cc index 7633d536f..ffe8cda7e 100644 --- a/microbench/db_basic_bench.cc +++ b/microbench/db_basic_bench.cc @@ -128,7 +128,8 @@ static void SetupDB(benchmark::State& state, Options& options, DB** dpptr, state.SkipWithError(s.ToString().c_str()); return; } - std::string db_name = db_path + "/" + test_name + std::to_string(getpid()); + std::string db_name = + db_path + kFilePathSeparator + test_name + std::to_string(getpid()); DestroyDB(db_name, options); s = DB::Open(options, db_name, dpptr); @@ -785,6 +786,7 @@ void GenerateRandomKVs(std::vector* keys, } } +// TODO: move it to different files, as it's testing an internal API static void DataBlockSeek(benchmark::State& state) { Random rnd(301); Options options = Options(); @@ -1287,6 +1289,72 @@ BENCHMARK(PrefixSeek) ->Iterations(kPrefixSeekNum / 8) ->Apply(PrefixSeekArguments); +// TODO: move it to different files, as it's testing an internal API +static void RandomAccessFileReaderRead(benchmark::State& state) { + bool enable_statistics = state.range(0); + constexpr int kFileNum = 10; + auto env = Env::Default(); + auto fs = env->GetFileSystem(); + std::string db_path; + Status s = env->GetTestDirectory(&db_path); + if (!s.ok()) { + state.SkipWithError(s.ToString().c_str()); + return; + } + + // Setup multiple `RandomAccessFileReader`s with different parameters to be + // used for test + Random rand(301); + std::string fname_base = + db_path + kFilePathSeparator + "random-access-file-reader-read"; + std::vector> readers; + auto statistics_share = CreateDBStatistics(); + Statistics* statistics = enable_statistics ? statistics_share.get() : nullptr; + for (int i = 0; i < kFileNum; i++) { + std::string fname = fname_base + ToString(i); + std::string content = rand.RandomString(kDefaultPageSize); + std::unique_ptr tgt_file; + env->NewWritableFile(fname, &tgt_file, EnvOptions()); + tgt_file->Append(content); + tgt_file->Close(); + + std::unique_ptr f; + fs->NewRandomAccessFile(fname, FileOptions(), &f, nullptr); + int rand_num = rand.Next() % 3; + auto temperature = rand_num == 0 ? Temperature::kUnknown + : rand_num == 1 ? Temperature::kWarm + : Temperature::kCold; + readers.emplace_back(new RandomAccessFileReader( + std::move(f), fname, env->GetSystemClock().get(), nullptr, statistics, + 0, nullptr, nullptr, {}, temperature, rand_num == 1)); + } + + IOOptions io_options; + std::unique_ptr scratch(new char[2048]); + Slice result; + uint64_t idx = 0; + for (auto _ : state) { + s = readers[idx++ % kFileNum]->Read(io_options, 0, kDefaultPageSize / 3, + &result, scratch.get(), nullptr, + Env::IO_TOTAL); + if (!s.ok()) { + state.SkipWithError(s.ToString().c_str()); + } + } + + // clean up + for (int i = 0; i < kFileNum; i++) { + std::string fname = fname_base + ToString(i); + env->DeleteFile(fname); // ignore return, okay to fail cleanup + } +} + +BENCHMARK(RandomAccessFileReaderRead) + ->Iterations(1000000) + ->Arg(0) + ->Arg(1) + ->ArgName("enable_statistics"); + } // namespace ROCKSDB_NAMESPACE BENCHMARK_MAIN(); diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc index c7c076cc6..4f361a739 100644 --- a/monitoring/statistics.cc +++ b/monitoring/statistics.cc @@ -222,6 +222,10 @@ const std::vector> TickersNameMap = { {HOT_FILE_READ_COUNT, "rocksdb.hot.file.read.count"}, {WARM_FILE_READ_COUNT, "rocksdb.warm.file.read.count"}, {COLD_FILE_READ_COUNT, "rocksdb.cold.file.read.count"}, + {LAST_LEVEL_READ_BYTES, "rocksdb.last.level.read.bytes"}, + {LAST_LEVEL_READ_COUNT, "rocksdb.last.level.read.count"}, + {NON_LAST_LEVEL_READ_BYTES, "rocksdb.non.last.level.read.bytes"}, + {NON_LAST_LEVEL_READ_COUNT, "rocksdb.non.last.level.read.count"}, }; const std::vector> HistogramsNameMap = {