From 38d0a365e3d57003b11d34a92a65299a13b46699 Mon Sep 17 00:00:00 2001 From: Akanksha Mahajan Date: Wed, 7 Oct 2020 13:27:03 -0700 Subject: [PATCH] Add Stats for MultiGet (#7366) Summary: Add following stats for MultiGet in Histogram to get more insight on MultiGet. 1. Number of index and filter blocks read from file as part of MultiGet request per level. 2. Number of data blocks read from file per level. 3. Number of SST files loaded from file system per level. Pull Request resolved: https://github.com/facebook/rocksdb/pull/7366 Reviewed By: anand1976 Differential Revision: D24127040 Pulled By: akankshamahajan15 fbshipit-source-id: e63a003056b833729b277edc0639c08fb432756b --- HISTORY.md | 1 + db/db_basic_test.cc | 89 +++++++++++++++++++ db/version_set.cc | 26 ++++++ include/rocksdb/statistics.h | 8 ++ java/rocksjni/portal.h | 14 +++ .../main/java/org/rocksdb/HistogramType.java | 16 ++++ monitoring/statistics.cc | 4 + table/block_based/block_based_table_reader.cc | 35 ++++++++ table/get_context.h | 5 ++ 9 files changed, 198 insertions(+) diff --git a/HISTORY.md b/HISTORY.md index 823dffaba..a3c83f182 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -18,6 +18,7 @@ * Methods to configure serialize, and compare -- such as TableFactory -- are exposed directly through the Configurable base class (from which these objects inherit). This change will allow for better and more thorough configuration management and retrieval in the future. The options for a Configurable object can be set via the ConfigureFromMap, ConfigureFromString, or ConfigureOption method. The serialized version of the options of an object can be retrieved via the GetOptionString, ToString, or GetOption methods. The list of options supported by an object can be obtained via the GetOptionNames method. The "raw" object (such as the BlockBasedTableOption) for an option may be retrieved via the GetOptions method. Configurable options can be compared via the AreEquivalent method. 
The settings within a Configurable object may be validated via the ValidateOptions method. The object may be intialized (at which point only mutable options may be updated) via the PrepareOptions method. * Introduce options.check_flush_compaction_key_order with default value to be true. With this option, during flush and compaction, key order will be checked when writing to each SST file. If the order is violated, the flush or compaction will fail. * Added is_full_compaction to CompactionJobStats, so that the information is available through the EventListener interface. +* Add more stats for MultiGet in Histogram to get number of data blocks, index blocks, filter blocks and sst files read from file system per level. ## 6.13 (09/12/2020) ### Bug fixes diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc index bc1027a8f..65de32de3 100644 --- a/db/db_basic_test.cc +++ b/db/db_basic_test.cc @@ -1871,6 +1871,95 @@ TEST_F(DBBasicTest, MultiGetBatchedValueSizeMultiLevelMerge) { } } +TEST_F(DBBasicTest, MultiGetStats) { + Options options; + options.create_if_missing = true; + options.disable_auto_compactions = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + BlockBasedTableOptions table_options; + table_options.block_size = 1; + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + table_options.partition_filters = true; + table_options.no_block_cache = true; + table_options.cache_index_and_filter_blocks = false; + table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + CreateAndReopenWithCF({"pikachu"}, options); + + int total_keys = 2000; + std::vector<std::string> keys_str(total_keys); + std::vector<Slice> keys(total_keys); + std::vector<PinnableSlice> values(total_keys); + std::vector<Status> s(total_keys); + ReadOptions read_opts; + + Random rnd(309); + // Create Multiple SST files at multiple levels. 
+ for (int i = 0; i < 500; ++i) { + keys_str[i] = "k" + std::to_string(i); + keys[i] = Slice(keys_str[i]); + ASSERT_OK(Put(1, "k" + std::to_string(i), rnd.RandomString(1000))); + if (i % 100 == 0) { + Flush(1); + } + } + Flush(1); + MoveFilesToLevel(2, 1); + + for (int i = 501; i < 1000; ++i) { + keys_str[i] = "k" + std::to_string(i); + keys[i] = Slice(keys_str[i]); + ASSERT_OK(Put(1, "k" + std::to_string(i), rnd.RandomString(1000))); + if (i % 100 == 0) { + Flush(1); + } + } + + Flush(1); + MoveFilesToLevel(2, 1); + + for (int i = 1001; i < total_keys; ++i) { + keys_str[i] = "k" + std::to_string(i); + keys[i] = Slice(keys_str[i]); + ASSERT_OK(Put(1, "k" + std::to_string(i), rnd.RandomString(1000))); + if (i % 100 == 0) { + Flush(1); + } + } + Flush(1); + Close(); + + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_OK(options.statistics->Reset()); + + db_->MultiGet(read_opts, handles_[1], total_keys, keys.data(), values.data(), + s.data(), false); + + ASSERT_EQ(values.size(), total_keys); + HistogramData hist_data_blocks; + HistogramData hist_index_and_filter_blocks; + HistogramData hist_sst; + + options.statistics->histogramData(NUM_DATA_BLOCKS_READ_PER_LEVEL, + &hist_data_blocks); + options.statistics->histogramData(NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL, + &hist_index_and_filter_blocks); + options.statistics->histogramData(NUM_SST_READ_PER_LEVEL, &hist_sst); + + // Maximum number of blocks read from a file system in a level. + ASSERT_GT(hist_data_blocks.max, 0); + ASSERT_GT(hist_index_and_filter_blocks.max, 0); + // Maximum number of sst files read from file system in a level. + ASSERT_GT(hist_sst.max, 0); + + // Minimum number of blocks read in a level. + ASSERT_EQ(hist_data_blocks.min, 0); + ASSERT_GT(hist_index_and_filter_blocks.min, 0); + // Minimum number of sst files read in a level. 
+ ASSERT_GT(hist_sst.max, 0); +} + // Test class for batched MultiGet with prefix extractor // Param bool - If true, use partitioned filters // If false, use full filter block diff --git a/db/version_set.cc b/db/version_set.cc index 1fadd929e..e708981ec 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1968,6 +1968,10 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, &storage_info_.file_indexer_, user_comparator(), internal_comparator()); FdWithKeyRange* f = fp.GetNextFile(); Status s; + uint64_t num_index_read = 0; + uint64_t num_filter_read = 0; + uint64_t num_data_read = 0; + uint64_t num_sst_read = 0; while (f != nullptr) { MultiGetRange file_range = fp.CurrentFileRange(); @@ -2014,6 +2018,11 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, sample_file_read_inc(f->file_metadata); } batch_size++; + num_index_read += get_context.get_context_stats_.num_index_read; + num_filter_read += get_context.get_context_stats_.num_filter_read; + num_data_read += get_context.get_context_stats_.num_data_read; + num_sst_read += get_context.get_context_stats_.num_sst_read; + // report the counters before returning if (get_context.State() != GetContext::kNotFound && get_context.State() != GetContext::kMerge && @@ -2069,6 +2078,23 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, continue; } } + + // Report MultiGet stats per level. + if (fp.IsHitFileLastInLevel()) { + // Dump the stats if this is the last file of this level and reset for + // next level. 
+ RecordInHistogram(db_statistics_, + NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL, + num_index_read + num_filter_read); + RecordInHistogram(db_statistics_, NUM_DATA_BLOCKS_READ_PER_LEVEL, + num_data_read); + RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL, num_sst_read); + num_filter_read = 0; + num_index_read = 0; + num_data_read = 0; + num_sst_read = 0; + } + RecordInHistogram(db_statistics_, SST_BATCH_SIZE, batch_size); if (!s.ok() || file_picker_range.empty()) { break; diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index 74ee6bd29..98b4fb970 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -464,6 +464,14 @@ enum Histograms : uint32_t { FLUSH_TIME, SST_BATCH_SIZE, + // MultiGet stats logged per level + // Num of index and filter blocks read from file system per level. + NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL, + // Num of data blocks read from file system per level. + NUM_DATA_BLOCKS_READ_PER_LEVEL, + // Num of sst files read from file system per level. + NUM_SST_READ_PER_LEVEL, + HISTOGRAM_ENUM_MAX, }; diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 7a54e387f..e31da0f4f 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -5385,6 +5385,13 @@ class HistogramTypeJni { return 0x2D; case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_DECOMPRESSION_MICROS: return 0x2E; + case ROCKSDB_NAMESPACE::Histograms:: + NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL: + return 0x2F; + case ROCKSDB_NAMESPACE::Histograms::NUM_DATA_BLOCKS_READ_PER_LEVEL: + return 0x30; + case ROCKSDB_NAMESPACE::Histograms::NUM_SST_READ_PER_LEVEL: + return 0x31; case ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX: // 0x1F for backwards compatibility on current minor version. 
return 0x1F; @@ -5492,6 +5499,13 @@ class HistogramTypeJni { return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_COMPRESSION_MICROS; case 0x2E: return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_DECOMPRESSION_MICROS; + case 0x2F: + return ROCKSDB_NAMESPACE::Histograms:: + NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL; + case 0x30: + return ROCKSDB_NAMESPACE::Histograms::NUM_DATA_BLOCKS_READ_PER_LEVEL; + case 0x31: + return ROCKSDB_NAMESPACE::Histograms::NUM_SST_READ_PER_LEVEL; case 0x1F: // 0x1F for backwards compatibility on current minor version. return ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX; diff --git a/java/src/main/java/org/rocksdb/HistogramType.java b/java/src/main/java/org/rocksdb/HistogramType.java index 696ee75f2..80d7c600e 100644 --- a/java/src/main/java/org/rocksdb/HistogramType.java +++ b/java/src/main/java/org/rocksdb/HistogramType.java @@ -159,6 +159,22 @@ public enum HistogramType { */ BLOB_DB_DECOMPRESSION_MICROS((byte) 0x2E), + /** + * Num of Index and Filter blocks read from file system per level in MultiGet + * request + */ + NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL((byte) 0x2F), + + /** + * Num of Data blocks read from file system per level in MultiGet request. + */ + NUM_DATA_BLOCKS_READ_PER_LEVEL((byte) 0x30), + + /** + * Num of SST files read from file system per level in MultiGet request. + */ + NUM_SST_READ_PER_LEVEL((byte) 0x31), + // 0x1F for backwards compatibility on current minor version. 
HISTOGRAM_ENUM_MAX((byte) 0x1F); diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc index b0f8bb30e..2c7f65a89 100644 --- a/monitoring/statistics.cc +++ b/monitoring/statistics.cc @@ -243,6 +243,10 @@ const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = { {BLOB_DB_DECOMPRESSION_MICROS, "rocksdb.blobdb.decompression.micros"}, {FLUSH_TIME, "rocksdb.db.flush.micros"}, {SST_BATCH_SIZE, "rocksdb.sst.batch.size"}, +{NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL, + "rocksdb.num.index.and.filter.blocks.read.per.level"}, +{NUM_DATA_BLOCKS_READ_PER_LEVEL, "rocksdb.num.data.blocks.read.per.level"}, +{NUM_SST_READ_PER_LEVEL, "rocksdb.num.sst.read.per.level"}, }; std::shared_ptr<Statistics> CreateDBStatistics() { diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index dc4f6139c..803740632 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -1482,6 +1482,21 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( s = block_fetcher.ReadBlockContents(); raw_block_comp_type = block_fetcher.get_compression_type(); contents = &raw_block_contents; + if (get_context) { + switch (block_type) { + case BlockType::kIndex: + ++get_context->get_context_stats_.num_index_read; + break; + case BlockType::kFilter: + ++get_context->get_context_stats_.num_filter_read; + break; + case BlockType::kData: + ++get_context->get_context_stats_.num_data_read; + break; + default: + break; + } + } } else { raw_block_comp_type = contents->get_compression_type(); } @@ -1889,6 +1904,22 @@ Status BlockBasedTable::RetrieveBlock( GetMemoryAllocator(rep_->table_options), for_compaction, rep_->blocks_definitely_zstd_compressed, rep_->table_options.filter_policy.get()); + + if (get_context) { + switch (block_type) { + case BlockType::kIndex: + ++(get_context->get_context_stats_.num_index_read); + break; + case BlockType::kFilter: + ++(get_context->get_context_stats_.num_filter_read); + break; + case 
BlockType::kData: + ++(get_context->get_context_stats_.num_data_read); + break; + default: + break; + } + } } if (!s.ok()) { @@ -2553,6 +2584,10 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, } RetrieveMultipleBlocks(read_options, &data_block_range, &block_handles, &statuses, &results, scratch, dict); + if (sst_file_range.begin()->get_context) { + ++(sst_file_range.begin() + ->get_context->get_context_stats_.num_sst_read); + } } } diff --git a/table/get_context.h b/table/get_context.h index 4da7829ec..c349a3e6f 100644 --- a/table/get_context.h +++ b/table/get_context.h @@ -47,6 +47,11 @@ struct GetContextStats { uint64_t num_cache_compression_dict_add = 0; uint64_t num_cache_compression_dict_add_redundant = 0; uint64_t num_cache_compression_dict_bytes_insert = 0; + // MultiGet stats. + uint64_t num_filter_read = 0; + uint64_t num_index_read = 0; + uint64_t num_data_read = 0; + uint64_t num_sst_read = 0; }; // A class to hold context about a point lookup, such as pointer to value