diff --git a/HISTORY.md b/HISTORY.md index 94a70bd7d..408a4ce81 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -5,6 +5,7 @@ * Added `JemallocAllocatorOptions::num_arenas`. Setting `num_arenas > 1` may mitigate mutex contention in the allocator, particularly in scenarios where block allocations commonly bypass jemalloc tcache. * Improve the operational safety of publishing a DB or SST files to many hosts by using different block cache hash seeds on different hosts. The exact behavior is controlled by new option `ShardedCacheOptions::hash_seed`, which also documents the solved problem in more detail. * Introduced a new option `CompactionOptionsFIFO::file_temperature_age_thresholds` that allows FIFO compaction to compact files to different temperatures based on key age (#11428). +* Added a new ticker stat to count how many times RocksDB detected a corruption while verifying a block checksum: `BLOCK_CHECKSUM_MISMATCH_COUNT`. ### Public API Changes * Add `MakeSharedCache()` construction functions to various cache Options objects, and deprecated the `NewWhateverCache()` functions with long parameter lists. diff --git a/db/db_statistics_test.cc b/db/db_statistics_test.cc index adc2b36bb..8b291acda 100644 --- a/db/db_statistics_test.cc +++ b/db/db_statistics_test.cc @@ -242,6 +242,46 @@ TEST_F(DBStatisticsTest, VerifyChecksumReadStat) { } } +TEST_F(DBStatisticsTest, BlockChecksumStats) { + Options options = CurrentOptions(); + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + Reopen(options); + + // Scenario 0: only WAL data. Not verified so require ticker to be zero. + ASSERT_OK(Put("foo", "value")); + ASSERT_OK(db_->VerifyChecksum()); + ASSERT_EQ(0, + options.statistics->getTickerCount(BLOCK_CHECKSUM_COMPUTE_COUNT)); + ASSERT_EQ(0, + options.statistics->getTickerCount(BLOCK_CHECKSUM_MISMATCH_COUNT)); + + // Scenario 1: Flushed table verified in `VerifyChecksum()`. 
This opens a + // `TableReader` to verify each of the four blocks (meta-index, table + // properties, index, and data block). + ASSERT_OK(Flush()); + ASSERT_OK(options.statistics->Reset()); + ASSERT_OK(db_->VerifyChecksum()); + ASSERT_EQ(4, + options.statistics->getTickerCount(BLOCK_CHECKSUM_COMPUTE_COUNT)); + ASSERT_EQ(0, + options.statistics->getTickerCount(BLOCK_CHECKSUM_MISMATCH_COUNT)); + + // Scenario 2: Corrupted table verified in `VerifyChecksum()`. The corruption + // is in the fourth and final verified block, i.e., the data block. + std::unordered_map<std::string, uint64_t> table_files; + ASSERT_OK(GetAllDataFiles(kTableFile, &table_files)); + ASSERT_EQ(1, table_files.size()); + std::string table_name = table_files.begin()->first; + // Assumes the data block starts at offset zero. + ASSERT_OK(test::CorruptFile(options.env, table_name, 0 /* offset */, + 3 /* bytes_to_corrupt */)); + ASSERT_OK(options.statistics->Reset()); + ASSERT_NOK(db_->VerifyChecksum()); + ASSERT_EQ(4, + options.statistics->getTickerCount(BLOCK_CHECKSUM_COMPUTE_COUNT)); + ASSERT_EQ(1, + options.statistics->getTickerCount(BLOCK_CHECKSUM_MISMATCH_COUNT)); +} } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index 56686d438..99791e47a 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -394,7 +394,13 @@ enum Tickers : uint32_t { NON_LAST_LEVEL_READ_BYTES, NON_LAST_LEVEL_READ_COUNT, + // Number of block checksum verifications BLOCK_CHECKSUM_COMPUTE_COUNT, + // Number of times RocksDB detected a corruption while verifying a block + // checksum. RocksDB does not remember corruptions that happened during user + // reads so the same block corruption may be detected multiple times. 
+ BLOCK_CHECKSUM_MISMATCH_COUNT, + MULTIGET_COROUTINE_COUNT, // Integrated BlobDB specific stats diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 32dcca9df..867477f1a 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -5129,6 +5129,8 @@ class TickerTypeJni { return -0x3A; case ROCKSDB_NAMESPACE::Tickers::TABLE_OPEN_PREFETCH_TAIL_HIT: return -0x3B; + case ROCKSDB_NAMESPACE::Tickers::BLOCK_CHECKSUM_MISMATCH_COUNT: + return -0x3C; case ROCKSDB_NAMESPACE::Tickers::TICKER_ENUM_MAX: // 0x5F was the max value in the initial copy of tickers to Java. // Since these values are exposed directly to Java clients, we keep @@ -5490,6 +5492,8 @@ class TickerTypeJni { return ROCKSDB_NAMESPACE::Tickers::TABLE_OPEN_PREFETCH_TAIL_MISS; case -0x3B: return ROCKSDB_NAMESPACE::Tickers::TABLE_OPEN_PREFETCH_TAIL_HIT; + case -0x3C: + return ROCKSDB_NAMESPACE::Tickers::BLOCK_CHECKSUM_MISMATCH_COUNT; case 0x5F: // 0x5F was the max value in the initial copy of tickers to Java. // Since these values are exposed directly to Java clients, we keep diff --git a/java/src/main/java/org/rocksdb/TickerType.java b/java/src/main/java/org/rocksdb/TickerType.java index 98e3043c6..f100bb277 100644 --- a/java/src/main/java/org/rocksdb/TickerType.java +++ b/java/src/main/java/org/rocksdb/TickerType.java @@ -708,6 +708,9 @@ public enum TickerType { NON_LAST_LEVEL_READ_BYTES((byte) -0x2C), NON_LAST_LEVEL_READ_COUNT((byte) -0x2D), + /** + * Number of block checksum verifications + */ BLOCK_CHECKSUM_COMPUTE_COUNT((byte) -0x2E), /** @@ -754,6 +757,13 @@ public enum TickerType { */ TABLE_OPEN_PREFETCH_TAIL_HIT((byte) -0x3B), + /** + * Number of times RocksDB detected a corruption while verifying a block + * checksum. RocksDB does not remember corruptions that happened during user + * reads so the same block corruption may be detected multiple times. 
+ */ + BLOCK_CHECKSUM_MISMATCH_COUNT((byte) -0x3C), + TICKER_ENUM_MAX((byte) 0x5F); private final byte value; diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc index 6d160484e..927cf9895 100644 --- a/monitoring/statistics.cc +++ b/monitoring/statistics.cc @@ -205,6 +205,7 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = { {NON_LAST_LEVEL_READ_BYTES, "rocksdb.non.last.level.read.bytes"}, {NON_LAST_LEVEL_READ_COUNT, "rocksdb.non.last.level.read.count"}, {BLOCK_CHECKSUM_COMPUTE_COUNT, "rocksdb.block.checksum.compute.count"}, + {BLOCK_CHECKSUM_MISMATCH_COUNT, "rocksdb.block.checksum.mismatch.count"}, {MULTIGET_COROUTINE_COUNT, "rocksdb.multiget.coroutine.count"}, {BLOB_DB_CACHE_MISS, "rocksdb.blobdb.cache.miss"}, {BLOB_DB_CACHE_HIT, "rocksdb.blobdb.cache.hit"}, diff --git a/table/block_based/block_based_table_reader_test.cc b/table/block_based/block_based_table_reader_test.cc index a6ee940d8..90a938285 100644 --- a/table/block_based/block_based_table_reader_test.cc +++ b/table/block_based/block_based_table_reader_test.cc @@ -500,6 +500,7 @@ TEST_P(BlockBasedTableReaderTestVerifyChecksum, ChecksumMismatch) { std::unique_ptr<BlockBasedTable> table; Options options; + options.statistics = CreateDBStatistics(); ImmutableOptions ioptions(options); FileOptions foptions; foptions.use_direct_reads = use_direct_reads_; @@ -529,8 +530,12 @@ TEST_P(BlockBasedTableReaderTestVerifyChecksum, ChecksumMismatch) { static_cast<int>(handle.offset()), 128)); NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table); + ASSERT_EQ(0, + options.statistics->getTickerCount(BLOCK_CHECKSUM_MISMATCH_COUNT)); Status s = table->VerifyChecksum(ReadOptions(), TableReaderCaller::kUserVerifyChecksum); + ASSERT_EQ(1, + options.statistics->getTickerCount(BLOCK_CHECKSUM_MISMATCH_COUNT)); ASSERT_EQ(s.code(), Status::kCorruption); } diff --git a/table/block_based/reader_common.h b/table/block_based/reader_common.h index 2cf7cdb6d..102802afe 100644 --- a/table/block_based/reader_common.h +++ 
b/table/block_based/reader_common.h @@ -24,6 +24,9 @@ inline MemoryAllocator* GetMemoryAllocator( // Assumes block has a trailer as in format.h. file_name and offset provided // for generating a diagnostic message in returned status. +// +// Returns Status::OK() on checksum match, or Status::Corruption() on checksum +// mismatch. extern Status VerifyBlockChecksum(ChecksumType type, const char* data, size_t block_size, const std::string& file_name, diff --git a/table/block_fetcher.cc b/table/block_fetcher.cc index 14a5ea456..1fdf86168 100644 --- a/table/block_fetcher.cc +++ b/table/block_fetcher.cc @@ -37,6 +37,10 @@ inline void BlockFetcher::ProcessTrailerIfPresent() { footer_.checksum_type(), slice_.data(), block_size_, file_->file_name(), handle_.offset())); RecordTick(ioptions_.stats, BLOCK_CHECKSUM_COMPUTE_COUNT); + if (!io_status_.ok()) { + assert(io_status_.IsCorruption()); + RecordTick(ioptions_.stats, BLOCK_CHECKSUM_MISMATCH_COUNT); + } } compression_type_ = BlockBasedTable::GetBlockCompressionType(slice_.data(), block_size_);