From 25a0d0ca307710f7e64b35ef5732d0944de05f1b Mon Sep 17 00:00:00 2001 From: Peter Dillinger Date: Fri, 19 Jun 2020 16:16:57 -0700 Subject: [PATCH] Fix block checksum for >=4GB, refactor (#6978) Summary: Although RocksDB falls over in various other ways with KVs around 4GB or more, this change fixes how XXH32 and XXH64 were being called by the block checksum code to support >= 4GB in case that should ever happen, or the code copied for other uses. This change is not a schema compatibility issue because the checksum verification code would checksum the first (block_size + 1) mod 2^32 bytes while the checksum construction code would checksum the first block_size mod 2^32 plus the compression type byte, meaning the XXH32/64 checksums for >=4GB block would not match about 255/256 times. While touching this code, I refactored to consolidate redundant implementations, improving diagnostics and performance tracking in some cases. Also used less confusing language in those diagnostics. Makes https://github.com/facebook/rocksdb/issues/6875 obsolete. Pull Request resolved: https://github.com/facebook/rocksdb/pull/6978 Test Plan: I was able to write a test for this using an SST file writer and VerifyChecksum in a reader. The test fails before the fix, though I'm leaving the test disabled because I don't think it's worth the expense of running regularly. Reviewed By: gg814 Differential Revision: D22143260 Pulled By: pdillinger fbshipit-source-id: 982993d16134e8c50bea2269047f901c1783726e --- db/external_sst_file_test.cc | 36 +++++++++++++++++ file/random_access_file_reader.h | 2 +- .../block_based/block_based_table_builder.cc | 35 +++++++++-------- table/block_based/block_based_table_reader.cc | 17 ++++---- table/block_based/reader_common.cc | 39 +++++++++++++------ table/block_based/reader_common.h | 8 +++- table/block_fetcher.cc | 39 ++----------------- table/format.h | 2 +- 8 files changed, 102 insertions(+), 76 deletions(-) diff --git a/db/external_sst_file_test.cc b/db/external_sst_file_test.cc index 683df1ba2..f698d501e 100644 --- a/db/external_sst_file_test.cc +++ b/db/external_sst_file_test.cc @@ -12,6 +12,7 @@ #include "file/filename.h" #include "port/port.h" #include "port/stack_trace.h" +#include "rocksdb/sst_file_reader.h" #include "rocksdb/sst_file_writer.h" #include "test_util/fault_injection_test_env.h" #include "test_util/testutil.h" @@ -2393,6 +2394,41 @@ TEST_F(ExternalSSTFileTest, IngestFileWrittenWithCompressionDictionary) { ASSERT_EQ(1, num_compression_dicts); } +// Very slow, not worth the cost to run regularly +TEST_F(ExternalSSTFileTest, DISABLED_HugeBlockChecksum) { + int max_checksum = static_cast(kxxHash64); + for (int i = 0; i <= max_checksum; ++i) { + BlockBasedTableOptions table_options; + table_options.checksum = static_cast(i); + Options options = CurrentOptions(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + SstFileWriter sst_file_writer(EnvOptions(), options); + + // 2^32 - 1, will lead to data block with more than 2^32 bytes + size_t huge_size = port::kMaxUint32; + + std::string f = sst_files_dir_ + "f.sst"; + ASSERT_OK(sst_file_writer.Open(f)); + { + Random64 r(123); + std::string huge(huge_size, 0); + for (size_t j = 0; j + 7 < huge_size; j += 8) { + EncodeFixed64(&huge[j], r.Next()); + } + ASSERT_OK(sst_file_writer.Put("Huge", huge)); + } + + ExternalSstFileInfo f_info; + ASSERT_OK(sst_file_writer.Finish(&f_info)); + ASSERT_GT(f_info.file_size, uint64_t{huge_size} + 10); + + SstFileReader sst_file_reader(options); + ASSERT_OK(sst_file_reader.Open(f)); + ASSERT_OK(sst_file_reader.VerifyChecksum()); + } +} + TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_Success) { std::unique_ptr fault_injection_env( new FaultInjectionTestEnv(env_)); diff --git a/file/random_access_file_reader.h b/file/random_access_file_reader.h index 4fee67c92..f1c9e7018 100644 --- a/file/random_access_file_reader.h +++ b/file/random_access_file_reader.h @@ -132,7 +132,7 @@ class RandomAccessFileReader { FSRandomAccessFile* file() { return file_.get(); } - std::string file_name() const { return file_name_; } + const std::string& file_name() const { return file_name_; } bool use_direct_io() const { return file_->use_direct_io(); } diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc index 377077730..30fc9197e 100644 --- a/table/block_based/block_based_table_builder.cc +++ b/table/block_based/block_based_table_builder.cc @@ -1093,42 +1093,43 @@ void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents, if (io_s.ok()) { char trailer[kBlockTrailerSize]; trailer[0] = type; - char* trailer_without_type = trailer + 1; + uint32_t checksum = 0; switch (r->table_options.checksum) { case kNoChecksum: - EncodeFixed32(trailer_without_type, 0); break; case kCRC32c: { - auto crc = crc32c::Value(block_contents.data(), block_contents.size()); - crc = crc32c::Extend(crc, trailer, 1); // Extend to cover block type - EncodeFixed32(trailer_without_type, crc32c::Mask(crc)); + uint32_t crc = + crc32c::Value(block_contents.data(), block_contents.size()); + // Extend to cover compression type + crc = crc32c::Extend(crc, trailer, 1); + checksum = crc32c::Mask(crc); break; } case kxxHash: { XXH32_state_t* const state = XXH32_createState(); XXH32_reset(state, 0); - XXH32_update(state, block_contents.data(), - static_cast(block_contents.size())); - XXH32_update(state, trailer, 1); // Extend to cover block type - EncodeFixed32(trailer_without_type, XXH32_digest(state)); + XXH32_update(state, block_contents.data(), block_contents.size()); + // Extend to cover compression type + XXH32_update(state, trailer, 1); + checksum = XXH32_digest(state); XXH32_freeState(state); break; } case kxxHash64: { XXH64_state_t* const state = XXH64_createState(); XXH64_reset(state, 0); - XXH64_update(state, block_contents.data(), - static_cast(block_contents.size())); - XXH64_update(state, trailer, 1); // Extend to cover block type - EncodeFixed32( - trailer_without_type, - static_cast(XXH64_digest(state) & // lower 32 bits - uint64_t{0xffffffff})); + XXH64_update(state, block_contents.data(), block_contents.size()); + // Extend to cover compression type + XXH64_update(state, trailer, 1); + checksum = Lower32of64(XXH64_digest(state)); XXH64_freeState(state); break; } + default: + assert(false); + break; } - + EncodeFixed32(trailer + 1, checksum); assert(io_s.ok()); TEST_SYNC_POINT_CALLBACK( "BlockBasedTableBuilder::WriteRawBlock:TamperWithChecksum", diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 43d23f170..a6f8bdcaa 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -776,9 +776,9 @@ Status BlockBasedTable::TryReadPropertiesWithGlobalSeqno( EncodeFixed64( tmp_buf.get() + global_seqno_offset - props_block_handle.offset(), 0); } - uint32_t value = DecodeFixed32(tmp_buf.get() + block_size + 1); - s = ROCKSDB_NAMESPACE::VerifyChecksum(rep_->footer.checksum(), - tmp_buf.get(), block_size + 1, value); + s = ROCKSDB_NAMESPACE::VerifyBlockChecksum( + rep_->footer.checksum(), tmp_buf.get(), block_size, + rep_->file->file_name(), props_block_handle.offset()); } return s; } @@ -1728,15 +1728,14 @@ void BlockBasedTable::RetrieveMultipleBlocks( if (options.verify_checksums) { PERF_TIMER_GUARD(block_checksum_time); const char* data = req.result.data(); - uint32_t expected = - DecodeFixed32(data + req_offset + handle.size() + 1); - // Since the scratch might be shared. the offset of the data block in + // Since the scratch might be shared, the offset of the data block in // the buffer might not be 0. req.result.data() only point to the // begin address of each read request, we need to add the offset // in each read request. Checksum is stored in the block trailer, - // which is handle.size() + 1. - s = ROCKSDB_NAMESPACE::VerifyChecksum( - footer.checksum(), data + req_offset, handle.size() + 1, expected); + // beyond the payload size. + s = ROCKSDB_NAMESPACE::VerifyBlockChecksum( + footer.checksum(), data + req_offset, handle.size(), + rep_->file->file_name(), handle.offset()); TEST_SYNC_POINT_CALLBACK("RetrieveMultipleBlocks:VerifyChecksum", &s); } } else if (!use_shared_buffer) { diff --git a/table/block_based/reader_common.cc b/table/block_based/reader_common.cc index 4164e4ce1..bb2376afa 100644 --- a/table/block_based/reader_common.cc +++ b/table/block_based/reader_common.cc @@ -8,7 +8,11 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "table/block_based/reader_common.h" +#include "monitoring/perf_context_imp.h" +#include "util/coding.h" #include "util/crc32c.h" +#include "util/hash.h" +#include "util/string_util.h" #include "util/xxhash.h" namespace ROCKSDB_NAMESPACE { @@ -18,29 +22,42 @@ void ForceReleaseCachedEntry(void* arg, void* h) { cache->Release(handle, true /* force_erase */); } -Status VerifyChecksum(const ChecksumType type, const char* buf, size_t len, - uint32_t expected) { +Status VerifyBlockChecksum(ChecksumType type, const char* data, + size_t block_size, const std::string& file_name, + uint64_t offset) { + PERF_TIMER_GUARD(block_checksum_time); + // After block_size bytes is compression type (1 byte), which is part of + // the checksummed section. + size_t len = block_size + 1; + // And then the stored checksum value (4 bytes). + uint32_t stored = DecodeFixed32(data + len); + Status s; - uint32_t actual = 0; + uint32_t computed = 0; switch (type) { case kNoChecksum: break; case kCRC32c: - expected = crc32c::Unmask(expected); - actual = crc32c::Value(buf, len); + stored = crc32c::Unmask(stored); + computed = crc32c::Value(data, len); break; case kxxHash: - actual = XXH32(buf, static_cast(len), 0); + computed = XXH32(data, len, 0); break; case kxxHash64: - actual = static_cast(XXH64(buf, static_cast(len), 0) & - uint64_t{0xffffffff}); + computed = Lower32of64(XXH64(data, len, 0)); break; default: - s = Status::Corruption("unknown checksum type"); + s = Status::Corruption( + "unknown checksum type " + ToString(type) + " from footer of " + + file_name + ", while checking block at offset " + ToString(offset) + + " size " + ToString(block_size)); } - if (s.ok() && actual != expected) { - s = Status::Corruption("properties block checksum mismatched"); + if (s.ok() && stored != computed) { + s = Status::Corruption( + "block checksum mismatch: stored = " + ToString(stored) + + ", computed = " + ToString(computed) + " in " + file_name + + " offset " + ToString(offset) + " size " + ToString(block_size)); } return s; } diff --git a/table/block_based/reader_common.h b/table/block_based/reader_common.h index 8fa68d49c..6bd4b532f 100644 --- a/table/block_based/reader_common.h +++ b/table/block_based/reader_common.h @@ -28,6 +28,10 @@ inline MemoryAllocator* GetMemoryAllocatorForCompressedBlock( : nullptr; } -extern Status VerifyChecksum(const ChecksumType type, const char* buf, - size_t len, uint32_t expected); +// Assumes block has a trailer as in format.h. file_name and offset provided +// for generating a diagnostic message in returned status. +extern Status VerifyBlockChecksum(ChecksumType type, const char* data, + size_t block_size, + const std::string& file_name, + uint64_t offset); } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_fetcher.cc b/table/block_fetcher.cc index bcb5c4340..627af7f23 100644 --- a/table/block_fetcher.cc +++ b/table/block_fetcher.cc @@ -19,51 +19,20 @@ #include "rocksdb/env.h" #include "table/block_based/block.h" #include "table/block_based/block_based_table_reader.h" +#include "table/block_based/reader_common.h" #include "table/format.h" #include "table/persistent_cache_helper.h" -#include "util/coding.h" #include "util/compression.h" -#include "util/crc32c.h" #include "util/stop_watch.h" -#include "util/string_util.h" -#include "util/xxhash.h" namespace ROCKSDB_NAMESPACE { inline void BlockFetcher::CheckBlockChecksum() { // Check the crc of the type and the block contents if (read_options_.verify_checksums) { - const char* data = slice_.data(); // Pointer to where Read put the data - PERF_TIMER_GUARD(block_checksum_time); - uint32_t value = DecodeFixed32(data + block_size_ + 1); - uint32_t actual = 0; - switch (footer_.checksum()) { - case kNoChecksum: - break; - case kCRC32c: - value = crc32c::Unmask(value); - actual = crc32c::Value(data, block_size_ + 1); - break; - case kxxHash: - actual = XXH32(data, static_cast(block_size_) + 1, 0); - break; - case kxxHash64: - actual = static_cast( - XXH64(data, static_cast(block_size_) + 1, 0) & - uint64_t{0xffffffff}); - break; - default: - status_ = Status::Corruption( - "unknown checksum type " + ToString(footer_.checksum()) + " in " + - file_->file_name() + " offset " + ToString(handle_.offset()) + - " size " + ToString(block_size_)); - } - if (status_.ok() && actual != value) { - status_ = Status::Corruption( - "block checksum mismatch: expected " + ToString(actual) + ", got " + - ToString(value) + " in " + file_->file_name() + " offset " + - ToString(handle_.offset()) + " size " + ToString(block_size_)); - } + status_ = ROCKSDB_NAMESPACE::VerifyBlockChecksum( + footer_.checksum(), slice_.data(), block_size_, file_->file_name(), + handle_.offset()); } } diff --git a/table/format.h b/table/format.h index 18f7e0bf5..725435900 100644 --- a/table/format.h +++ b/table/format.h @@ -220,7 +220,7 @@ Status ReadFooterFromFile(RandomAccessFileReader* file, uint64_t file_size, Footer* footer, uint64_t enforce_table_magic_number = 0); -// 1-byte type + 32-bit crc +// 1-byte compression type + 32-bit checksum static const size_t kBlockTrailerSize = 5; // Make block size calculation for IO less error prone